Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,16 @@ Further documentation can be found at <https://hexdocs.pm/webls>.

## Utility Support

| Type | to_string | Builder Functions | Validators |
| ---------- | --------- | ----------------- | ---------- |
| Sitemap | Complete | Complete | None |
| RSS v2.0 | Complete | Complete | None |
| Robots.txt | Complete | Complete | None |
| Atom | Complete | Complete | None |
| Type | Builder Functions | to_string | from_string |
| ---------- | ----------------- | --------- | ----------- |
| Sitemap | Complete | Complete | Complete |
| RSS v2.0 | Complete | Complete | Complete |
| Robots.txt | Complete | Complete | Complete |
| Atom | Complete | Complete | None |

## Development

```sh
gleam run # Run the project
gleam test # Run the tests
```

Expand Down
3 changes: 2 additions & 1 deletion gleam.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name = "webls"
version = "1.6.1"
version = "2.0.0"

description = "A simple web utility library for RSS feeds, Sitemaps, Robots.txt, etc."
licences = ["Apache-2.0"]
Expand All @@ -8,6 +8,7 @@ repository = { type = "github", user = "versecafe", repo = "webls" }
[dependencies]
gleam_stdlib = ">= 0.34.0 and < 2.0.0"
gleam_time = ">= 1.6.0 and < 2.0.0"
parsed_it = ">= 0.1.1 and < 0.2.0"

[dev-dependencies]
gleeunit = ">= 1.0.0 and < 2.0.0"
Expand Down
4 changes: 3 additions & 1 deletion manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
packages = [
{ name = "filepath", version = "1.1.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "filepath", source = "hex", outer_checksum = "B06A9AF0BF10E51401D64B98E4B627F1D2E48C154967DA7AF4D0914780A6D40A" },
{ name = "gleam_stdlib", version = "0.68.1", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "F7FAEBD8EF260664E86A46C8DBA23508D1D11BB3BCC6EE1B89B3BC3E5C83FF1E" },
{ name = "gleam_time", version = "1.6.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_time", source = "hex", outer_checksum = "0DF3834D20193F0A38D0EB21F0A78D48F2EC276C285969131B86DF8D4EF9E762" },
{ name = "gleam_time", version = "1.7.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_time", source = "hex", outer_checksum = "56DB0EF9433826D3B99DB0B4AF7A2BFED13D09755EC64B1DAAB46F804A9AD47D" },
{ name = "gleeunit", version = "1.9.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "DA9553CE58B67924B3C631F96FE3370C49EB6D6DC6B384EC4862CC4AAA718F3C" },
{ name = "parsed_it", version = "0.1.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "parsed_it", source = "hex", outer_checksum = "9F8BA3C634FEA847AD195E3322FD1DA51980F57C4171B02DCF069C6FC807944A" },
{ name = "simplifile", version = "2.3.2", build_tools = ["gleam"], requirements = ["filepath", "gleam_stdlib"], otp_app = "simplifile", source = "hex", outer_checksum = "E049B4DACD4D206D87843BCF4C775A50AE0F50A52031A2FFB40C9ED07D6EC70A" },
]

[requirements]
gleam_stdlib = { version = ">= 0.34.0 and < 2.0.0" }
gleam_time = { version = ">= 1.6.0 and < 2.0.0" }
gleeunit = { version = ">= 1.0.0 and < 2.0.0" }
parsed_it = { version = ">= 0.1.1 and < 0.2.0" }
simplifile = { version = ">= 2.3.2 and < 3.0.0" }
215 changes: 203 additions & 12 deletions src/webls/robots.gleam
Original file line number Diff line number Diff line change
@@ -1,16 +1,57 @@
//// Functions for building and parsing robots.txt files.
////
//// ## Building a robots.txt
////
//// ```gleam
//// import webls/robots
////
//// robots.config("https://example.com/sitemap.xml")
//// |> robots.with_config_robot(
//// robots.robot("*")
//// |> robots.with_robot_disallowed_route("/admin/")
//// )
//// |> robots.to_string
//// ```
////
//// ## Parsing a robots.txt
////
//// ```gleam
//// import webls/robots
////
//// let assert Ok(config) = robots.from_string(robots_txt_content)
//// // Access config.sitemap_url and config.robots
//// ```
////
//// The parser handles comments, extra whitespace, and case-insensitive
//// directives. Unknown directives are ignored. Malformed lines (missing `:`)
//// return an error.

import gleam/list
import gleam/option.{type Option, None, Some}
import gleam/result
import gleam/string

// Stringify ------------------------------------------------------------------

/// Converts a RobotsConfig to a robots.txt formatted string.
///
/// The output is laid out as follows:
/// - When `sitemap_url` is `Some`, a `Sitemap: <url>` line followed by a
///   blank line; when it is `None`, nothing is emitted for the sitemap
/// - One block per robot, in list order, each rendered by `robot_to_string`
///   and separated from the next by a blank line
/// - Allow directives followed by Disallow directives for each agent
///
/// When `robots` is empty only the sitemap section (if any) is returned.
pub fn to_string(config: RobotsConfig) -> String {
"Sitemap: "
<> config.sitemap_url
<> "\n\n"
<> config.robots
|> list.map(fn(robot) { robot |> robot_to_string })
|> list.reduce(fn(acc, line) { acc <> "\n\n" <> line })
|> result.unwrap("")
let sitemap_section = case config.sitemap_url {
Some(url) -> "Sitemap: " <> url <> "\n\n"
None -> ""
}

let robots_section =
config.robots
|> list.map(fn(robot) { robot |> robot_to_string })
|> list.reduce(fn(acc, line) { acc <> "\n\n" <> line })
|> result.unwrap("")

sitemap_section <> robots_section
}

fn robot_to_string(robot: Robot) -> String {
Expand All @@ -28,11 +69,24 @@ fn robot_to_string(robot: Robot) -> String {
|> result.unwrap("")
}

// Builder Patern -------------------------------------------------------------
// Builder Pattern ------------------------------------------------------------

/// Creates a robots config with a sitemap url.
///
/// The returned config has an empty `robots` list; policies are added
/// afterwards with the `with_config_*` builders.
pub fn config(sitemap_url: String) -> RobotsConfig {
RobotsConfig(sitemap_url: sitemap_url, robots: [])
RobotsConfig(sitemap_url: Some(sitemap_url), robots: [])
}

/// Creates an empty robots config: no sitemap url (`None`) and no robot
/// policies.
pub fn config_without_sitemap() -> RobotsConfig {
  let no_robots = []
  RobotsConfig(robots: no_robots, sitemap_url: None)
}

/// Returns a copy of `config` whose sitemap url is `sitemap_url`.
///
/// Any sitemap url already present is replaced; the `robots` list is carried
/// over untouched.
pub fn with_config_sitemap(
  config: RobotsConfig,
  sitemap_url: String,
) -> RobotsConfig {
  let new_sitemap = Some(sitemap_url)
  RobotsConfig(..config, sitemap_url: new_sitemap)
}

/// Adds a list of robots to the robots config
Expand All @@ -58,7 +112,7 @@ pub fn with_robot_allowed_routes(robot: Robot, routes: List(String)) -> Robot {
Robot(..robot, allowed_routes: list.flatten([robot.allowed_routes, routes]))
}

/// Adds a allowed route to the robot policy
/// Adds an allowed route to the robot policy.
///
/// The route is placed at the front of `allowed_routes`, ahead of any routes
/// added earlier.
pub fn with_robot_allowed_route(robot: Robot, route: String) -> Robot {
Robot(..robot, allowed_routes: [route, ..robot.allowed_routes])
}
Expand All @@ -81,8 +135,8 @@ pub fn with_robot_disallowed_route(robot: Robot, route: String) -> Robot {
/// The configuration for a robots.txt file
pub type RobotsConfig {
RobotsConfig(
/// The url of the sitemap for crawlers to use
sitemap_url: String,
/// The optional url of the sitemap for crawlers to use
sitemap_url: Option(String),
/// A list of robot policies
robots: List(Robot),
)
Expand All @@ -99,3 +153,140 @@ pub type Robot {
disallowed_routes: List(String),
)
}

/// Error returned by `from_string` when a robots.txt line is malformed.
pub type RobotsParseError {
/// A line could not be parsed as a valid directive (missing `:`).
/// `line` is the offending line after its comment has been stripped and
/// surrounding whitespace trimmed.
InvalidDirective(line: String)
}

// Parse ----------------------------------------------------------------------

/// Parses the text of a robots.txt file into a `RobotsConfig`.
///
/// Parsing rules:
/// - Everything from a `#` to the end of its line is treated as a comment
///   and dropped; lines that are empty afterwards are skipped.
/// - Directive names are matched case-insensitively (`USER-AGENT`,
///   `user-agent`, ...) and surrounding whitespace is trimmed from both the
///   name and the value.
/// - The first `Sitemap:` line, wherever it appears in the file, becomes
///   `sitemap_url`; any further `Sitemap:` lines are dropped.
/// - Each `User-agent:` line starts a new robot; the `Allow:` and
///   `Disallow:` lines that follow are attached to it in file order.
/// - Unknown directives, and `Allow:`/`Disallow:` lines that appear before
///   the first `User-agent:` line, are silently ignored.
///
/// Returns `Error(InvalidDirective(line))` for the first non-empty,
/// non-comment line that contains no `:` (the line is reported after comment
/// stripping and trimming). Input with no directives at all is valid and
/// yields a config with no sitemap and no robots.
pub fn from_string(input: String) -> Result(RobotsConfig, RobotsParseError) {
  let lines =
    input
    |> string.split("\n")
    |> list.map(fn(raw) { raw |> strip_comment |> string.trim })
    |> list.filter(fn(line) { line != "" })

  // Validation runs over every line first so a malformed line is reported
  // even when it appears after otherwise valid directives.
  use _ <- result.map(validate_lines(lines))

  // Sitemap lines are global, not part of any user-agent block, so they are
  // removed before the per-robot pass.
  let robot_lines = list.filter(lines, fn(line) { !is_sitemap_line(line) })

  RobotsConfig(
    sitemap_url: find_sitemap(lines),
    robots: parse_robots(robot_lines, [], None),
  )
}

/// Checks that every line looks like a directive, i.e. contains a `:`.
///
/// Returns `Ok(Nil)` when all lines pass (including when `lines` is empty),
/// otherwise `Error(InvalidDirective(line))` carrying the first line that has
/// no `:`.
fn validate_lines(lines: List(String)) -> Result(Nil, RobotsParseError) {
  let first_malformed =
    list.find(lines, fn(candidate) { !string.contains(candidate, ":") })

  case first_malformed {
    Ok(bad_line) -> Error(InvalidDirective(bad_line))
    Error(Nil) -> Ok(Nil)
  }
}

/// Removes a trailing comment from a line.
///
/// Returns the text before the first `#`; a line without `#` is returned
/// unchanged. Whitespace left in front of the `#` is not trimmed here.
fn strip_comment(line: String) -> String {
  line
  |> string.split_once("#")
  |> result.map(fn(halves) { halves.0 })
  |> result.unwrap(line)
}

/// Breaks a directive line into `#(key, value)` at its first `:`.
///
/// Both parts are whitespace-trimmed; later `:` characters (as in a URL)
/// stay in the value. Returns `Error(Nil)` when the line has no `:`.
fn split_directive(line: String) -> Result(#(String, String), Nil) {
  use #(raw_key, raw_value) <- result.map(string.split_once(line, ":"))
  #(string.trim(raw_key), string.trim(raw_value))
}

/// Tells whether a line is a `Sitemap` directive.
///
/// The directive name is compared case-insensitively; a line that cannot be
/// split into key and value is never a sitemap line.
fn is_sitemap_line(line: String) -> Bool {
  case split_directive(line) {
    Error(_) -> False
    Ok(#(name, _value)) ->
      case string.lowercase(name) {
        "sitemap" -> True
        _ -> False
      }
  }
}

/// Returns the value of the first `Sitemap` directive in `lines`, or `None`
/// when there is none.
///
/// The directive name is matched case-insensitively. Each line is split only
/// once, and a line that cannot be split is simply skipped rather than being
/// turned into an empty url.
fn find_sitemap(lines: List(String)) -> Option(String) {
  lines
  |> list.find_map(fn(line) {
    case split_directive(line) {
      Ok(#(key, url)) ->
        case string.lowercase(key) {
          "sitemap" -> Ok(url)
          _ -> Error(Nil)
        }
      Error(_) -> Error(Nil)
    }
  })
  |> option.from_result
}

/// Folds directive lines into a list of robots, in file order.
///
/// - `lines`: the directive lines still to be processed.
/// - `acc`: robots already completed, most recent first.
/// - `current`: the robot whose block is being read, if a `User-agent`
///   line has been seen.
///
/// A `user-agent` directive (name compared case-insensitively) closes the
/// current robot and opens a new one with no routes. Any other directive is
/// handed to `parse_directive` for the current robot, or skipped when no
/// robot is open yet. Lines that cannot be split are skipped.
fn parse_robots(
  lines: List(Robot),
  acc: List(Robot),
  current: Option(Robot),
) -> List(Robot) {
  // `acc` with the robot in progress (if any) pushed on top.
  let flushed = case current {
    Some(open) -> [open, ..acc]
    None -> acc
  }

  case lines {
    [] -> list.reverse(flushed)
    [line, ..remaining] ->
      case split_directive(line) {
        Error(_) -> parse_robots(remaining, acc, current)
        Ok(#(key, value)) ->
          case string.lowercase(key), current {
            "user-agent", _ ->
              parse_robots(remaining, flushed, Some(Robot(value, [], [])))
            _, None -> parse_robots(remaining, acc, None)
            directive, Some(open) ->
              parse_robots(
                remaining,
                acc,
                Some(parse_directive(directive, value, open)),
              )
          }
      }
  }
}

/// Applies one already-lowercased directive to a robot.
///
/// `"allow"` appends `value` to the end of `allowed_routes` and `"disallow"`
/// appends it to the end of `disallowed_routes`; any other key leaves the
/// robot unchanged.
fn parse_directive(key: String, value: String, robot: Robot) -> Robot {
  let push = fn(routes: List(String)) { list.append(routes, [value]) }

  case key {
    "allow" -> Robot(..robot, allowed_routes: push(robot.allowed_routes))
    "disallow" ->
      Robot(..robot, disallowed_routes: push(robot.disallowed_routes))
    _ -> robot
  }
}
Loading