defmodule LazyHTML do
  @external_resource "README.md"

  # The module documentation is the README section delimited by a pair of
  # "<!-- Docs -->" markers, so the match asserts exactly two markers exist.
  [_, readme_docs, _] =
    "README.md"
    |> File.read!()
    |> String.split("<!-- Docs -->")

  @moduledoc readme_docs

  defstruct [:resource]

  @behaviour Access

  @type t :: %__MODULE__{resource: reference()}

  @doc """
  Parses an HTML document.

  This function expects a complete document, therefore if either of
  `<html>`, `<head>` or `<body>` tags is missing, it will be added,
  which matches the usual browser behaviour. To parse a part of an
  HTML document, use `from_fragment/1` instead.

  ## Examples

      iex> LazyHTML.from_document(~S|<html><head></head><body>Hello world!</body></html>|)
      #LazyHTML<
        1 node
        #1
        <html><head></head><body>Hello world!</body></html>
      >

      iex> LazyHTML.from_document(~S|<div>Hello world!</div>|)
      #LazyHTML<
        1 node
        #1
        <html><head></head><body><div>Hello world!</div></body></html>
      >

  """
  @spec from_document(String.t()) :: t()
  def from_document(html) when is_binary(html) do
    LazyHTML.NIF.from_document(html)
  end

  @doc """
  Parses a segment of an HTML document.

  As opposed to `from_document/1`, this function does not expect a full
  document and does not add any extra tags.

  ## Examples

      iex> LazyHTML.from_fragment(~S|<a class="button">Click me</a>|)
      #LazyHTML<
        1 node
        #1
        <a class="button">Click me</a>
      >

      iex> LazyHTML.from_fragment(~S|<span>Hello</span> <span>world</span>|)
      #LazyHTML<
        3 nodes
        #1
        <span>Hello</span>
        #2
        [whitespace]
        #3
        <span>world</span>
      >

  """
  @spec from_fragment(String.t()) :: t()
  def from_fragment(html) when is_binary(html) do
    LazyHTML.NIF.from_fragment(html)
  end

  @doc ~S'''
  Serializes `lazy_html` as an HTML string.

  ## Options

    * `:skip_whitespace_nodes` - when `true`, ignores text nodes that
      consist entirely of whitespace, usually whitespace between tags.
      Defaults to `false`.

  ## Examples

      iex> lazy_html = LazyHTML.from_document(~S|<html><head></head><body>Hello world!</body></html>|)
      iex> LazyHTML.to_html(lazy_html)
      "<html><head></head><body>Hello world!</body></html>"

      iex> lazy_html = LazyHTML.from_fragment(~S|<span>Hello</span> <span>world</span>|)
      iex> LazyHTML.to_html(lazy_html)
      "<span>Hello</span> <span>world</span>"

      iex> lazy_html =
      ...>   LazyHTML.from_fragment("""
      ...>   <p>
      ...>     <span> Hello </span>
      ...>     <span> world </span>
      ...>   </p>
      ...>   """)
      iex> LazyHTML.to_html(lazy_html, skip_whitespace_nodes: true)
      "<p><span> Hello </span><span> world </span></p>"

  '''
  @spec to_html(t(), keyword()) :: String.t()
  def to_html(%LazyHTML{} = lazy_html, opts \\ []) when is_list(opts) do
    opts = Keyword.validate!(opts, skip_whitespace_nodes: false)

    LazyHTML.NIF.to_html(lazy_html, opts[:skip_whitespace_nodes])
  end

  @doc """
  Builds an Elixir tree data structure representing the `lazy_html`
  document.

  ## Options

    * `:sort_attributes` - when `true`, attributes lists are sorted
      alphabetically by name. Defaults to `false`.

    * `:skip_whitespace_nodes` - when `true`, ignores text nodes that
      consist entirely of whitespace, usually whitespace between tags.
      Defaults to `false`.

  ## Examples

      iex> lazy_html = LazyHTML.from_document(~S|<html><head><title>Page</title></head><body>Hello world</body></html>|)
      iex> LazyHTML.to_tree(lazy_html)
      [{"html", [], [{"head", [], [{"title", [], ["Page"]}]}, {"body", [], ["Hello world"]}]}]

      iex> lazy_html = LazyHTML.from_fragment(~S|<div><!-- Link --><a href="https://elixir-lang.org">Elixir</a></div>|)
      iex> LazyHTML.to_tree(lazy_html)
      [
        {"div", [], [{:comment, " Link "}, {"a", [{"href", "https://elixir-lang.org"}], ["Elixir"]}]}
      ]

  You can get a normalized tree by passing `sort_attributes: true`:

      iex> lazy_html = LazyHTML.from_fragment(~S|<div id="root" class="layout"></div>|)
      iex> LazyHTML.to_tree(lazy_html, sort_attributes: true)
      [{"div", [{"class", "layout"}, {"id", "root"}], []}]

  """
  @spec to_tree(t(), keyword()) :: LazyHTML.Tree.t()
  def to_tree(%LazyHTML{} = lazy_html, opts \\ []) when is_list(opts) do
    opts = Keyword.validate!(opts, sort_attributes: false, skip_whitespace_nodes: false)

    LazyHTML.NIF.to_tree(lazy_html, opts[:sort_attributes], opts[:skip_whitespace_nodes])
  end

  @doc """
  Builds a lazy HTML document from an Elixir tree data structure.

  ## Examples

      iex> tree = [
      ...>   {"html", [], [{"head", [], [{"title", [], ["Page"]}]}, {"body", [], ["Hello world"]}]}
      ...> ]
      iex> LazyHTML.from_tree(tree)
      #LazyHTML<
        1 node
        #1
        <html><head><title>Page</title></head><body>Hello world</body></html>
      >

      iex> tree = [
      ...>   {"div", [], []},
      ...>   {:comment, " Link "},
      ...>   {"a", [{"href", "https://elixir-lang.org"}], ["Elixir"]}
      ...> ]
      iex> LazyHTML.from_tree(tree)
      #LazyHTML<
        3 nodes
        #1
        <div></div>
        #2
        <!-- Link -->
        #3
        <a href="https://elixir-lang.org">Elixir</a>
      >

  """
  @spec from_tree(LazyHTML.Tree.t()) :: t()
  def from_tree(tree) when is_list(tree) do
    LazyHTML.NIF.from_tree(tree)
  end

  @doc ~S'''
  Finds elements in `lazy_html` matching the given CSS selector.

  Since `lazy_html` may have multiple root nodes, the root nodes are
  included in the search and they will appear in the result if they
  match the given selector.

  ## Examples

      iex> lazy_html =
      ...>   LazyHTML.from_fragment("""
      ...>   <div class="layout">
      ...>     <span>Hello</span>
      ...>     <span>world</span>
      ...>   </div>
      ...>   """)
      iex> LazyHTML.query(lazy_html, "span")
      #LazyHTML<
        2 nodes (from selector)
        #1
        <span>Hello</span>
        #2
        <span>world</span>
      >
      iex> LazyHTML.query(lazy_html, ".layout")
      #LazyHTML<
        1 node (from selector)
        #1
        <div class="layout">
          <span>Hello</span>
          <span>world</span>
        </div>
      >

  Note that for each root node, the selector respects its actual
  location in the document. Consequently, if you run one `query/2`
  the returned nodes are not necessarily siblings, which may impact
  a subsequent query:

      iex> lazy_html =
      ...>   LazyHTML.from_fragment("""
      ...>   <div>
      ...>     <span>Hello</span>
      ...>   </div>
      ...>   <div>
      ...>     <span>World</span>
      ...>   </div>
      ...>   """)
      iex> spans = LazyHTML.query(lazy_html, "span")
      #LazyHTML<
        2 nodes (from selector)
        #1
        <span>Hello</span>
        #2
        <span>World</span>
      >
      iex> LazyHTML.query(spans, ":first-child")
      #LazyHTML<
        2 nodes (from selector)
        #1
        <span>Hello</span>
        #2
        <span>World</span>
      >

  In the example above, each of the spans is first child of its
  respective parent, so the second query matches both.
  '''
  @spec query(t(), String.t()) :: t()
  def query(%LazyHTML{} = lazy_html, selector) when is_binary(selector) do
    LazyHTML.NIF.query(lazy_html, selector)
  end

  @doc ~S'''
  Finds elements in `lazy_html` matching the given id.

  This function is similar to `query/2`, but it accepts unescaped id
  string. Raises `ArgumentError` if `id` is an empty string.

  Note that while technically there should be only a single element
  with the given id, if there are multiple elements, all of them are
  included in the result.

  ## Examples

      iex> lazy_html =
      ...>   LazyHTML.from_fragment("""
      ...>   <div>
      ...>     <span id="hello">Hello</span>
      ...>     <span>world</span>
      ...>   </div>
      ...>   """)
      iex> LazyHTML.query_by_id(lazy_html, "hello")
      #LazyHTML<
        1 node (from selector)
        #1
        <span id="hello">Hello</span>
      >

  '''
  @spec query_by_id(t(), String.t()) :: t()
  def query_by_id(%LazyHTML{} = lazy_html, id) when is_binary(id) do
    if id == "" do
      raise ArgumentError, "id cannot be empty"
    end

    LazyHTML.NIF.query_by_id(lazy_html, id)
  end

  @doc ~S'''
  Filters `lazy_html` root nodes, keeping only elements that match
  the given CSS selector.

  ## Examples

      iex> lazy_html = LazyHTML.from_fragment("""
      ...> <span>Hello</span>
      ...> <div>
      ...>   <span>nested</span>
      ...> </div>
      ...> <span>world</span>
      ...> """)
      iex> LazyHTML.filter(lazy_html, "span")
      #LazyHTML<
        2 nodes (from selector)
        #1
        <span>Hello</span>
        #2
        <span>world</span>
      >

  '''
  @spec filter(t(), String.t()) :: t()
  def filter(%LazyHTML{} = lazy_html, selector) when is_binary(selector) do
    LazyHTML.NIF.filter(lazy_html, selector)
  end

  @doc """
  Returns the child nodes of the root nodes in `lazy_html`.

  ## Examples

      iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>Hello</span> <span>world</span></div>|)
      iex> LazyHTML.child_nodes(lazy_html)
      #LazyHTML<
        3 nodes (from selector)
        #1
        <span>Hello</span>
        #2
        [whitespace]
        #3
        <span>world</span>
      >
      iex> LazyHTML.child_nodes(LazyHTML.child_nodes(lazy_html))
      #LazyHTML<
        2 nodes (from selector)
        #1
        Hello
        #2
        world
      >

  """
  @spec child_nodes(t()) :: t()
  def child_nodes(%LazyHTML{} = lazy_html) do
    LazyHTML.NIF.child_nodes(lazy_html)
  end

  @doc """
  Returns the (unique) parent nodes of the root nodes in `lazy_html`.

  ## Examples

      iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>Hello</span> <span>world</span></div>|)
      iex> spans = LazyHTML.query(lazy_html, "span")
      iex> LazyHTML.parent_node(spans)
      #LazyHTML<
        1 node (from selector)
        #1
        <div><span>Hello</span> <span>world</span></div>
      >

  """
  @spec parent_node(t()) :: t()
  def parent_node(%LazyHTML{} = lazy_html) do
    LazyHTML.NIF.parent_node(lazy_html)
  end

  @doc """
  Returns the position among its siblings for every root element in
  `lazy_html`.

  The position numbering is 1-based and only considers siblings that
  are elements, as to match the `:nth-child` CSS pseudo-class.

  Note that if there are text or comment root nodes, they are ignored,
  and they have no corresponding number in the result.

  ## Examples

      iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>1</span><span>2</span></div>|)
      iex> spans = LazyHTML.query(lazy_html, "span")
      iex> LazyHTML.nth_child(spans)
      [1, 2]

  """
  @spec nth_child(t()) :: list(integer())
  def nth_child(%LazyHTML{} = lazy_html) do
    LazyHTML.NIF.nth_child(lazy_html)
  end

  @doc """
  Returns the text content of all nodes in `lazy_html`.

  ## Options

    * `:separator` - a separator used to join the text content from
      individual nodes. Note that the separator is only inserted
      between non-empty nodes. Defaults to no separator.

  ## Examples

      iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>Hello</span> <span>world</span></div>|)
      iex> LazyHTML.text(lazy_html)
      "Hello world"

      iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>1</span><span>2</span><span>3</span></div>|)
      iex> LazyHTML.text(lazy_html, separator: ", ")
      "1, 2, 3"

      iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>1</span><span></span><span>2</span></div>|)
      iex> LazyHTML.text(lazy_html, separator: ", ")
      "1, 2"

  If you want to get the text for each root node separately, you can
  use `Enum.map/2`:

      iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>Hello</span> <span>world</span></div>|)
      iex> spans = LazyHTML.query(lazy_html, "span")
      #LazyHTML<
        2 nodes (from selector)
        #1
        <span>Hello</span>
        #2
        <span>world</span>
      >
      iex> Enum.map(spans, &LazyHTML.text/1)
      ["Hello", "world"]

  """
  @spec text(t(), keyword()) :: String.t()
  def text(%LazyHTML{} = lazy_html, opts \\ []) when is_list(opts) do
    # No default is given for :separator, so it is nil when not passed.
    opts = Keyword.validate!(opts, [:separator])

    LazyHTML.NIF.text(lazy_html, opts[:separator])
  end

  @doc ~S'''
  Returns all values of the given attribute on the `lazy_html` root
  nodes.

  ## Examples

      iex> lazy_html =
      ...>   LazyHTML.from_fragment("""
      ...>   <div>
      ...>     <span data-id="1">Hello</span>
      ...>     <span data-id="2">world</span>
      ...>     <span>!</span>
      ...>   </div>
      ...>   """)
      iex> spans = LazyHTML.query(lazy_html, "span")
      iex> LazyHTML.attribute(spans, "data-id")
      ["1", "2"]
      iex> LazyHTML.attribute(spans, "data-other")
      []

  Note that attributes without value, implicitly have an empty value:

      iex> lazy_html = LazyHTML.from_fragment(~S|<div><button disabled>Click me</button></div>|)
      iex> button = LazyHTML.query(lazy_html, "button")
      iex> LazyHTML.attribute(button, "disabled")
      [""]

  '''
  @spec attribute(t(), String.t()) :: list(String.t())
  def attribute(%LazyHTML{} = lazy_html, name) when is_binary(name) do
    LazyHTML.NIF.attribute(lazy_html, name)
  end

  @doc ~S'''
  Returns attribute lists for every root element in `lazy_html`.

  Note that if there are text or comment root nodes, they are ignored,
  and they have no corresponding list in the result.

  ## Examples

      iex> lazy_html =
      ...>   LazyHTML.from_fragment("""
      ...>   <div>
      ...>     <span class="text" data-id="1">Hello</span>
      ...>     <span>world</span>
      ...>   </div>
      ...>   """)
      iex> spans = LazyHTML.query(lazy_html, "span")
      iex> LazyHTML.attributes(spans)
      [
        [{"class", "text"}, {"data-id", "1"}],
        []
      ]

      iex> lazy_html =
      ...>   LazyHTML.from_fragment("""
      ...>   <!-- Comment -->
      ...>   <span class="text">Hello</span>
      ...>   world
      ...>   """)
      iex> LazyHTML.attributes(lazy_html)
      [
        [{"class", "text"}]
      ]

  '''
  @spec attributes(t()) :: list(list({String.t(), String.t()}))
  def attributes(%LazyHTML{} = lazy_html) do
    LazyHTML.NIF.attributes(lazy_html)
  end

  @doc """
  Returns tag name for every root element in `lazy_html`.

  Note that if there are text or comment root nodes, they are ignored,
  and they have no corresponding tag name in the result.

  ## Examples

      iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>Hello</span> <span>world</span></div>|)
      iex> LazyHTML.tag(lazy_html)
      ["div"]

      iex> lazy_html = LazyHTML.from_fragment(~S|<span>Hello</span> <span>world</span>|)
      iex> LazyHTML.tag(lazy_html)
      ["span", "span"]

  """
  @spec tag(t()) :: list(String.t())
  def tag(%LazyHTML{} = lazy_html) do
    LazyHTML.NIF.tag(lazy_html)
  end

  @doc ~S"""
  Escapes the given string to make a valid HTML text.

  ## Examples

      iex> LazyHTML.html_escape("foo")
      "foo"

      iex> LazyHTML.html_escape("<foo>")
      "&lt;foo&gt;"

      iex> LazyHTML.html_escape("quotes: \" & \'")
      "quotes: &quot; &amp; &#39;"

  """
  @spec html_escape(String.t()) :: String.t()
  def html_escape(string) when is_binary(string) do
    LazyHTML.Tree.html_escape(string)
  end

  # Access
  #
  # Only read access is supported: `lazy_html[selector]` is `query/2`.

  @impl true
  def fetch(%LazyHTML{} = lazy_html, selector) when is_binary(selector) do
    {:ok, query(lazy_html, selector)}
  end

  @impl true
  def get_and_update(%LazyHTML{}, _index, _update) do
    raise "Access.get_and_update/3 is not supported by LazyHTML"
  end

  @impl true
  def pop(%LazyHTML{}, _index) do
    raise "Access.pop/2 is not supported by LazyHTML"
  end
end

defimpl Inspect, for: LazyHTML do
  import Inspect.Algebra

  # Renders a header with the node count, followed by every root node as a
  # numbered entry holding its serialized HTML. `opts.limit` caps the number
  # of entries and `opts.printable_limit` caps the HTML shown per entry.
  def inspect(lazy_html, opts) do
    {nodes, from_selector} = LazyHTML.NIF.nodes(lazy_html)

    info =
      case length(nodes) do
        1 -> "1 node"
        n -> "#{n} nodes"
      end

    info =
      if from_selector do
        info <> " (from selector)"
      else
        info
      end

    inner =
      if nodes == [] do
        empty()
      else
        items = Enum.with_index(nodes, 1)
        {items, last_doc} = apply_limit(items, opts.limit)

        inner =
          concat(Enum.map_intersperse(items, concat(separator(), line()), &node_to_doc(&1, opts)))

        inner = concat([inner, last_doc])
        concat([separator(), nest(concat(line(), inner), 2)])
      end

    # Always break lines, regardless of the available width.
    force_unfit(
      concat([
        "#LazyHTML<",
        nest(concat([line(), info]), 2),
        inner,
        line(),
        ">"
      ])
    )
  end

  # An additional line break between entries, which can be disabled at
  # compile time via the `:inspect_extra_newline` application env.
  if Application.compile_env(:lazy_html, :inspect_extra_newline, true) do
    defp separator(), do: line()
  else
    defp separator(), do: empty()
  end

  # Keeps at most `limit` items and returns a trailing doc that reports how
  # many were omitted (empty when nothing was cut).
  defp apply_limit(items, :infinity), do: {items, empty()}

  defp apply_limit(items, limit) do
    case Enum.split(items, limit) do
      {items, []} ->
        {items, empty()}

      {items, more} ->
        {items, concat([separator(), line(), "[#{length(more)} more]"])}
    end
  end

  defp node_to_doc({%LazyHTML{} = node, number}, opts) do
    html_doc =
      node
      |> LazyHTML.to_html()
      |> apply_printable_limit(opts.printable_limit)
      # Make leading/trailing whitespace visible in the output.
      |> String.replace(~r/^\s+/, "[whitespace]")
      |> String.replace(~r/\s+$/, "[whitespace]")
      # Split into lines, so that each one picks up the current nesting.
      |> String.split("\n")
      |> Enum.intersperse(line())
      |> concat()

    concat([
      color("##{number}", :atom, opts),
      line(),
      html_doc
    ])
  end

  # Truncates `string` to `limit` graphemes, marking the cut with "[...]".
  defp apply_printable_limit(string, :infinity), do: string

  defp apply_printable_limit(string, limit) do
    case String.split_at(string, limit) do
      {left, ""} -> left
      {left, _more} -> left <> "[...]"
    end
  end
end

defimpl Enumerable, for: LazyHTML do
  def count(lazy_html) do
    {:ok, LazyHTML.NIF.num_nodes(lazy_html)}
  end

  # Returning `{:error, __MODULE__}` makes `Enum` fall back to the default
  # algorithm built on top of `reduce/3`.
  def member?(_lazy_html, _element), do: {:error, __MODULE__}

  def slice(_lazy_html), do: {:error, __MODULE__}

  # Enumerates the root nodes, each as its own `%LazyHTML{}`.
  def reduce(%LazyHTML{} = lazy_html, acc, fun) do
    {nodes, _from_selector} = LazyHTML.NIF.nodes(lazy_html)
    Enumerable.reduce(nodes, acc, fun)
  end
end