Skip to content

Commit

Permalink
Merge pull request #55 from niklak/feature/tree-base-uri
Browse files Browse the repository at this point in the history
- Implemented `Tree::base_uri`, a quick method that returns the base URI of the document based on the `href` attribute of the `<base>` element. `Document::base_uri` and `NodeRef::base_uri` provide the same functionality. Inspired by [Node: baseURI property](https://developer.mozilla.org/en-US/docs/Web/API/Node/baseURI).
  • Loading branch information
niklak authored Jan 10, 2025
2 parents 41cfa46 + 226213c commit 6c7b21e
Show file tree
Hide file tree
Showing 10 changed files with 147 additions and 15 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/wasm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Install stable rust
uses: dtolnay/rust-toolchain@stable
uses: dtolnay/rust-toolchain@master
with:
toolchain: stable
toolchain: 1.83.0
targets: wasm32-unknown-unknown
- name: Install wasm-bindgen-cli
uses: taiki-e/install-action@v2
Expand Down
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ All notable changes to the `dom_query` crate will be documented in this file.

- Implemented `NodeRef::is_match` and `NodeRef::is` methods, which allow checking if a node matches
a given matcher (`&Matcher`) or selector (`&str`) without creating a `Selection` object.

- Implemented `Tree::base_uri`, a quick method that returns the base URI of the document based on the `href` attribute of the `<base>` element. `Document::base_uri` and `NodeRef::base_uri` provide the same functionality. Inspired by [Node: baseURI property](https://developer.mozilla.org/en-US/docs/Web/API/Node/baseURI).

### Changed

Expand Down
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@ html5ever = "0.29.0"
selectors = "0.26.0"
cssparser = "0.34.0"
tendril = "0.4.3"
foldhash = "0.1.3"
foldhash = "0.1.4"
hashbrown = {version = "0.15.2", default-features = false, features = ["allocator-api2", "inline-more", "default-hasher"], optional = true}
precomputed-hash = "0.1.1"

[dev-dependencies]
ureq = {version = "2.12.1", default-features = false}
wasm-bindgen-test = "0.3"
mini-alloc = "0.6.0"
mini-alloc = "0.7.0"

[features]
hashbrown = ["dep:hashbrown"]
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -680,7 +680,7 @@ assert_eq!(doc.select("div.content > p").length(), 4);
## Crate features

- `hashbrown` — optional, standard hashmaps and hashsets will be replaced with `hashbrown` hashmaps and hashsets;
- `atomic` - options, switches `NodeData` from using `StrTendril` to `Tendril<tendril::fmt::UTF8, tendril::Atomic>`.
- `atomic` — optional, switches `NodeData` from using `StrTendril` to `Tendril<tendril::fmt::UTF8, tendril::Atomic>`.
This allows `NodeData` and all ascending structures, including `Document`, to implement the `Send` trait;

## Possible issues
Expand Down
10 changes: 10 additions & 0 deletions src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,16 @@ impl Document {
self.root().text()
}

/// Returns the base URI of the document, if one is declared.
///
/// The base URI is the value of the `href` attribute of the first
/// `<base>` tag in the document's head. If no such tag is found,
/// the method returns `None`.
///
/// This is a thin wrapper that delegates to the `base_uri` method of the
/// document's underlying tree.
pub fn base_uri(&self) -> Option<StrTendril> {
self.tree.base_uri()
}

/// Merges adjacent text nodes and removes empty text nodes.
///
/// Normalization is necessary to ensure that adjacent text nodes are merged into one text node.
Expand Down
66 changes: 66 additions & 0 deletions src/dom_tree/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,72 @@ impl TreeNodeOps {
}
None
}

/// Returns the id of the first child element of `id` accepted by the predicate `f`.
///
/// Child ids that cannot be resolved in `nodes` and children that are not
/// elements are skipped; `f` is only invoked for element children.
///
/// # Arguments
///
/// * `nodes` - The nodes of the tree.
/// * `id` - The id of the parent node.
/// * `f` - The predicate to apply to each child element.
///
/// # Returns
///
/// The id of the first child element that satisfies the predicate, or `None`
/// if no child element does.
pub fn find_child_element<F>(nodes: Ref<Vec<TreeNode>>, id: NodeId, f: F) -> Option<NodeId>
where
    F: Fn(&TreeNode) -> bool,
{
    for child_id in child_nodes(Ref::clone(&nodes), &id, false) {
        if let Some(child) = nodes.get(child_id.value) {
            // Only elements are candidates; the predicate never sees other node kinds.
            if child.is_element() && f(child) {
                return Some(child.id);
            }
        }
    }
    None
}

/// Returns the id of the first child element of `id` whose node name equals `name`.
///
/// # Arguments
///
/// * `nodes` - The nodes of the tree.
/// * `id` - The id of the parent node.
/// * `name` - The name of the element to search for.
///
/// # Returns
///
/// The id of the first child element with the given name, or `None` if there is none.
pub fn find_child_element_by_name(
    nodes: Ref<Vec<TreeNode>>,
    id: NodeId,
    name: &str,
) -> Option<NodeId> {
    // A node that is not an element can never match by name.
    let has_name = |tree_node: &TreeNode| match tree_node.as_element() {
        Some(el) => el.node_name().as_ref() == name,
        None => false,
    };
    Self::find_child_element(nodes, id, has_name)
}

/// Finds a descendant element by following a path of element names.
///
/// Starting at `id`, each entry of `names` is looked up among the child
/// elements of the node found for the previous entry (the first entry is
/// looked up among the children of `id`). At every level only the first
/// child element with the requested name is followed; there is no
/// backtracking into later siblings.
///
/// # Arguments
///
/// * `nodes` - The nodes of the tree.
/// * `id` - The id of the starting node.
/// * `names` - The element names to follow, one per level, outermost first.
///
/// # Returns
///
/// The id of the element reached after the last name, or `None` as soon as
/// one level has no matching child element. If `names` is empty, the
/// starting `id` itself is returned.
pub fn find_descendant_element(
    nodes: Ref<Vec<TreeNode>>,
    id: NodeId,
    names: &[&str],
) -> Option<NodeId> {
    let mut current_id = id;
    for name in names {
        // Descend one level; bail out if this step of the path is missing.
        current_id = Self::find_child_element_by_name(Ref::clone(&nodes), current_id, name)?;
    }
    Some(current_id)
}
}

// manipulation
Expand Down
18 changes: 18 additions & 0 deletions src/dom_tree/tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,24 @@ impl Tree {
})
.ok()
}

/// Returns the base URI of the tree, taken from the `<base>` tag in the document's head.
///
/// The base URI is the value of the `href` attribute of the first
/// `<base>` tag found by following the path `html` > `head` > `base`
/// from the root. If no such tag is found, the method returns `None`.
///
/// This is a very fast method compared to [`crate::Document::select`].
pub fn base_uri(&self) -> Option<StrTendril> {
    // TODO: It is possible to wrap the result of this function with `OnceCell`,
    // but then appears a problem with atomicity and the `Send` trait for the Tree.
    let root_id = self.root().id;
    let nodes = self.nodes.borrow();

    let base_id = TreeNodeOps::find_descendant_element(
        Ref::clone(&nodes),
        root_id,
        &["html", "head", "base"],
    )?;
    let base_element = nodes.get(base_id.value)?.as_element()?;
    base_element.attr("href")
}
}

impl Tree {
Expand Down
9 changes: 8 additions & 1 deletion src/node/node_ref.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ use tendril::StrTendril;

use crate::entities::copy_attrs;
use crate::Document;
use crate::Matcher;
use crate::Tree;
use crate::TreeNodeOps;
use crate::Matcher;

use super::child_nodes;
use super::id_provider::NodeIdProver;
Expand Down Expand Up @@ -652,4 +652,11 @@ impl NodeRef<'_> {
pub fn is(&self, sel: &str) -> bool {
// If `Matcher::new` does not produce a matcher for `sel`, the node is reported as not matching (`false`).
Matcher::new(sel).map_or(false, |matcher| self.is_match(&matcher))
}

/// Returns the base URI of the document this node belongs to.
///
/// This is the value of the `href` attribute of the `<base>` element in the document's head,
/// or `None` if the document does not have a `<base>` element.
///
/// The lookup is delegated to the node's tree, so the result does not depend on
/// which node of the tree the method is called on.
pub fn base_uri(&self) -> Option<StrTendril> {
self.tree.base_uri()
}
}
38 changes: 35 additions & 3 deletions tests/node-traversal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,6 @@ fn test_node_prev_sibling() {
assert!(prev_element_sibling_sel.is("#first-child"));
}


#[cfg_attr(not(target_arch = "wasm32"), test)]
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
fn test_node_is() {
Expand All @@ -233,7 +232,6 @@ fn test_node_is() {
assert!(parent_node.is(":has(#first-child)"));
}


#[cfg_attr(not(target_arch = "wasm32"), test)]
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
fn test_text_node_is() {
Expand All @@ -245,4 +243,38 @@ fn test_text_node_is() {
assert!(first_child.is_text());

assert!(!first_child.is("#text"));
}
}

// Checks that `base_uri` yields the `href` of the `<base>` tag, both when called
// on the document and when called on a node selected from it.
// NOTE(review): the fixture contains an unmatched `</div>` after `#main`;
// presumably unintentional — confirm before relying on it.
#[cfg_attr(not(target_arch = "wasm32"), test)]
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
fn test_node_base_uri() {
let contents: &str = r#"<!DOCTYPE html>
<html>
<head>
<base href="https://www.example.com/"/>
<title>Test</title>
</head>
<body>
<div id="main"></div>
</div>
</body>
</html>"#;
let doc = Document::from(contents);

// The base URI can be queried at the document level.
let base_uri = doc.base_uri().unwrap();
assert_eq!(base_uri.as_ref(), "https://www.example.com/");

let sel = doc.select_single("#main");
let node = sel.nodes().first().unwrap();
// The same value is available from a node inside the tree.
let base_uri = node.base_uri().unwrap();
assert_eq!(base_uri.as_ref(), "https://www.example.com/");
}

// Checks that `base_uri` returns `None` for the `ANCESTORS_CONTENTS` fixture.
// NOTE(review): the fixture is defined elsewhere in this test file; presumably
// it contains no `<base>` tag — confirm.
#[cfg_attr(not(target_arch = "wasm32"), test)]
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
fn test_node_base_uri_none() {
let doc = Document::from(ANCESTORS_CONTENTS);
assert!(doc.base_uri().is_none());
}
9 changes: 4 additions & 5 deletions tests/selection-property.rs
Original file line number Diff line number Diff line change
Expand Up @@ -290,17 +290,16 @@ fn test_selection_query() {

let mut font_faces = vec![];
for node in sel.nodes() {
if let Some(face) = node.query(|tree_node| {
tree_node.as_element().and_then(|el| el.attr("face"))
}).flatten() {
if let Some(face) = node
.query(|tree_node| tree_node.as_element().and_then(|el| el.attr("face")))
.flatten()
{
font_faces.push(face.to_string());
}
}
assert_eq!(font_faces, vec!["Times", "Arial", "Courier"]);
}



#[cfg_attr(not(target_arch = "wasm32"), test)]
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
fn test_doc_try_serialize_html() {
Expand Down

0 comments on commit 6c7b21e

Please sign in to comment.