Skip to main content

rdf_compare/
graph_iri.rs

1use anyhow::{Context, Result};
2use oxrdf::NamedNode;
3use percent_encoding::{AsciiSet, CONTROLS, utf8_percent_encode};
4use std::path::Path;
5
6/// Characters that are *not* safe in the trailing component of our URN graph IRIs.
7/// We keep IRI-friendly chars and percent-encode everything else (incl. spaces,
8/// non-ASCII, and reserved IRI delimiters).
9const URN_NSS: &AsciiSet = &CONTROLS
10    .add(b' ')
11    .add(b'"')
12    .add(b'<')
13    .add(b'>')
14    .add(b'\\')
15    .add(b'^')
16    .add(b'`')
17    .add(b'{')
18    .add(b'|')
19    .add(b'}')
20    .add(b'#')
21    .add(b'%')
22    .add(b'/')
23    .add(b'?')
24    .add(b'&')
25    .add(b'=');
26
27const PREFIX: &str = "urn:rdf-compare:source:";
28
/// Derive the graph-name stem from a file path.
///
/// Takes the final path component, removes a single trailing `.gz` (ASCII
/// case-insensitive) if present, and then removes the last remaining
/// extension. A leading-dot name such as `.hidden` is kept whole instead of
/// being reduced to an empty stem.
///
/// Falls back to `"unnamed"` when the path has no final component, when that
/// component is not valid UTF-8, or when stripping would leave nothing at all
/// (for example a file literally named `.gz`).
fn basename_stem(path: &Path) -> String {
    const FALLBACK: &str = "unnamed";
    const GZ: &str = ".gz";

    let name = path
        .file_name()
        .and_then(|s| s.to_str())
        .unwrap_or(FALLBACK);

    // Compare the tail in place rather than lower-casing a copy of the whole
    // name. `get` yields `None` when the cut would land inside a multi-byte
    // character; in that case the name cannot end in the ASCII suffix anyway.
    let cut = name.len().saturating_sub(GZ.len());
    let trimmed = match name.get(cut..) {
        Some(tail) if tail.eq_ignore_ascii_case(GZ) => &name[..cut],
        _ => name,
    };

    // Drop the final extension, but never turn a dotfile into an empty stem.
    let stem = match trimmed.rsplit_once('.') {
        Some((stem, _ext)) if !stem.is_empty() => stem,
        _ => trimmed,
    };

    // A name consisting solely of `.gz` strips down to nothing; an empty stem
    // would produce an IRI with no identifying component.
    if stem.is_empty() {
        FALLBACK.to_string()
    } else {
        stem.to_string()
    }
}
46
47fn iri_for(stem: &str) -> Result<NamedNode> {
48    let encoded: String = utf8_percent_encode(stem, URN_NSS).collect();
49    let iri = format!("{}{}", PREFIX, encoded);
50    NamedNode::new(&iri).with_context(|| format!("invalid generated graph IRI: {}", iri))
51}
52
53/// Resolve graph IRIs for both files, honoring optional CLI overrides and
54/// disambiguating the auto-derived case when both basenames collide.
55pub fn resolve_graph_iris(
56    path_a: &Path,
57    path_b: &Path,
58    override_a: Option<&str>,
59    override_b: Option<&str>,
60) -> Result<(NamedNode, NamedNode)> {
61    let a = match override_a {
62        Some(s) => NamedNode::new(s).with_context(|| format!("invalid --graph-a IRI: {}", s))?,
63        None => iri_for(&basename_stem(path_a))?,
64    };
65    let b = match override_b {
66        Some(s) => NamedNode::new(s).with_context(|| format!("invalid --graph-b IRI: {}", s))?,
67        None => iri_for(&basename_stem(path_b))?,
68    };
69
70    // Disambiguate auto-derived collisions.
71    if override_a.is_none() && override_b.is_none() && a == b {
72        let stem_a = basename_stem(path_a);
73        let stem_b = basename_stem(path_b);
74        let a2 = iri_for(&format!("{}:1", stem_a))?;
75        let b2 = iri_for(&format!("{}:2", stem_b))?;
76        return Ok((a2, b2));
77    }
78    Ok((a, b))
79}
80
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Resolve both IRIs without overrides and return them as owned strings,
    /// panicking if resolution fails.
    fn resolve_auto(first: &str, second: &str) -> (String, String) {
        let (a, b) =
            resolve_graph_iris(Path::new(first), Path::new(second), None, None).unwrap();
        (a.as_str().to_owned(), b.as_str().to_owned())
    }

    /// Distinct basenames map straight onto the URN prefix plus the stem.
    #[test]
    fn derives_basic_iri() {
        let (first, second) = resolve_auto("foo.ttl", "bar.nt");
        assert_eq!(first, "urn:rdf-compare:source:foo");
        assert_eq!(second, "urn:rdf-compare:source:bar");
    }

    /// Identical basenames in different directories are told apart by suffix.
    #[test]
    fn collision_gets_suffix() {
        let (first, second) = resolve_auto("dir1/data.ttl", "dir2/data.ttl");
        assert_ne!(first, second);
        assert!(first.ends_with(":1"));
        assert!(second.ends_with(":2"));
    }

    /// An explicit override is passed through; the other side is still derived.
    #[test]
    fn override_used_verbatim() {
        let (first, second) = resolve_graph_iris(
            Path::new("a.ttl"),
            Path::new("b.ttl"),
            Some("https://example.com/A"),
            None,
        )
        .unwrap();
        assert_eq!(first.as_str(), "https://example.com/A");
        assert_eq!(second.as_str(), "urn:rdf-compare:source:b");
    }

    /// `.gz` is stripped before the real extension.
    #[test]
    fn handles_gz_double_ext() {
        let (first, _) = resolve_auto("foo.ttl.gz", "b.ttl");
        assert_eq!(first, "urn:rdf-compare:source:foo");
    }
}
137}