Skip to main content

rdf_compare/
diff.rs

1use crate::cli::{InputFormat, OutputFormat};
2use crate::graph_iri::resolve_graph_iris;
3use crate::input::{is_quad_format, open_reader, parse_quads, parse_triples, quad_to_triple};
4use anyhow::{Context, Result, bail};
5use oxrdf::dataset::CanonicalizationAlgorithm;
6use oxrdf::{Dataset, GraphName, NamedNode, NamedOrBlankNode, Quad, Triple};
7use std::collections::HashSet;
8use std::ffi::OsStr;
9use std::fs::File;
10use std::io::{BufWriter, Write, stdout};
11use std::path::{Path, PathBuf};
12
/// Summary counters describing the outcome of a diff.
#[derive(Debug, Default, Clone, Copy)]
pub struct DiffStats {
    /// Statement count reported for side A.
    pub a_total: u64,
    /// Statement count reported for side B.
    pub b_total: u64,
    /// Number of statements counted as present on both sides.
    pub common: u64,
    /// Number of statements found only on side A.
    pub a_only: u64,
    /// Number of statements found only on side B.
    pub b_only: u64,
    /// Statements skipped on side A (populated only by the
    /// ignore-blank-nodes code path, zero otherwise).
    pub a_skipped_bnodes: u64,
    /// Statements skipped on side B (populated only by the
    /// ignore-blank-nodes code path, zero otherwise).
    pub b_skipped_bnodes: u64,
}

impl DiffStats {
    /// Returns `true` when at least one statement is exclusive to either side.
    pub fn has_differences(&self) -> bool {
        let exclusive = [self.a_only, self.b_only];
        exclusive.iter().any(|&count| count != 0)
    }
}
29
/// Full result of a diff computation, kept in memory so it can be either
/// serialized to disk or served from the web viewer.
///
/// `a_only` / `b_only` carry [`Quad`]s. In **triple mode** (`quad_mode = false`)
/// each quad's graph component is the wrapper graph IRI (`graph_a` / `graph_b`)
/// and the diff is written as a single TriG/N-Quads document containing two
/// named graphs. In **quad mode** (at least one input is N-Quads or TriG)
/// the original graph names from the source are preserved and the diff is
/// emitted as two separate files (one per side).
#[derive(Debug, Clone)]
pub struct DiffResult {
    /// Statements present only on side A.
    pub a_only: Vec<Quad>,
    /// Statements present only on side B.
    pub b_only: Vec<Quad>,
    /// Merged prefix declarations from A and B. A wins on conflicts.
    pub prefixes: Vec<(String, String)>,
    /// Wrapper graph IRI that identifies side A.
    pub graph_a: NamedNode,
    /// Wrapper graph IRI that identifies side B.
    pub graph_b: NamedNode,
    /// Summary counters for this diff.
    pub stats: DiffStats,
    /// Source file paths, when known (used by the web viewer to lazily
    /// recompute the set of common triples). `None` when the diff was
    /// loaded from a previously written diff file.
    pub source_a: Option<PathBuf>,
    pub source_b: Option<PathBuf>,
    /// Input formats that were used to parse each side; `None` when the
    /// diff was loaded from a previously written diff file.
    pub format_a: Option<InputFormat>,
    pub format_b: Option<InputFormat>,
    /// `true` when at least one input was a quad format; always `false`
    /// for a diff loaded from a file.
    pub quad_mode: bool,
}
56
57fn graph_name_str(g: &GraphName) -> &str {
58    match g {
59        GraphName::NamedNode(n) => n.as_str(),
60        GraphName::BlankNode(b) => b.as_str(),
61        GraphName::DefaultGraph => "",
62    }
63}
64
65fn quad_order(a: &Quad, b: &Quad) -> std::cmp::Ordering {
66    let sa = match &a.subject {
67        NamedOrBlankNode::NamedNode(n) => n.as_str(),
68        NamedOrBlankNode::BlankNode(bn) => bn.as_str(),
69    };
70    let sb = match &b.subject {
71        NamedOrBlankNode::NamedNode(n) => n.as_str(),
72        NamedOrBlankNode::BlankNode(bn) => bn.as_str(),
73    };
74    graph_name_str(&a.graph_name)
75        .cmp(graph_name_str(&b.graph_name))
76        .then_with(|| sa.cmp(sb))
77        .then_with(|| a.predicate.as_str().cmp(b.predicate.as_str()))
78}
79
80impl DiffResult {
81    pub fn sort_rows(&mut self) {
82        self.a_only.sort_unstable_by(quad_order);
83        self.b_only.sort_unstable_by(quad_order);
84    }
85
86    pub fn a_only_triples(&self) -> impl Iterator<Item = Triple> + '_ {
87        self.a_only.iter().map(quad_to_triple)
88    }
89
90    pub fn b_only_triples(&self) -> impl Iterator<Item = Triple> + '_ {
91        self.b_only.iter().map(quad_to_triple)
92    }
93}
94
/// Parameters for computing a diff between two RDF files.
#[derive(Debug, Clone)]
pub struct DiffInputs {
    /// Path of the side-A input file.
    pub file_a: PathBuf,
    /// Path of the side-B input file.
    pub file_b: PathBuf,
    /// Explicit format for `file_a`; when `None` the format is detected
    /// from the path.
    pub format_a: Option<InputFormat>,
    /// Explicit format for `file_b`; when `None` the format is detected
    /// from the path.
    pub format_b: Option<InputFormat>,
    /// Optional wrapper graph IRI for side A, forwarded to
    /// `resolve_graph_iris`.
    pub graph_a: Option<String>,
    /// Optional wrapper graph IRI for side B, forwarded to
    /// `resolve_graph_iris`.
    pub graph_b: Option<String>,
    /// When true, blank-node-bearing statements are skipped instead of
    /// canonicalised via RDFC-1.0.
    pub ignore_blank_nodes: bool,
}
107
/// Parameters for loading a previously written diff file.
#[derive(Debug, Clone)]
pub struct LoadDiffInputs {
    /// Path of the diff document to load.
    pub diff: PathBuf,
    /// Explicit format of the diff document; when `None` the format is
    /// detected from the path.
    pub format: Option<InputFormat>,
    /// IRI of the named graph holding the A-only statements. Only honoured
    /// when `graph_b` is given as well; otherwise both sides are inferred
    /// from the document.
    pub graph_a: Option<String>,
    /// IRI of the named graph holding the B-only statements. Only honoured
    /// when `graph_a` is given as well; otherwise both sides are inferred
    /// from the document.
    pub graph_b: Option<String>,
}
115
116fn detect_or_override(path: &Path, over: Option<InputFormat>) -> Result<InputFormat> {
117    match over {
118        Some(f) => Ok(f),
119        None => crate::cli::detect_format(path),
120    }
121}
122
123fn open_writer(out: Option<&Path>) -> Result<Box<dyn Write>> {
124    Ok(match out {
125        Some(p) => Box::new(BufWriter::new(
126            File::create(p).with_context(|| format!("failed to create {}", p.display()))?,
127        )),
128        None => Box::new(BufWriter::new(stdout().lock())),
129    })
130}
131
/// Object-safe abstraction over the quad serializers used for diff output,
/// so callers can write quads without knowing the concrete output format.
trait QuadSink {
    /// Serialises a single quad to the underlying output.
    fn write(&mut self, quad: &Quad) -> Result<()>;
    /// Consumes the sink and completes the output document.
    fn finish(self: Box<Self>) -> Result<()>;
}
136
137struct TrigSink<W: Write> {
138    inner: oxttl::trig::WriterTriGSerializer<W>,
139}
140impl<W: Write> QuadSink for TrigSink<W> {
141    fn write(&mut self, quad: &Quad) -> Result<()> {
142        self.inner
143            .serialize_quad(quad)
144            .context("failed to serialize quad to TriG")
145    }
146    fn finish(self: Box<Self>) -> Result<()> {
147        self.inner
148            .finish()
149            .context("failed to finalize TriG output")?;
150        Ok(())
151    }
152}
153
154struct NqSink<W: Write> {
155    inner: oxttl::nquads::WriterNQuadsSerializer<W>,
156}
157impl<W: Write> QuadSink for NqSink<W> {
158    fn write(&mut self, quad: &Quad) -> Result<()> {
159        self.inner
160            .serialize_quad(quad)
161            .context("failed to serialize quad to N-Quads")
162    }
163    fn finish(self: Box<Self>) -> Result<()> {
164        let _ = self.inner.finish();
165        Ok(())
166    }
167}
168
169fn make_sink(
170    format: OutputFormat,
171    w: Box<dyn Write>,
172    prefixes: &[(String, String)],
173) -> Result<Box<dyn QuadSink>> {
174    Ok(match format {
175        OutputFormat::Trig => {
176            let mut s = oxttl::TriGSerializer::new();
177            for (name, iri) in prefixes {
178                s = s
179                    .with_prefix(name, iri)
180                    .with_context(|| format!("invalid prefix IRI for `{name}`: <{iri}>"))?;
181            }
182            Box::new(TrigSink {
183                inner: s.for_writer(w),
184            })
185        }
186        OutputFormat::Nq => Box::new(NqSink {
187            inner: oxttl::NQuadsSerializer::new().for_writer(w),
188        }),
189    })
190}
191
/// Concatenates the prefix lists of A and B, keeping only the first
/// declaration of each prefix name (so A takes precedence over B) and
/// preserving the order of first appearance.
fn merge_prefixes(a: Vec<(String, String)>, b: Vec<(String, String)>) -> Vec<(String, String)> {
    let capacity = a.len() + b.len();
    let mut known_names: HashSet<String> = HashSet::with_capacity(capacity);
    let mut merged: Vec<(String, String)> = Vec::with_capacity(capacity);
    merged.extend(
        a.into_iter()
            .chain(b)
            // `insert` returns false for a name that was already declared.
            .filter(|(name, _)| known_names.insert(name.clone())),
    );
    merged
}
202
203fn make_quad(t: Triple, g: &GraphName) -> Quad {
204    Quad {
205        subject: t.subject,
206        predicate: t.predicate,
207        object: t.object,
208        graph_name: g.clone(),
209    }
210}
211
212/// Canonicalise `quads` in place using RDFC-1.0 (oxrdf, `rdfc-10` feature).
213/// The blank-node identifiers in the returned vector are stable canonical
214/// labels (`_:c14n…`) determined by the graph's structure.
215fn canonicalize_quads(quads: Vec<Quad>) -> Vec<Quad> {
216    let mut dataset: Dataset = quads.into_iter().collect();
217    dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
218    dataset.iter().map(Quad::from).collect()
219}
220
221pub fn compute_diff(inputs: &DiffInputs) -> Result<DiffResult> {
222    let fmt_a = detect_or_override(&inputs.file_a, inputs.format_a)?;
223    let fmt_b = detect_or_override(&inputs.file_b, inputs.format_b)?;
224
225    let (graph_a, graph_b) = resolve_graph_iris(
226        &inputs.file_a,
227        &inputs.file_b,
228        inputs.graph_a.as_deref(),
229        inputs.graph_b.as_deref(),
230    )?;
231    let quad_mode = is_quad_format(fmt_a) || is_quad_format(fmt_b);
232
233    if inputs.ignore_blank_nodes {
234        return compute_diff_skip_bnodes(inputs, fmt_a, fmt_b, &graph_a, &graph_b, quad_mode);
235    }
236
237    let mut quads_a: Vec<Quad> = Vec::new();
238    let reader_a = open_reader(&inputs.file_a)?;
239    let outcome_a = parse_quads(reader_a, fmt_a, |q| {
240        quads_a.push(q);
241        Ok(())
242    })
243    .with_context(|| format!("while parsing {}", inputs.file_a.display()))?;
244
245    let mut quads_b: Vec<Quad> = Vec::new();
246    let reader_b = open_reader(&inputs.file_b)?;
247    let outcome_b = parse_quads(reader_b, fmt_b, |q| {
248        quads_b.push(q);
249        Ok(())
250    })
251    .with_context(|| format!("while parsing {}", inputs.file_b.display()))?;
252
253    if outcome_a.bnode_count > 0 || outcome_b.bnode_count > 0 {
254        // Per W3C RDFC-1.0: canonicalise each side independently, then
255        // perform a syntactic set-diff on the resulting quads. Identical
256        // sub-graphs receive identical canonical bnode labels and therefore
257        // compare equal across sides.
258        quads_a = canonicalize_quads(quads_a);
259        quads_b = canonicalize_quads(quads_b);
260    }
261
262    let mut set: HashSet<Quad> = quads_a.into_iter().collect();
263    let mut b_only: Vec<Quad> = Vec::new();
264    for q in quads_b {
265        if !set.remove(&q) {
266            b_only.push(q);
267        }
268    }
269    let mut a_only: Vec<Quad> = set.into_iter().collect();
270
271    // In triple mode, statements parsed from triple inputs all carry
272    // `DefaultGraph`. Tag survivors with the per-side wrapper graph IRI
273    // *after* the set-diff so equal triples on both sides cancel.
274    if !quad_mode {
275        let g_a = GraphName::NamedNode(graph_a.clone());
276        let g_b = GraphName::NamedNode(graph_b.clone());
277        for q in &mut a_only {
278            if matches!(q.graph_name, GraphName::DefaultGraph) {
279                q.graph_name = g_a.clone();
280            }
281        }
282        for q in &mut b_only {
283            if matches!(q.graph_name, GraphName::DefaultGraph) {
284                q.graph_name = g_b.clone();
285            }
286        }
287    }
288
289    let prefixes = merge_prefixes(outcome_a.prefixes, outcome_b.prefixes);
290    let a_total = outcome_a.total;
291    let b_total = outcome_b.total;
292    let a_only_count = a_only.len() as u64;
293    let b_only_count = b_only.len() as u64;
294    let common = a_total.saturating_sub(a_only_count);
295
296    let stats = DiffStats {
297        a_total,
298        b_total,
299        common,
300        a_only: a_only_count,
301        b_only: b_only_count,
302        a_skipped_bnodes: 0,
303        b_skipped_bnodes: 0,
304    };
305
306    Ok(DiffResult {
307        a_only,
308        b_only,
309        prefixes,
310        graph_a,
311        graph_b,
312        stats,
313        source_a: Some(inputs.file_a.clone()),
314        source_b: Some(inputs.file_b.clone()),
315        format_a: Some(fmt_a),
316        format_b: Some(fmt_b),
317        quad_mode,
318    })
319}
320
321fn compute_diff_skip_bnodes(
322    inputs: &DiffInputs,
323    fmt_a: InputFormat,
324    fmt_b: InputFormat,
325    graph_a: &NamedNode,
326    graph_b: &NamedNode,
327    quad_mode: bool,
328) -> Result<DiffResult> {
329    let mut set: HashSet<Triple> = HashSet::new();
330    let reader_a = open_reader(&inputs.file_a)?;
331    let outcome_a = parse_triples(reader_a, fmt_a, |t| {
332        set.insert(t);
333        Ok(())
334    })
335    .with_context(|| format!("while parsing {}", inputs.file_a.display()))?;
336
337    let mut b_only_triples: Vec<Triple> = Vec::new();
338    let reader_b = open_reader(&inputs.file_b)?;
339    let outcome_b = parse_triples(reader_b, fmt_b, |t| {
340        if !set.remove(&t) {
341            b_only_triples.push(t);
342        }
343        Ok(())
344    })
345    .with_context(|| format!("while parsing {}", inputs.file_b.display()))?;
346
347    let a_only_triples: Vec<Triple> = set.into_iter().collect();
348    let prefixes = merge_prefixes(outcome_a.prefixes, outcome_b.prefixes);
349
350    let a_only_count = a_only_triples.len() as u64;
351    let b_only_count = b_only_triples.len() as u64;
352    let common = outcome_a.total.saturating_sub(a_only_count);
353
354    let g_a = GraphName::NamedNode(graph_a.clone());
355    let g_b = GraphName::NamedNode(graph_b.clone());
356    let a_only: Vec<Quad> = a_only_triples
357        .into_iter()
358        .map(|t| make_quad(t, &g_a))
359        .collect();
360    let b_only: Vec<Quad> = b_only_triples
361        .into_iter()
362        .map(|t| make_quad(t, &g_b))
363        .collect();
364
365    let stats = DiffStats {
366        a_total: outcome_a.total,
367        b_total: outcome_b.total,
368        common,
369        a_only: a_only_count,
370        b_only: b_only_count,
371        a_skipped_bnodes: outcome_a.skipped,
372        b_skipped_bnodes: outcome_b.skipped,
373    };
374
375    Ok(DiffResult {
376        a_only,
377        b_only,
378        prefixes,
379        graph_a: graph_a.clone(),
380        graph_b: graph_b.clone(),
381        stats,
382        source_a: Some(inputs.file_a.clone()),
383        source_b: Some(inputs.file_b.clone()),
384        format_a: Some(fmt_a),
385        format_b: Some(fmt_b),
386        quad_mode,
387    })
388}
389
/// Computes the two per-side file paths used for a quad-mode diff.
///
/// `-a` / `-b` is inserted between the file stem and the extension, so
/// `out = "diff.trig"` yields `("diff-a.trig", "diff-b.trig")`. The stem
/// falls back to `"diff"` and the extension to nothing when they are absent
/// or not valid UTF-8. The results stay in `out`'s parent directory when it
/// has a non-empty one.
fn split_output_paths(out: &Path) -> (PathBuf, PathBuf) {
    let stem = out.file_stem().and_then(OsStr::to_str).unwrap_or("diff");
    let dotted_ext = match out.extension().and_then(OsStr::to_str) {
        Some(ext) => format!(".{ext}"),
        None => String::new(),
    };
    // An empty parent (bare file name) is treated like no parent at all.
    let dir = out.parent().filter(|p| !p.as_os_str().is_empty());

    let side_path = |side: &str| -> PathBuf {
        let file_name = format!("{stem}-{side}{dotted_ext}");
        match dir {
            Some(d) => d.join(file_name),
            None => PathBuf::from(file_name),
        }
    };
    (side_path("a"), side_path("b"))
}
413
414pub fn write_diff(result: &DiffResult, out: Option<&Path>, format: OutputFormat) -> Result<()> {
415    if result.quad_mode {
416        let Some(out_path) = out else {
417            bail!(
418                "quad-shaped inputs (N-Quads/TriG) require --output: two files \
419                 are written (one per side) to preserve the source named graphs"
420            );
421        };
422        let (path_a, path_b) = split_output_paths(out_path);
423
424        let writer_a = open_writer(Some(&path_a))?;
425        let mut sink_a = make_sink(format, writer_a, &result.prefixes)?;
426        for q in &result.a_only {
427            sink_a.write(q)?;
428        }
429        sink_a.finish()?;
430
431        let writer_b = open_writer(Some(&path_b))?;
432        let mut sink_b = make_sink(format, writer_b, &result.prefixes)?;
433        for q in &result.b_only {
434            sink_b.write(q)?;
435        }
436        sink_b.finish()?;
437        return Ok(());
438    }
439
440    let writer = open_writer(out)?;
441    let mut sink = make_sink(format, writer, &result.prefixes)?;
442    for q in &result.b_only {
443        sink.write(q)?;
444    }
445    for q in &result.a_only {
446        sink.write(q)?;
447    }
448    sink.finish()?;
449    Ok(())
450}
451
452pub fn run_diff(args: &crate::cli::Args) -> Result<DiffStats> {
453    let inputs = DiffInputs {
454        file_a: args
455            .file_a
456            .clone()
457            .ok_or_else(|| anyhow::anyhow!("<FILE_A> is required"))?,
458        file_b: args
459            .file_b
460            .clone()
461            .ok_or_else(|| anyhow::anyhow!("<FILE_B> is required"))?,
462        format_a: args.format_a,
463        format_b: args.format_b,
464        graph_a: args.graph_a.clone(),
465        graph_b: args.graph_b.clone(),
466        ignore_blank_nodes: args.ignore_blank_nodes,
467    };
468    let result = compute_diff(&inputs)?;
469    write_diff(&result, args.output.as_deref(), args.output_format)?;
470    Ok(result.stats)
471}
472
473/// Stream-iterate the bnode-free common triples of two RDF files.
474pub fn stream_common_triples<F: FnMut(&Triple) -> Result<()>>(
475    file_a: &Path,
476    file_b: &Path,
477    format_a: Option<InputFormat>,
478    format_b: Option<InputFormat>,
479    mut on_triple: F,
480) -> Result<()> {
481    let fmt_a = detect_or_override(file_a, format_a)?;
482    let fmt_b = detect_or_override(file_b, format_b)?;
483
484    let mut set: HashSet<Triple> = HashSet::new();
485    let reader_a = open_reader(file_a)?;
486    parse_triples(reader_a, fmt_a, |t| {
487        set.insert(t);
488        Ok(())
489    })
490    .with_context(|| format!("while parsing {}", file_a.display()))?;
491
492    let reader_b = open_reader(file_b)?;
493    parse_triples(reader_b, fmt_b, |t| {
494        if set.contains(&t) {
495            on_triple(&t)?;
496        }
497        Ok(())
498    })
499    .with_context(|| format!("while parsing {}", file_b.display()))?;
500    Ok(())
501}
502
/// Load a previously-written diff file (TriG or N-Quads). Expects the
/// triple-mode topology: a single document with two named graphs.
///
/// The A and B graph IRIs are taken from `inputs` when *both* are supplied;
/// otherwise they are inferred as the first two distinct named graphs in
/// document order. Quads whose graph is neither of the two are dropped.
///
/// The returned result has no source paths or formats, `quad_mode` is
/// `false`, and the statistics only reflect the loaded diff itself
/// (`common` is 0; the totals equal the per-side counts).
///
/// # Errors
/// Fails when the format is neither TriG nor N-Quads, when the document
/// cannot be opened or parsed, when a supplied graph IRI is invalid, or
/// when no named graph is present and the sides cannot be inferred.
pub fn load_diff_file(inputs: &LoadDiffInputs) -> Result<DiffResult> {
    let fmt = detect_or_override(&inputs.diff, inputs.format)?;
    let mut prefixes: Vec<(String, String)> = Vec::new();

    let mut quads: Vec<Quad> = Vec::new();
    match fmt {
        InputFormat::Trig => {
            let reader = open_reader(&inputs.diff)?;
            let mut parser = oxttl::TriGParser::new().for_reader(reader);
            // Iterate through `by_ref` so the parser is still available
            // afterwards for reading the prefixes it collected.
            for q in parser.by_ref() {
                quads.push(q.context("TriG parse error")?);
            }
            prefixes.extend(
                parser
                    .prefixes()
                    .map(|(k, v)| (k.to_string(), v.to_string())),
            );
        }
        InputFormat::Nq => {
            // No prefixes are collected for N-Quads input.
            let reader = open_reader(&inputs.diff)?;
            let parser = oxttl::NQuadsParser::new().for_reader(reader);
            for q in parser {
                quads.push(q.context("N-Quads parse error")?);
            }
        }
        other => bail!(
            "diff file format {:?} does not carry named graphs; use TriG or N-Quads",
            other
        ),
    }

    let (graph_a, graph_b) = match (inputs.graph_a.as_deref(), inputs.graph_b.as_deref()) {
        (Some(a), Some(b)) => (
            NamedNode::new(a).with_context(|| format!("invalid --graph-a IRI: {a}"))?,
            NamedNode::new(b).with_context(|| format!("invalid --graph-b IRI: {b}"))?,
        ),
        // NOTE(review): when only one of the two IRIs is supplied it is
        // ignored and both sides are inferred — confirm this is intended.
        _ => {
            // Collect the first two distinct named graphs in document order.
            let mut seen: Vec<NamedNode> = Vec::new();
            for q in &quads {
                if let GraphName::NamedNode(n) = &q.graph_name
                    && !seen.iter().any(|s| s == n)
                {
                    seen.push(n.clone());
                    if seen.len() == 2 {
                        break;
                    }
                }
            }
            match seen.as_slice() {
                [a, b] => (a.clone(), b.clone()),
                // A single named graph is used for both sides; because the
                // A arm is tested first below, all of its quads land in
                // `a_only`.
                // NOTE(review): a diff that only contains B-side changes
                // would therefore be loaded as A-only — verify.
                [single] => (single.clone(), single.clone()),
                _ => bail!("diff file contains no named graphs; cannot determine A/B sides"),
            }
        }
    };

    // Partition the quads by side; quads in any other graph (including the
    // default graph) are discarded.
    let mut a_only: Vec<Quad> = Vec::new();
    let mut b_only: Vec<Quad> = Vec::new();
    for q in quads {
        match &q.graph_name {
            GraphName::NamedNode(g) if g == &graph_a => a_only.push(q),
            GraphName::NamedNode(g) if g == &graph_b => b_only.push(q),
            _ => {}
        }
    }

    // The original inputs are not available here, so the totals are just
    // the per-side counts of the diff and nothing is counted as common.
    let stats = DiffStats {
        a_total: a_only.len() as u64,
        b_total: b_only.len() as u64,
        common: 0,
        a_only: a_only.len() as u64,
        b_only: b_only.len() as u64,
        a_skipped_bnodes: 0,
        b_skipped_bnodes: 0,
    };

    Ok(DiffResult {
        a_only,
        b_only,
        prefixes,
        graph_a,
        graph_b,
        stats,
        source_a: None,
        source_b: None,
        format_a: None,
        format_b: None,
        quad_mode: false,
    })
}
595
#[cfg(test)]
mod tests {
    use super::*;
    use crate::cli::OutputFormat;
    use std::path::PathBuf;

    /// Resolves `name` inside the crate's `tests/fixtures` directory.
    fn fixtures(name: &str) -> PathBuf {
        let mut p = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        p.push("tests");
        p.push("fixtures");
        p.push(name);
        p
    }

    /// Computes a diff of two Turtle fixtures, writes it as TriG, loads it
    /// back with the same graph IRIs and checks that both sides survive the
    /// round trip unchanged.
    #[test]
    fn compute_then_load_round_trip_trig() {
        let inputs = DiffInputs {
            file_a: fixtures("a.ttl"),
            file_b: fixtures("b.ttl"),
            format_a: None,
            format_b: None,
            graph_a: None,
            graph_b: None,
            ignore_blank_nodes: false,
        };
        let computed = compute_diff(&inputs).unwrap();

        // Fixed file name in the system temp dir; remove any stale copy
        // left behind by a previous run (a missing file is not an error).
        let tmp = std::env::temp_dir().join("rdf-compare-roundtrip.trig");
        let _ = std::fs::remove_file(&tmp);
        write_diff(&computed, Some(&tmp), OutputFormat::Trig).unwrap();

        // The graph IRIs are passed explicitly, so the loader does not have
        // to infer the sides from the document.
        let loaded = load_diff_file(&LoadDiffInputs {
            diff: tmp,
            format: None,
            graph_a: Some(computed.graph_a.as_str().to_string()),
            graph_b: Some(computed.graph_b.as_str().to_string()),
        })
        .unwrap();

        // Compare as sets: statement order is not part of the contract.
        let computed_a: HashSet<Triple> = computed.a_only_triples().collect();
        let computed_b: HashSet<Triple> = computed.b_only_triples().collect();
        let loaded_a: HashSet<Triple> = loaded.a_only_triples().collect();
        let loaded_b: HashSet<Triple> = loaded.b_only_triples().collect();
        assert_eq!(computed_a, loaded_a);
        assert_eq!(computed_b, loaded_b);
    }
}