1use crate::cli::{InputFormat, OutputFormat};
2use crate::graph_iri::resolve_graph_iris;
3use crate::input::{is_quad_format, open_reader, parse_quads, parse_triples, quad_to_triple};
4use anyhow::{Context, Result, bail};
5use oxrdf::dataset::CanonicalizationAlgorithm;
6use oxrdf::{Dataset, GraphName, NamedNode, NamedOrBlankNode, Quad, Triple};
7use std::collections::HashSet;
8use std::ffi::OsStr;
9use std::fs::File;
10use std::io::{BufWriter, Write, stdout};
11use std::path::{Path, PathBuf};
12
/// Statement counters describing the outcome of a diff.
#[derive(Debug, Default, Clone, Copy)]
pub struct DiffStats {
    /// Total reported for input A.
    pub a_total: u64,
    /// Total reported for input B.
    pub b_total: u64,
    /// Statements considered present on both sides.
    pub common: u64,
    /// Statements found only in A.
    pub a_only: u64,
    /// Statements found only in B.
    pub b_only: u64,
    /// Statements of A reported as skipped by the parser when blank nodes are ignored.
    pub a_skipped_bnodes: u64,
    /// Statements of B reported as skipped by the parser when blank nodes are ignored.
    pub b_skipped_bnodes: u64,
}

impl DiffStats {
    /// Returns `true` when at least one side holds a statement the other lacks.
    ///
    /// Totals, the common count and the skipped counters do not influence the result.
    pub fn has_differences(&self) -> bool {
        !(self.a_only == 0 && self.b_only == 0)
    }
}
29
/// Outcome of a diff: the quads unique to each side plus the metadata needed
/// to serialize or report on them.
#[derive(Debug, Clone)]
pub struct DiffResult {
    /// Quads present only on side A.
    pub a_only: Vec<Quad>,
    /// Quads present only on side B.
    pub b_only: Vec<Quad>,
    /// `(prefix name, IRI)` pairs handed to the TriG serializer on output.
    pub prefixes: Vec<(String, String)>,
    /// Named graph identifying side A.
    pub graph_a: NamedNode,
    /// Named graph identifying side B.
    pub graph_b: NamedNode,
    /// Counters describing the diff.
    pub stats: DiffStats,
    /// Path of input A; `None` when the result was loaded from a diff file.
    pub source_a: Option<PathBuf>,
    /// Path of input B; `None` when the result was loaded from a diff file.
    pub source_b: Option<PathBuf>,
    /// Format used to parse input A; `None` when loaded from a diff file.
    pub format_a: Option<InputFormat>,
    /// Format used to parse input B; `None` when loaded from a diff file.
    pub format_b: Option<InputFormat>,
    /// `true` when at least one input was a quad format; `write_diff` then
    /// writes one output file per side instead of a single stream.
    pub quad_mode: bool,
}
56
57fn graph_name_str(g: &GraphName) -> &str {
58 match g {
59 GraphName::NamedNode(n) => n.as_str(),
60 GraphName::BlankNode(b) => b.as_str(),
61 GraphName::DefaultGraph => "",
62 }
63}
64
65fn quad_order(a: &Quad, b: &Quad) -> std::cmp::Ordering {
66 let sa = match &a.subject {
67 NamedOrBlankNode::NamedNode(n) => n.as_str(),
68 NamedOrBlankNode::BlankNode(bn) => bn.as_str(),
69 };
70 let sb = match &b.subject {
71 NamedOrBlankNode::NamedNode(n) => n.as_str(),
72 NamedOrBlankNode::BlankNode(bn) => bn.as_str(),
73 };
74 graph_name_str(&a.graph_name)
75 .cmp(graph_name_str(&b.graph_name))
76 .then_with(|| sa.cmp(sb))
77 .then_with(|| a.predicate.as_str().cmp(b.predicate.as_str()))
78}
79
80impl DiffResult {
81 pub fn sort_rows(&mut self) {
82 self.a_only.sort_unstable_by(quad_order);
83 self.b_only.sort_unstable_by(quad_order);
84 }
85
86 pub fn a_only_triples(&self) -> impl Iterator<Item = Triple> + '_ {
87 self.a_only.iter().map(quad_to_triple)
88 }
89
90 pub fn b_only_triples(&self) -> impl Iterator<Item = Triple> + '_ {
91 self.b_only.iter().map(quad_to_triple)
92 }
93}
94
/// Parameters for `compute_diff`.
#[derive(Debug, Clone)]
pub struct DiffInputs {
    /// Path of input A.
    pub file_a: PathBuf,
    /// Path of input B.
    pub file_b: PathBuf,
    /// Explicit format for input A; `None` falls back to detection from the path.
    pub format_a: Option<InputFormat>,
    /// Explicit format for input B; `None` falls back to detection from the path.
    pub format_b: Option<InputFormat>,
    /// Optional graph IRI for side A, forwarded to `resolve_graph_iris`.
    pub graph_a: Option<String>,
    /// Optional graph IRI for side B, forwarded to `resolve_graph_iris`.
    pub graph_b: Option<String>,
    /// When set, `compute_diff` takes the triple-based path and reports the
    /// parser's skipped counts in `DiffStats` (presumably statements that
    /// contain blank nodes; confirm against `parse_triples`).
    pub ignore_blank_nodes: bool,
}
107
/// Parameters for `load_diff_file`.
#[derive(Debug, Clone)]
pub struct LoadDiffInputs {
    /// Path of the previously written diff file.
    pub diff: PathBuf,
    /// Explicit format; `None` falls back to detection from the path. Only
    /// TriG and N-Quads are accepted by `load_diff_file`.
    pub format: Option<InputFormat>,
    /// Graph IRI of side A. Only honoured when `graph_b` is provided as well.
    pub graph_a: Option<String>,
    /// Graph IRI of side B. Only honoured when `graph_a` is provided as well.
    pub graph_b: Option<String>,
}
115
116fn detect_or_override(path: &Path, over: Option<InputFormat>) -> Result<InputFormat> {
117 match over {
118 Some(f) => Ok(f),
119 None => crate::cli::detect_format(path),
120 }
121}
122
123fn open_writer(out: Option<&Path>) -> Result<Box<dyn Write>> {
124 Ok(match out {
125 Some(p) => Box::new(BufWriter::new(
126 File::create(p).with_context(|| format!("failed to create {}", p.display()))?,
127 )),
128 None => Box::new(BufWriter::new(stdout().lock())),
129 })
130}
131
/// Format-agnostic destination for serialized quads.
///
/// Lets `write_diff` drive different serializers through one boxed interface.
trait QuadSink {
    /// Serializes a single quad to the underlying output.
    fn write(&mut self, quad: &Quad) -> Result<()>;
    /// Consumes the sink and finalizes the output.
    fn finish(self: Box<Self>) -> Result<()>;
}
136
137struct TrigSink<W: Write> {
138 inner: oxttl::trig::WriterTriGSerializer<W>,
139}
140impl<W: Write> QuadSink for TrigSink<W> {
141 fn write(&mut self, quad: &Quad) -> Result<()> {
142 self.inner
143 .serialize_quad(quad)
144 .context("failed to serialize quad to TriG")
145 }
146 fn finish(self: Box<Self>) -> Result<()> {
147 self.inner
148 .finish()
149 .context("failed to finalize TriG output")?;
150 Ok(())
151 }
152}
153
154struct NqSink<W: Write> {
155 inner: oxttl::nquads::WriterNQuadsSerializer<W>,
156}
157impl<W: Write> QuadSink for NqSink<W> {
158 fn write(&mut self, quad: &Quad) -> Result<()> {
159 self.inner
160 .serialize_quad(quad)
161 .context("failed to serialize quad to N-Quads")
162 }
163 fn finish(self: Box<Self>) -> Result<()> {
164 let _ = self.inner.finish();
165 Ok(())
166 }
167}
168
169fn make_sink(
170 format: OutputFormat,
171 w: Box<dyn Write>,
172 prefixes: &[(String, String)],
173) -> Result<Box<dyn QuadSink>> {
174 Ok(match format {
175 OutputFormat::Trig => {
176 let mut s = oxttl::TriGSerializer::new();
177 for (name, iri) in prefixes {
178 s = s
179 .with_prefix(name, iri)
180 .with_context(|| format!("invalid prefix IRI for `{name}`: <{iri}>"))?;
181 }
182 Box::new(TrigSink {
183 inner: s.for_writer(w),
184 })
185 }
186 OutputFormat::Nq => Box::new(NqSink {
187 inner: oxttl::NQuadsSerializer::new().for_writer(w),
188 }),
189 })
190}
191
/// Concatenates two prefix lists, keeping the first declaration of each name.
///
/// Entries of `a` take precedence over entries of `b`, and the relative order
/// of the surviving entries is preserved.
fn merge_prefixes(a: Vec<(String, String)>, b: Vec<(String, String)>) -> Vec<(String, String)> {
    let upper_bound = a.len() + b.len();
    let mut known_names: HashSet<String> = HashSet::with_capacity(upper_bound);
    let mut merged: Vec<(String, String)> = Vec::with_capacity(upper_bound);
    merged.extend(
        a.into_iter()
            .chain(b)
            // `insert` returns false for a name that was already declared.
            .filter(|(name, _)| known_names.insert(name.clone())),
    );
    merged
}
202
203fn make_quad(t: Triple, g: &GraphName) -> Quad {
204 Quad {
205 subject: t.subject,
206 predicate: t.predicate,
207 object: t.object,
208 graph_name: g.clone(),
209 }
210}
211
212fn canonicalize_quads(quads: Vec<Quad>) -> Vec<Quad> {
216 let mut dataset: Dataset = quads.into_iter().collect();
217 dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
218 dataset.iter().map(Quad::from).collect()
219}
220
221pub fn compute_diff(inputs: &DiffInputs) -> Result<DiffResult> {
222 let fmt_a = detect_or_override(&inputs.file_a, inputs.format_a)?;
223 let fmt_b = detect_or_override(&inputs.file_b, inputs.format_b)?;
224
225 let (graph_a, graph_b) = resolve_graph_iris(
226 &inputs.file_a,
227 &inputs.file_b,
228 inputs.graph_a.as_deref(),
229 inputs.graph_b.as_deref(),
230 )?;
231 let quad_mode = is_quad_format(fmt_a) || is_quad_format(fmt_b);
232
233 if inputs.ignore_blank_nodes {
234 return compute_diff_skip_bnodes(inputs, fmt_a, fmt_b, &graph_a, &graph_b, quad_mode);
235 }
236
237 let mut quads_a: Vec<Quad> = Vec::new();
238 let reader_a = open_reader(&inputs.file_a)?;
239 let outcome_a = parse_quads(reader_a, fmt_a, |q| {
240 quads_a.push(q);
241 Ok(())
242 })
243 .with_context(|| format!("while parsing {}", inputs.file_a.display()))?;
244
245 let mut quads_b: Vec<Quad> = Vec::new();
246 let reader_b = open_reader(&inputs.file_b)?;
247 let outcome_b = parse_quads(reader_b, fmt_b, |q| {
248 quads_b.push(q);
249 Ok(())
250 })
251 .with_context(|| format!("while parsing {}", inputs.file_b.display()))?;
252
253 if outcome_a.bnode_count > 0 || outcome_b.bnode_count > 0 {
254 quads_a = canonicalize_quads(quads_a);
259 quads_b = canonicalize_quads(quads_b);
260 }
261
262 let mut set: HashSet<Quad> = quads_a.into_iter().collect();
263 let mut b_only: Vec<Quad> = Vec::new();
264 for q in quads_b {
265 if !set.remove(&q) {
266 b_only.push(q);
267 }
268 }
269 let mut a_only: Vec<Quad> = set.into_iter().collect();
270
271 if !quad_mode {
275 let g_a = GraphName::NamedNode(graph_a.clone());
276 let g_b = GraphName::NamedNode(graph_b.clone());
277 for q in &mut a_only {
278 if matches!(q.graph_name, GraphName::DefaultGraph) {
279 q.graph_name = g_a.clone();
280 }
281 }
282 for q in &mut b_only {
283 if matches!(q.graph_name, GraphName::DefaultGraph) {
284 q.graph_name = g_b.clone();
285 }
286 }
287 }
288
289 let prefixes = merge_prefixes(outcome_a.prefixes, outcome_b.prefixes);
290 let a_total = outcome_a.total;
291 let b_total = outcome_b.total;
292 let a_only_count = a_only.len() as u64;
293 let b_only_count = b_only.len() as u64;
294 let common = a_total.saturating_sub(a_only_count);
295
296 let stats = DiffStats {
297 a_total,
298 b_total,
299 common,
300 a_only: a_only_count,
301 b_only: b_only_count,
302 a_skipped_bnodes: 0,
303 b_skipped_bnodes: 0,
304 };
305
306 Ok(DiffResult {
307 a_only,
308 b_only,
309 prefixes,
310 graph_a,
311 graph_b,
312 stats,
313 source_a: Some(inputs.file_a.clone()),
314 source_b: Some(inputs.file_b.clone()),
315 format_a: Some(fmt_a),
316 format_b: Some(fmt_b),
317 quad_mode,
318 })
319}
320
321fn compute_diff_skip_bnodes(
322 inputs: &DiffInputs,
323 fmt_a: InputFormat,
324 fmt_b: InputFormat,
325 graph_a: &NamedNode,
326 graph_b: &NamedNode,
327 quad_mode: bool,
328) -> Result<DiffResult> {
329 let mut set: HashSet<Triple> = HashSet::new();
330 let reader_a = open_reader(&inputs.file_a)?;
331 let outcome_a = parse_triples(reader_a, fmt_a, |t| {
332 set.insert(t);
333 Ok(())
334 })
335 .with_context(|| format!("while parsing {}", inputs.file_a.display()))?;
336
337 let mut b_only_triples: Vec<Triple> = Vec::new();
338 let reader_b = open_reader(&inputs.file_b)?;
339 let outcome_b = parse_triples(reader_b, fmt_b, |t| {
340 if !set.remove(&t) {
341 b_only_triples.push(t);
342 }
343 Ok(())
344 })
345 .with_context(|| format!("while parsing {}", inputs.file_b.display()))?;
346
347 let a_only_triples: Vec<Triple> = set.into_iter().collect();
348 let prefixes = merge_prefixes(outcome_a.prefixes, outcome_b.prefixes);
349
350 let a_only_count = a_only_triples.len() as u64;
351 let b_only_count = b_only_triples.len() as u64;
352 let common = outcome_a.total.saturating_sub(a_only_count);
353
354 let g_a = GraphName::NamedNode(graph_a.clone());
355 let g_b = GraphName::NamedNode(graph_b.clone());
356 let a_only: Vec<Quad> = a_only_triples
357 .into_iter()
358 .map(|t| make_quad(t, &g_a))
359 .collect();
360 let b_only: Vec<Quad> = b_only_triples
361 .into_iter()
362 .map(|t| make_quad(t, &g_b))
363 .collect();
364
365 let stats = DiffStats {
366 a_total: outcome_a.total,
367 b_total: outcome_b.total,
368 common,
369 a_only: a_only_count,
370 b_only: b_only_count,
371 a_skipped_bnodes: outcome_a.skipped,
372 b_skipped_bnodes: outcome_b.skipped,
373 };
374
375 Ok(DiffResult {
376 a_only,
377 b_only,
378 prefixes,
379 graph_a: graph_a.clone(),
380 graph_b: graph_b.clone(),
381 stats,
382 source_a: Some(inputs.file_a.clone()),
383 source_b: Some(inputs.file_b.clone()),
384 format_a: Some(fmt_a),
385 format_b: Some(fmt_b),
386 quad_mode,
387 })
388}
389
/// Derives the two per-side output paths from a single `--output` path.
///
/// `dir/name.ext` becomes `dir/name-a.ext` and `dir/name-b.ext`. A missing
/// or non-UTF-8 file stem falls back to `diff`; a missing or non-UTF-8
/// extension is dropped. When `out` has no (or an empty) parent the results
/// are bare file names.
fn split_output_paths(out: &Path) -> (PathBuf, PathBuf) {
    let stem = match out.file_stem().and_then(OsStr::to_str) {
        Some(s) => s.to_string(),
        None => "diff".to_string(),
    };
    let ext = match out.extension().and_then(OsStr::to_str) {
        Some(e) => format!(".{e}"),
        None => String::new(),
    };
    // An empty parent (plain file name) is treated the same as no parent.
    let dir = out.parent().filter(|p| !p.as_os_str().is_empty());

    let sibling = |suffix: &str| -> PathBuf {
        let file_name = format!("{stem}-{suffix}{ext}");
        match dir {
            Some(d) => d.join(file_name),
            None => PathBuf::from(file_name),
        }
    };
    (sibling("a"), sibling("b"))
}
413
414pub fn write_diff(result: &DiffResult, out: Option<&Path>, format: OutputFormat) -> Result<()> {
415 if result.quad_mode {
416 let Some(out_path) = out else {
417 bail!(
418 "quad-shaped inputs (N-Quads/TriG) require --output: two files \
419 are written (one per side) to preserve the source named graphs"
420 );
421 };
422 let (path_a, path_b) = split_output_paths(out_path);
423
424 let writer_a = open_writer(Some(&path_a))?;
425 let mut sink_a = make_sink(format, writer_a, &result.prefixes)?;
426 for q in &result.a_only {
427 sink_a.write(q)?;
428 }
429 sink_a.finish()?;
430
431 let writer_b = open_writer(Some(&path_b))?;
432 let mut sink_b = make_sink(format, writer_b, &result.prefixes)?;
433 for q in &result.b_only {
434 sink_b.write(q)?;
435 }
436 sink_b.finish()?;
437 return Ok(());
438 }
439
440 let writer = open_writer(out)?;
441 let mut sink = make_sink(format, writer, &result.prefixes)?;
442 for q in &result.b_only {
443 sink.write(q)?;
444 }
445 for q in &result.a_only {
446 sink.write(q)?;
447 }
448 sink.finish()?;
449 Ok(())
450}
451
452pub fn run_diff(args: &crate::cli::Args) -> Result<DiffStats> {
453 let inputs = DiffInputs {
454 file_a: args
455 .file_a
456 .clone()
457 .ok_or_else(|| anyhow::anyhow!("<FILE_A> is required"))?,
458 file_b: args
459 .file_b
460 .clone()
461 .ok_or_else(|| anyhow::anyhow!("<FILE_B> is required"))?,
462 format_a: args.format_a,
463 format_b: args.format_b,
464 graph_a: args.graph_a.clone(),
465 graph_b: args.graph_b.clone(),
466 ignore_blank_nodes: args.ignore_blank_nodes,
467 };
468 let result = compute_diff(&inputs)?;
469 write_diff(&result, args.output.as_deref(), args.output_format)?;
470 Ok(result.stats)
471}
472
473pub fn stream_common_triples<F: FnMut(&Triple) -> Result<()>>(
475 file_a: &Path,
476 file_b: &Path,
477 format_a: Option<InputFormat>,
478 format_b: Option<InputFormat>,
479 mut on_triple: F,
480) -> Result<()> {
481 let fmt_a = detect_or_override(file_a, format_a)?;
482 let fmt_b = detect_or_override(file_b, format_b)?;
483
484 let mut set: HashSet<Triple> = HashSet::new();
485 let reader_a = open_reader(file_a)?;
486 parse_triples(reader_a, fmt_a, |t| {
487 set.insert(t);
488 Ok(())
489 })
490 .with_context(|| format!("while parsing {}", file_a.display()))?;
491
492 let reader_b = open_reader(file_b)?;
493 parse_triples(reader_b, fmt_b, |t| {
494 if set.contains(&t) {
495 on_triple(&t)?;
496 }
497 Ok(())
498 })
499 .with_context(|| format!("while parsing {}", file_b.display()))?;
500 Ok(())
501}
502
503pub fn load_diff_file(inputs: &LoadDiffInputs) -> Result<DiffResult> {
506 let fmt = detect_or_override(&inputs.diff, inputs.format)?;
507 let mut prefixes: Vec<(String, String)> = Vec::new();
508
509 let mut quads: Vec<Quad> = Vec::new();
510 match fmt {
511 InputFormat::Trig => {
512 let reader = open_reader(&inputs.diff)?;
513 let mut parser = oxttl::TriGParser::new().for_reader(reader);
514 for q in parser.by_ref() {
515 quads.push(q.context("TriG parse error")?);
516 }
517 prefixes.extend(
518 parser
519 .prefixes()
520 .map(|(k, v)| (k.to_string(), v.to_string())),
521 );
522 }
523 InputFormat::Nq => {
524 let reader = open_reader(&inputs.diff)?;
525 let parser = oxttl::NQuadsParser::new().for_reader(reader);
526 for q in parser {
527 quads.push(q.context("N-Quads parse error")?);
528 }
529 }
530 other => bail!(
531 "diff file format {:?} does not carry named graphs; use TriG or N-Quads",
532 other
533 ),
534 }
535
536 let (graph_a, graph_b) = match (inputs.graph_a.as_deref(), inputs.graph_b.as_deref()) {
537 (Some(a), Some(b)) => (
538 NamedNode::new(a).with_context(|| format!("invalid --graph-a IRI: {a}"))?,
539 NamedNode::new(b).with_context(|| format!("invalid --graph-b IRI: {b}"))?,
540 ),
541 _ => {
542 let mut seen: Vec<NamedNode> = Vec::new();
543 for q in &quads {
544 if let GraphName::NamedNode(n) = &q.graph_name
545 && !seen.iter().any(|s| s == n)
546 {
547 seen.push(n.clone());
548 if seen.len() == 2 {
549 break;
550 }
551 }
552 }
553 match seen.as_slice() {
554 [a, b] => (a.clone(), b.clone()),
555 [single] => (single.clone(), single.clone()),
556 _ => bail!("diff file contains no named graphs; cannot determine A/B sides"),
557 }
558 }
559 };
560
561 let mut a_only: Vec<Quad> = Vec::new();
562 let mut b_only: Vec<Quad> = Vec::new();
563 for q in quads {
564 match &q.graph_name {
565 GraphName::NamedNode(g) if g == &graph_a => a_only.push(q),
566 GraphName::NamedNode(g) if g == &graph_b => b_only.push(q),
567 _ => {}
568 }
569 }
570
571 let stats = DiffStats {
572 a_total: a_only.len() as u64,
573 b_total: b_only.len() as u64,
574 common: 0,
575 a_only: a_only.len() as u64,
576 b_only: b_only.len() as u64,
577 a_skipped_bnodes: 0,
578 b_skipped_bnodes: 0,
579 };
580
581 Ok(DiffResult {
582 a_only,
583 b_only,
584 prefixes,
585 graph_a,
586 graph_b,
587 stats,
588 source_a: None,
589 source_b: None,
590 format_a: None,
591 format_b: None,
592 quad_mode: false,
593 })
594}
595
#[cfg(test)]
mod tests {
    use super::*;
    use crate::cli::OutputFormat;
    use std::path::PathBuf;

    /// Resolves `name` inside the crate's `tests/fixtures` directory.
    fn fixtures(name: &str) -> PathBuf {
        let mut p = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        p.push("tests");
        p.push("fixtures");
        p.push(name);
        p
    }

    /// A diff written as TriG and loaded back with the same graph IRIs must
    /// yield the same one-sided triples.
    #[test]
    fn compute_then_load_round_trip_trig() {
        let inputs = DiffInputs {
            file_a: fixtures("a.ttl"),
            file_b: fixtures("b.ttl"),
            format_a: None,
            format_b: None,
            graph_a: None,
            graph_b: None,
            ignore_blank_nodes: false,
        };
        let computed = compute_diff(&inputs).unwrap();

        // The process id keeps concurrent test runs on one machine from
        // clobbering each other's scratch file.
        let tmp = std::env::temp_dir().join(format!(
            "rdf-compare-roundtrip-{}.trig",
            std::process::id()
        ));
        let _ = std::fs::remove_file(&tmp);
        write_diff(&computed, Some(&tmp), OutputFormat::Trig).unwrap();

        let loaded = load_diff_file(&LoadDiffInputs {
            diff: tmp.clone(),
            format: None,
            graph_a: Some(computed.graph_a.as_str().to_string()),
            graph_b: Some(computed.graph_b.as_str().to_string()),
        });
        // Clean up before unwrapping so a failure does not leave the file behind.
        let _ = std::fs::remove_file(&tmp);
        let loaded = loaded.unwrap();

        let computed_a: HashSet<Triple> = computed.a_only_triples().collect();
        let computed_b: HashSet<Triple> = computed.b_only_triples().collect();
        let loaded_a: HashSet<Triple> = loaded.a_only_triples().collect();
        let loaded_b: HashSet<Triple> = loaded.b_only_triples().collect();
        assert_eq!(computed_a, loaded_a);
        assert_eq!(computed_b, loaded_b);
    }
}