Skip to main content

rdf_compare/
input.rs

1use crate::cli::InputFormat;
2use anyhow::{Context, Result};
3use flate2::read::MultiGzDecoder;
4use oxrdf::{GraphName, NamedOrBlankNode, Quad, Term, Triple};
5use std::fs::File;
6use std::io::{BufRead, BufReader, Read};
7use std::path::Path;
8
9/// Returns true for triple-shaped formats (no graph component in the input).
10pub fn is_quad_format(f: InputFormat) -> bool {
11    matches!(f, InputFormat::Trig | InputFormat::Nq)
12}
13
14/// Open `path` for reading, transparently decompressing if it ends with `.gz`.
15pub fn open_reader(path: &Path) -> Result<Box<dyn BufRead>> {
16    let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
17    let is_gz = path
18        .file_name()
19        .and_then(|s| s.to_str())
20        .map(|s| s.to_ascii_lowercase().ends_with(".gz"))
21        .unwrap_or(false);
22
23    let raw: Box<dyn Read> = if is_gz {
24        Box::new(MultiGzDecoder::new(file))
25    } else {
26        Box::new(file)
27    };
28    Ok(Box::new(BufReader::new(raw)))
29}
30
31/// Returns true if subject or object is a blank node.
32fn has_blank_node(t: &Triple) -> bool {
33    matches!(t.subject, NamedOrBlankNode::BlankNode(_)) || matches!(t.object, Term::BlankNode(_))
34}
35
/// Summary returned after one input file has been parsed.
#[derive(Clone, Debug, Default)]
pub struct ParseOutcome {
    /// Number of statements handed to the caller's callback. `parse_triples`
    /// excludes the triples it skipped; `parse_quads` counts every statement.
    pub total: u64,
    /// Number of triples dropped by `parse_triples` because they contain a
    /// blank node. `parse_quads` always reports 0.
    pub skipped: u64,
    /// Number of statements with a blank node as subject or object, as counted
    /// by `parse_quads`. `parse_triples` always reports 0 here and uses
    /// `skipped` instead.
    pub bnode_count: u64,
    /// `(prefix, IRI)` declarations reported by the parser, in its iteration
    /// order. Empty for formats that do not carry prefixes (N-Triples,
    /// N-Quads, RDF/XML).
    pub prefixes: Vec<(String, String)>,
}
47
48/// Stream-parse `reader` according to `format`. For each triple (after dropping
49/// any graph context for quad formats), invoke `on_triple`. Triples involving a
50/// blank node are skipped and increment the skipped counter.
51///
52/// Returns a [`ParseOutcome`] containing totals and any prefix declarations
53/// seen in the source.
54pub fn parse_triples<R: BufRead, F: FnMut(Triple) -> Result<()>>(
55    reader: R,
56    format: InputFormat,
57    mut on_triple: F,
58) -> Result<ParseOutcome> {
59    let mut total: u64 = 0;
60    let mut skipped: u64 = 0;
61    let mut prefixes: Vec<(String, String)> = Vec::new();
62
63    macro_rules! handle_triple {
64        ($t:expr) => {{
65            let t: Triple = $t;
66            if has_blank_node(&t) {
67                skipped += 1;
68            } else {
69                total += 1;
70                on_triple(t)?;
71            }
72        }};
73    }
74
75    match format {
76        InputFormat::Nt => {
77            let parser = oxttl::NTriplesParser::new().for_reader(reader);
78            for tri in parser {
79                let t = tri.context("N-Triples parse error")?;
80                handle_triple!(t);
81            }
82        }
83        InputFormat::Ttl => {
84            let mut parser = oxttl::TurtleParser::new().for_reader(reader);
85            for tri in parser.by_ref() {
86                let t = tri.context("Turtle parse error")?;
87                handle_triple!(t);
88            }
89            prefixes.extend(
90                parser
91                    .prefixes()
92                    .map(|(k, v)| (k.to_string(), v.to_string())),
93            );
94        }
95        InputFormat::Rdf => {
96            let parser = oxrdfxml::RdfXmlParser::new().for_reader(reader);
97            for tri in parser {
98                let t = tri.context("RDF/XML parse error")?;
99                handle_triple!(t);
100            }
101        }
102        InputFormat::Trig => {
103            let mut parser = oxttl::TriGParser::new().for_reader(reader);
104            for q in parser.by_ref() {
105                let q = q.context("TriG parse error")?;
106                handle_triple!(Triple::new(q.subject, q.predicate, q.object));
107            }
108            prefixes.extend(
109                parser
110                    .prefixes()
111                    .map(|(k, v)| (k.to_string(), v.to_string())),
112            );
113        }
114        InputFormat::Nq => {
115            let parser = oxttl::NQuadsParser::new().for_reader(reader);
116            for q in parser {
117                let q = q.context("N-Quads parse error")?;
118                handle_triple!(Triple::new(q.subject, q.predicate, q.object));
119            }
120        }
121    }
122
123    Ok(ParseOutcome {
124        total,
125        skipped,
126        bnode_count: 0,
127        prefixes,
128    })
129}
130
131/// Stream-parse `reader` according to `format` and emit one [`Quad`] per
132/// statement. For triple-shaped formats, the graph component is set to
133/// [`GraphName::DefaultGraph`]; for quad-shaped formats, the input's named
134/// graph is preserved. Blank nodes are NOT skipped — the caller is expected to
135/// handle them via canonicalisation. Counts of statements and of those that
136/// touch a blank node are returned in the outcome.
137pub fn parse_quads<R: BufRead, F: FnMut(Quad) -> Result<()>>(
138    reader: R,
139    format: InputFormat,
140    mut on_quad: F,
141) -> Result<ParseOutcome> {
142    let mut total: u64 = 0;
143    let mut bnode_count: u64 = 0;
144    let mut prefixes: Vec<(String, String)> = Vec::new();
145
146    macro_rules! handle {
147        ($q:expr) => {{
148            let q: Quad = $q;
149            total += 1;
150            if matches!(q.subject, NamedOrBlankNode::BlankNode(_))
151                || matches!(q.object, Term::BlankNode(_))
152            {
153                bnode_count += 1;
154            }
155            on_quad(q)?;
156        }};
157    }
158
159    match format {
160        InputFormat::Nt => {
161            let parser = oxttl::NTriplesParser::new().for_reader(reader);
162            for tri in parser {
163                let t = tri.context("N-Triples parse error")?;
164                handle!(Quad {
165                    subject: t.subject,
166                    predicate: t.predicate,
167                    object: t.object,
168                    graph_name: GraphName::DefaultGraph,
169                });
170            }
171        }
172        InputFormat::Ttl => {
173            let mut parser = oxttl::TurtleParser::new().for_reader(reader);
174            for tri in parser.by_ref() {
175                let t = tri.context("Turtle parse error")?;
176                handle!(Quad {
177                    subject: t.subject,
178                    predicate: t.predicate,
179                    object: t.object,
180                    graph_name: GraphName::DefaultGraph,
181                });
182            }
183            prefixes.extend(
184                parser
185                    .prefixes()
186                    .map(|(k, v)| (k.to_string(), v.to_string())),
187            );
188        }
189        InputFormat::Rdf => {
190            let parser = oxrdfxml::RdfXmlParser::new().for_reader(reader);
191            for tri in parser {
192                let t = tri.context("RDF/XML parse error")?;
193                handle!(Quad {
194                    subject: t.subject,
195                    predicate: t.predicate,
196                    object: t.object,
197                    graph_name: GraphName::DefaultGraph,
198                });
199            }
200        }
201        InputFormat::Trig => {
202            let mut parser = oxttl::TriGParser::new().for_reader(reader);
203            for q in parser.by_ref() {
204                let q = q.context("TriG parse error")?;
205                handle!(q);
206            }
207            prefixes.extend(
208                parser
209                    .prefixes()
210                    .map(|(k, v)| (k.to_string(), v.to_string())),
211            );
212        }
213        InputFormat::Nq => {
214            let parser = oxttl::NQuadsParser::new().for_reader(reader);
215            for q in parser {
216                let q = q.context("N-Quads parse error")?;
217                handle!(q);
218            }
219        }
220    }
221
222    Ok(ParseOutcome {
223        total,
224        skipped: 0,
225        bnode_count,
226        prefixes,
227    })
228}
229
230/// Convenience: produces a triple from a quad by dropping the graph component.
231pub fn quad_to_triple(q: &Quad) -> Triple {
232    Triple {
233        subject: q.subject.clone(),
234        predicate: q.predicate.clone(),
235        object: q.object.clone(),
236    }
237}