[libcatmandu-rdf-perl] 15/20: Adding a --speed option

Jonas Smedegaard dr at jones.dk
Sat Oct 28 03:10:22 UTC 2017


This is an automated email from the git hooks/post-receive script.

js pushed a commit to annotated tag upstream/0.32
in repository libcatmandu-rdf-perl.

commit 4b545ed422b000d4af359eb9ea8b25ff0a70a462
Author: Patrick Hochstenbach <patrick.hochstenbach at ugent.be>
Date:   Sat Jul 29 15:58:21 2017 +0200

    Adding a --speed option
---
 Changes                      |  2 ++
 lib/Catmandu/Importer/RDF.pm | 29 +++++++++++++++++++++++------
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/Changes b/Changes
index 5e0b03f..a05937c 100644
--- a/Changes
+++ b/Changes
@@ -2,6 +2,8 @@ Changelog for Catmandu-RDF
 
 {{$NEXT}}
   - Fixing SPARQL examples with latest RDF::LDF
+  - Better support for streaming RDF input
+  - Adding a --speed option to the Catmandu::Importer::RDF
 
 0.31  2016-04-13 10:24:55 CEST
   - Fix test failure caused by RDF::NS (#29)
diff --git a/lib/Catmandu/Importer/RDF.pm b/lib/Catmandu/Importer/RDF.pm
index faad9d6..f2140e4 100644
--- a/lib/Catmandu/Importer/RDF.pm
+++ b/lib/Catmandu/Importer/RDF.pm
@@ -88,6 +88,10 @@ has cache_options => (
     } }
 );
 
+has speed => (
+    is      => 'ro',
+);
+
 sub BUILD {
     my ($self) = @_;
 
@@ -279,7 +283,9 @@ sub _hashref_stream {
                ? RDF::Trine::Parser->new( $self->type ) : 'RDF::Trine::Parser';
 
     my $handler = sub {
-        my $triple = shift;
+        my $triple    = shift;
+        state $start  = time;
+        state $count  = 0;
 
         my $subject   = $triple->subject->is_blank ?
                             '_:' . $triple->subject->blank_identifier :
@@ -306,6 +312,12 @@ sub _hashref_stream {
         $hashref->{$subject}->{$predicate}->[0]->{value}    = $value;
 
         print $pipe encode_json($hashref) , "\n";
+
+        $count++;
+
+        if ($self->speed && ($count % 100 == 0) && (my $elapsed = time - $start) ) {
+          printf STDERR "triples %9d (%d/sec)\n" , $count , $count/$elapsed;
+        }
     };
 
     if ($self->url) {
@@ -345,15 +357,15 @@ Command line client C<catmandu>:
 
   catmandu convert RDF --url http://d-nb.info/gnd/4151473-7 to YAML
 
-  catmandu convert RDF --type ttl --file rdfdump.ttl to JSON
+  catmandu convert RDF --file rdfdump.ttl to JSON
 
-  # For big input files it will be faster not to build a big hash in memory
-  # bit to return each triple fragment
-  catmandu convert RDF --type ttl --triples 1 --file rdfdump.ttl to JSON
+  # Parse the input into one JSON document per triple. This is the
+  # most memory efficient (and fastest) way to parse RDF input.
+  catmandu convert RDF --triples 1 --file rdfdump.ttl to JSON
 
   # Transform back into NTriples (conversions to and from triples is the
   # most efficient way to process RDF)
-  catmandu convert RDF --type ttl --triples 1 --file rdfdump.ttl to RDF --type NTriples
+  catmandu convert RDF --triples 1 --file rdfdump.ttl to RDF --type NTriples
 
   # Query a SPARQL endpoint
   catmandu convert RDF --url http://dbpedia.org/sparql
@@ -452,6 +464,11 @@ Provide the L<CHI> based options for caching result sets. By default a memory st
             max_size => 1024*1024
         });
 
+=item speed
+
+If set to a true value, report the RDF processing speed on STDERR as the
+number of triples parsed per second.
+
 =back
 
 =head1 METHODS

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libcatmandu-rdf-perl.git



More information about the Pkg-perl-cvs-commits mailing list