[libcatmandu-marc-perl] 25/208: Making some performance improvements

Jonas Smedegaard dr at jones.dk
Sat Oct 28 03:42:31 UTC 2017


This is an automated email from the git hooks/post-receive script.

js pushed a commit to annotated tag upstream/1.19
in repository libcatmandu-marc-perl.

commit 50c557b0cccd642f137ed9b326de12e6d14f4912
Author: Patrick Hochstenbach <patrick.hochstenbach at ugent.be>
Date:   Wed Jul 6 15:22:50 2016 +0200

    Making some performance improvements
---
 .gitignore           |   4 +-
 lib/Catmandu/MARC.pm | 165 ++++++++++++++++++++++++++++++---------------------
 t/bench.pl           |  14 +++++
 3 files changed, 115 insertions(+), 68 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1c67a5a..50df8e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,6 @@ Catmandu-MARC-*
 data
 local
 cpanfile.snapshot
-.perl-version
\ No newline at end of file
+.perl-version
+nytprof.out
+nytprof/
diff --git a/lib/Catmandu/MARC.pm b/lib/Catmandu/MARC.pm
index 3d203f8..f951e4d 100644
--- a/lib/Catmandu/MARC.pm
+++ b/lib/Catmandu/MARC.pm
@@ -3,8 +3,11 @@ package Catmandu::MARC;
 use Catmandu::Sane;
 use Catmandu::Util;
 use Catmandu::Exporter::MARC::XML;
+use Memoize;
 use Carp;
 
+memoize('_compile_marc_path');
+
 our $VERSION = '0.219';
 
 sub marc_map {
@@ -28,28 +31,29 @@ sub marc_map {
     my $vals;
 
     marc_at_field($record, $marc_path, sub {
-        my ($field, %context) = @_;
+        my ($field, $context) = @_;
         my $v;
 
         if ($value_set) {
-            for (my $i = $context{start}; $i < $context{end}; $i += 2) {
-                if ($field->[$i] =~ /$context{subfield}/) {
+            for (my $i = $context->{start}; $i < $context->{end}; $i += 2) {
+                my $subfield_regex = $context->{subfield_regex};
+                if ($field->[$i] =~ $subfield_regex) {
                     $v = $value_set;
                     last;
                 }
             }
         }
         else {
-            $v = _extract_subfields($field,\%context, pluck => $pluck);
+            $v = _extract_subfields($field,$context, pluck => $pluck);
 
             if (defined $v && @$v) {
                 if (!$split) {
                     $v = join $join_char, @$v;
                 }
 
-                if (defined(my $off = $context{from})) {
+                if (defined(my $off = $context->{from})) {
                     $v = join $join_char, @$v if (ref $v eq 'ARRAY');
-                    my $len = $context{len};
+                    my $len = $context->{len};
                     if (length(${v}) > $off) {
                         $v = substr($v, $off, $len);
                     } else {
@@ -110,8 +114,8 @@ sub _extract_subfields {
     }
     else {
         for (my $i = $context->{start}; $i < $context->{end}; $i += 2) {
-            my $subfield = $context->{subfield};
-            if ($field->[$i] =~ /^$subfield$/) {
+            my $subfield_regex = $context->{subfield_regex};
+            if ($field->[$i] =~ $subfield_regex) {
                 push(@v, $field->[$i + 1]);
             }
         }
@@ -195,13 +199,13 @@ sub marc_set {
     }
 
     marc_at_field($record, $marc_path, sub {
-        my ($field,%context) = @_;
+        my ($field,$context) = @_;
 
         my $found = 0;
-        for (my $i = $context{start}; $i < $context{end}; $i += 2) {
-            if ($field->[$i] eq $context{subfield}) {
-                if (defined $context{from}) {
-                    substr($field->[$i + 1], $context{from}, $context{len}) = $value;
+        for (my $i = $context->{start}; $i < $context->{end}; $i += 2) {
+            if ($field->[$i] eq $context->{subfield}) {
+                if (defined $context->{from}) {
+                    substr($field->[$i + 1], $context->{from}, $context->{len}) = $value;
                 }
                 else {
                     $field->[$i + 1] = $value;
@@ -211,7 +215,7 @@ sub marc_set {
         }
 
         if ($found == 0) {
-            push(@$field,$context{subfield},$value);
+            push(@$field,$context->{subfield},$value);
         }
     }, subfield_default => 1);
 
@@ -226,32 +230,36 @@ sub marc_remove {
     my $new_record;
 
     marc_at_field($record, $marc_path, sub {
-        my ($field,%context) = @_;
+        my ($field,$context) = @_;
 
-        if ($field->[0] =~ /$context{field_regex}/) {
-            if (defined $context{ind1}) {
-                return if (defined $field->[1] && $field->[1] eq $context{ind1});
+        if ($field->[0] =~ $context->{field_regex}) {
+            if (defined $context->{ind1}) {
+                return if (defined $field->[1] && $field->[1] eq $context->{ind1});
             }
 
-            if (defined $context{ind2}) {
-                return if (defined $field->[2] && $field->[2] eq $context{ind2});
+            if (defined $context->{ind2}) {
+                return if (defined $field->[2] && $field->[2] eq $context->{ind2});
             }
 
-            unless (defined $context{ind1} || defined $context{ind2} || defined $context{subfield}) {
+            unless (
+                defined $context->{ind1} ||
+                defined $context->{ind2} ||
+                defined $context->{subfield_regex} ) {
                 return;
             }
         }
 
-        if (defined $context{subfield}) {
-            if ( $field->[0] =~ /$context{field_regex}/) {
+        if (defined $context->{subfield_regex}) {
+            my $subfield_regex = $context->{subfield_regex};
+            if ( $field->[0] =~ $context->{field_regex}) {
                 my $new_subf = [];
-                for (my $i = $context{start}; $i < $context{end}; $i += 2) {
-                    unless ($field->[$i] =~ /$context{subfield}/) {
+                for (my $i = $context->{start}; $i < $context->{end}; $i += 2) {
+                    unless ($field->[$i] =~ $subfield_regex) {
                         push @$new_subf , $field->[$i];
                         push @$new_subf , $field->[$i+1];
                     }
                 }
-                splice @$field , $context{start} , int(@$field), @$new_subf;
+                splice @$field , $context->{start} , int(@$field), @$new_subf;
             }
         }
 
@@ -393,83 +401,106 @@ sub marc_decode_dollar_subfields {
     $data;
 }
 
-sub marc_at_field {
-    my ($record,$marc_path,$callback,%opts) = @_;
-
-    croak "need a marc_path and callback" unless defined($marc_path) && defined($callback);
+sub _compile_marc_path {
+    my ($marc_path,%opts) = @_;
 
-    my $field_regex;
-    my ($field,$ind1,$ind2,$subfield_regex,$from,$to,$len);
+    my ($field,$field_regex,$ind1,$ind2,
+        $subfield,$subfield_regex,$from,$to,$len,$is_regex_field);
 
-    if ($marc_path =~ /(\S{3})(\[([^,])?,?([^,])?\])?([_a-z0-9^]+)?(\/(\d+)(-(\d+))?)?/) {
+    my $MARC_PATH_REGEX = qr/(\S{3})(\[([^,])?,?([^,])?\])?([_a-z0-9^]+)?(\/(\d+)(-(\d+))?)?/;
+    if ($marc_path =~ $MARC_PATH_REGEX) {
         $field          = $1;
         $ind1           = $3;
         $ind2           = $4;
-        $subfield_regex = $5;
-        if (defined($subfield_regex)) {
-            unless ($subfield_regex =~ /^[a-zA-Z0-9]$/) {
-                $subfield_regex = "[$subfield_regex]";
+        $subfield       = $5;
+        if (defined($subfield)) {
+            unless ($subfield =~ /^[a-zA-Z0-9]$/) {
+                $subfield = "[$subfield]";
             }
         }
         elsif ($opts{subfield_default}) {
-            $subfield_regex = $field =~ /^0|LDR/ ? '_' : 'a';
+            $subfield = $field =~ /^0|LDR/ ? '_' : 'a';
         }
         elsif ($opts{subfield_wildcard}) {
-            $subfield_regex = '[a-z0-9_]';
+            $subfield = '[a-z0-9_]';
+        }
+        if (defined($subfield)) {
+            $subfield_regex = qr/^${subfield}$/;
         }
         $from           = $7;
         $to             = $9;
         $len = defined $to ? $to - $from + 1 : 1;
     }
     else {
-        confess "invalid marc path";
+        return undef;
     }
 
     $field_regex = $field;
-    $field_regex =~ s/\*/./g;
 
-    for (@$record) {
+    if ($field =~ /\*/) {
+        $field_regex    =~ s/\*/./g;
+        $is_regex_field = 1;
+    }
+
+    $field_regex = qr/^$field_regex$/;
+
+    return {
+        field           => $field ,
+        field_regex     => $field_regex ,
+        is_regex_field  => $is_regex_field ,
+        subfield        => $subfield ,
+        subfield_regex  => $subfield_regex ,
+        ind1            => $ind1 ,
+        ind2            => $ind2 ,
+        from            => $from ,
+        to              => $to ,
+        len             => $len
+    };
+}
+
+sub marc_at_field {
+    my ($record,$marc_path,$callback,%opts) = @_;
+
+    croak "need a marc_path and callback" unless defined($marc_path) && defined($callback);
+
+    my $context = _compile_marc_path($marc_path,%opts);
+
+    confess "invalid marc path" unless $context;
+
+    for my $row (@$record) {
         unless ($opts{nofilter}) {
-            if ($_->[0] !~ /$field_regex/) {
-                next;
+            if ($context->{is_regex_field}) {
+                next unless $row->[0] =~ $context->{field_regex};
+            }
+            else {
+                next unless $row->[0] eq $context->{field};
             }
 
-            if (defined $ind1) {
-                if (!defined $_->[1] || $_->[1] ne $ind1) {
+            if (defined $context->{ind1}) {
+                if (!defined $row->[1] || $row->[1] ne $context->{ind1}) {
                     next;
                 }
             }
-            if (defined $ind2) {
-                if (!defined $_->[2] || $_->[2] ne $ind2) {
+            if (defined $context->{ind2}) {
+                if (!defined $row->[2] || $row->[2] ne $context->{ind2}) {
                     next;
                 }
             }
         }
 
-        my $start;
-
-        if ($_->[0] =~ /^LDR|^00/) {
-            $start = 3;
+        if ($row->[0] =~ /^LDR|^00/) {
+            $context->{start} = 3;
         }
-        elsif (defined $_->[5] && $_->[5] eq '_') {
-            $start = 5;
+        elsif (defined $row->[5] && $row->[5] eq '_') {
+            $context->{start} = 5;
         }
         else {
-            $start = 3;
+            $context->{start} = 3;
         }
 
-        $callback->($_,
-            field        => $field ,
-            field_regex  => $field_regex ,
-            subfield     => $subfield_regex ,
-            start        => $start ,
-            end          => int(@$_) ,
-            ind1         => $ind1 ,
-            ind2         => $ind2 ,
-            from         => $from ,
-            to           => $to ,
-            len          => $len
-        );
+        $context->{end} = int(@$row);
+
+        $callback->($row,$context);
     }
 }
 
diff --git a/t/bench.pl b/t/bench.pl
new file mode 100644
index 0000000..eda3d09
--- /dev/null
+++ b/t/bench.pl
@@ -0,0 +1,14 @@
+#!/usr/bin/env perl
+
+use Catmandu;
+
+my $file     = shift;
+
+die "usage: $0 fix_file" unless $file;
+
+my $importer = Catmandu->importer('MARC', type => 'ALEPHSEQ', file => 't/rug01.aleph');
+my $fixer    = Catmandu->fixer($file);
+my $exporter = Catmandu->exporter('Null');
+
+$exporter->add_many($fixer->fix($importer));
+$exporter->commit;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libcatmandu-marc-perl.git



More information about the Pkg-perl-cvs-commits mailing list