Bug 11096: support the retrieval of large MARCXML records

This patch makes Koha <-> Zebra use MARCXML for the serialization when using DOM, and USMARC for GRS-1. * The following functions are modified to set the Zebra record syntax according to the current sysprefs and configuration: - C4::Context->Zconn - C4::Context-_new_Zconn * A new function 'new_record_from_zebra' is introduced, which checks the context we are in, and creates the MARC::Record object using the right constructor. The following packages get touched to make use of the new function: - C4::Search - C4::AuthoritiesMarc and the same happens to the UI scripts that make use of them (both in the OPAC and STAFF interfaces). * Calls to the unsafe ZOOM::Record->render()[1] method are removed. Due to this last change the code for building facets was rewritten. And for performance on the facets creation I pushed higher version dependencies for MARC::File::XML and MARC::Record (we rely on MARC::Field->as_string). * Calls to MARC::Record->new_from_xml and MARC::Record->new_from_usmarc are wrapped with eval for catching problems [2]. * As of bug 3087, UNIMARC uses the 'unimarc' record syntax. this case is correctly handled. * As of bug 7818 misc/migration_tools/rebuild_zebra.pl behaves like: - bib_index_mode (defaults to 'grs1' if not specified) - auth_index_mode (defaults to 'dom') here we do exactly the same. To test: - prove t/db_dependent/Search.t should pass. - Searching should remain functional. - Indexing and searching for a big record should work (that's what the unit tests do). - Test an index scan search (on the staff interface): Search > More options > Check "Scan indexes". - Enable 'itemBarcodeFallbackSearch' and try to circulate any word, it shouldn't break. - Searching for a biblio in a new subscription shouldn't break. - Running bulkmarcimport.pl shouldn't break. - And so on... for the rest of the .pl files. [1] http://search.cpan.org/~mirk/Net-Z3950-ZOOM/lib/ZOOM.pod#render() [2] a record that cannot be parsed by MARC::Record is simply skipped (bug 10684) Sponsored-by: Universidad Nacional de Cordoba Signed-off-by: Kyle M Hall <[email protected]> Signed-off-by: Martin Renvoize <[email protected]> Signed-off-by: Galen Charlton <[email protected]>
PTFS-Europe · Feb 28, 2014 · daf2ebc · daf2ebc
1 parent 5159a43
commit daf2ebc
Show file tree

Hide file tree

Showing 15 changed files with 196 additions and 86 deletions.
diff --git a/C4/AuthoritiesMarc.pm b/C4/AuthoritiesMarc.pm
@@ -266,10 +266,17 @@ sub SearchAuthorities {
 
         ##Here we have to extract MARC record and $authid from ZEBRA AUTHORITIES
         my $rec=$oAResult->record($counter);
-        my $marcdata=$rec->raw();
-        my $authrecord;
         my $separator=C4::Context->preference('authoritysep');
-        $authrecord = MARC::File::USMARC::decode($marcdata);
+        my $authrecord = C4::Search::new_record_from_zebra(
+            'authorityserver',
+            $rec->raw()
+        );
+
+        if ( !defined $authrecord or !defined $authrecord->field('001') ) {
+            $counter++;
+            next;
+        }
+
         my $authid=$authrecord->field('001')->data();
         my %newline;
         $newline{authid} = $authid;
@@ -847,15 +854,18 @@ sub FindDuplicateAuthority {
     my $query='at:'.$authtypecode.' ';
     my $filtervalues=qr([\001-\040\!\'\"\`\#\$\%\&\*\+,\-\./:;<=>\?\@\(\)\{\[\]\}_\|\~]);
     if ($record->field($auth_tag_to_report)) {
-      foreach ($record->field($auth_tag_to_report)->subfields()) {
-        $_->[1]=~s/$filtervalues/ /g; $query.= " $op he:\"".$_->[1]."\"" if ($_->[0]=~/[A-z]/);
-      }
+        foreach ($record->field($auth_tag_to_report)->subfields()) {
+            $_->[1]=~s/$filtervalues/ /g; $query.= " $op he:\"".$_->[1]."\"" if ($_->[0]=~/[A-z]/);
+        }
     }
     my ($error, $results, $total_hits) = C4::Search::SimpleSearch( $query, 0, 1, [ "authorityserver" ] );
     # there is at least 1 result => return the 1st one
     if (!defined $error && @{$results} ) {
-      my $marcrecord = MARC::File::USMARC::decode($results->[0]);
-      return $marcrecord->field('001')->data,BuildSummary($marcrecord,$marcrecord->field('001')->data,$authtypecode);
+        my $marcrecord = C4::Search::new_record_from_zebra(
+            'authorityserver',
+            $results->[0]
+        );
+        return $marcrecord->field('001')->data,BuildSummary($marcrecord,$marcrecord->field('001')->data,$authtypecode);
     }
     # no result, returns nothing
     return;
@@ -1452,13 +1462,15 @@ sub merge {
     }
     my $z=0;
     while ( $z<$count ) {
-        my $rec;
-        $rec=$oResult->record($z);
-        my $marcdata = $rec->raw();
-        my $marcrecordzebra= MARC::Record->new_from_usmarc($marcdata);
+        my $marcrecordzebra = C4::Search::new_record_from_zebra(
+            'biblioserver',
+            $oResult->record($z)->raw()
+        );
         my ( $biblionumbertagfield, $biblionumbertagsubfield ) = &GetMarcFromKohaField( "biblio.biblionumber", '' );
-        my $i = ($biblionumbertagfield < 10) ? $marcrecordzebra->field($biblionumbertagfield)->data : $marcrecordzebra->subfield($biblionumbertagfield, $biblionumbertagsubfield);
-        my $marcrecorddb=GetMarcBiblio($i);
+        my $i = ($biblionumbertagfield < 10)
+            ? $marcrecordzebra->field( $biblionumbertagfield )->data
+            : $marcrecordzebra->subfield( $biblionumbertagfield, $biblionumbertagsubfield );
+        my $marcrecorddb = GetMarcBiblio($i);
         push @reccache, $marcrecorddb;
         $z++;
     }

diff --git a/C4/Context.pm b/C4/Context.pm
@@ -706,8 +706,6 @@ sub Zconn {
         $context->{"Zconn"}->{$server}->destroy() if defined($context->{"Zconn"}->{$server});
 
         $context->{"Zconn"}->{$server} = &_new_Zconn($server,$async,$auth,$piggyback,$syntax);
-        $context->{ Zconn }->{ $server }->option(
-            preferredRecordSyntax => C4::Context->preference("marcflavour") );
         return $context->{"Zconn"}->{$server};
     }
 }
@@ -731,14 +729,36 @@ sub _new_Zconn {
 
     my $tried=0; # first attempt
     my $Zconn; # connection object
-    $server = "biblioserver" unless $server;
-    $syntax = "usmarc" unless $syntax;
+    my $elementSetName;
+    my $index_mode;
+
+    $server //= "biblioserver";
+    $syntax //= "XML";
+
+    if ( $server eq 'biblioserver' ) {
+        $index_mode = $context->{'config'}->{'zebra_bib_index_mode'} // 'grs1';
+    } elsif ( $server eq 'authorityserver' ) {
+        $index_mode = $context->{'config'}->{'zebra_auth_index_mode'} // 'dom';
+    }
+
+    if ( $index_mode eq 'grs1' ) {
+
+        $elementSetName = 'F';
+        $syntax = ( $context->preference("marcflavour") eq 'UNIMARC' )
+                ? 'unimarc'
+                : 'usmarc';
+
+    } else {
+
+        $elementSetName = 'marcxml';
+        $syntax = 'XML';
+    }
 
     my $host = $context->{'listen'}->{$server}->{'content'};
     my $servername = $context->{"config"}->{$server};
     my $user = $context->{"serverinfo"}->{$server}->{"user"};
     my $password = $context->{"serverinfo"}->{$server}->{"password"};
- $auth = 1 if($user && $password);   
+    $auth = 1 if($user && $password);
     retry:
     eval {
         # set options
@@ -750,7 +770,7 @@ sub _new_Zconn {
         $o->option(cqlfile=> $context->{"server"}->{$server}->{"cql2rpn"});
         $o->option(cclfile=> $context->{"serverinfo"}->{$server}->{"ccl2rpn"});
         $o->option(preferredRecordSyntax => $syntax);
-        $o->option(elementSetName => "F"); # F for 'full' as opposed to B for 'brief'
+        $o->option(elementSetName => $elementSetName);
         $o->option(databaseName => ($servername?$servername:"biblios"));
 
         # create a new connection object
@@ -765,23 +785,7 @@ sub _new_Zconn {
         }
 
     };
-#     if ($@) {
-#         # Koha manages the Zebra server -- this doesn't work currently for me because of permissions issues
-#         # Also, I'm skeptical about whether it's the best approach
-#         warn "problem with Zebra";
-#         if ( C4::Context->preference("ManageZebra") ) {
-#             if ($@->code==10000 && $tried==0) { ##No connection try restarting Zebra
-#                 $tried=1;
-#                 warn "trying to restart Zebra";
-#                 my $res=system("zebrasrv -f $ENV{'KOHA_CONF'} >/koha/log/zebra-error.log");
-#                 goto "retry";
-#             } else {
-#                 warn "Error ", $@->code(), ": ", $@->message(), "\n";
-#                 $Zconn="error";
-#                 return $Zconn;
-#             }
-#         }
-#     }
+
     return $Zconn;
 }
 

diff --git a/C4/Installer/PerlDependencies.pm b/C4/Installer/PerlDependencies.pm
@@ -457,7 +457,7 @@ our $PERL_DEPS = {
     'MARC::Record' => {
         'usage'    => 'Core',
         'required' => '1',
-        'min_ver'  => '2'
+        'min_ver'  => '2.0.6'
     },
     'Locale::Currency::Format' => {
         'usage'    => 'Core',
@@ -482,7 +482,7 @@ our $PERL_DEPS = {
     'MARC::File::XML' => {
         'usage'    => 'Core',
         'required' => '1',
-        'min_ver'  => '0.88'
+        'min_ver'  => '1.0.1'
     },
     'XML::SAX::Writer' => {
         'usage'    => 'Core',

diff --git a/C4/Matcher.pm b/C4/Matcher.pm
@@ -696,7 +696,7 @@ sub get_matches {
     if ($self->{'record_type'} eq 'biblio') {
         require C4::Biblio;
         foreach my $marcblob (keys %matches) {
-            my $target_record = MARC::Record->new_from_usmarc($marcblob);
+            my $target_record = C4::Search::new_record_from_zebra('biblioserver',$marcblob);
             my $record_number;
             my $result = C4::Biblio::TransformMarcToKoha(C4::Context->dbh, $target_record, '');
             $record_number = $result->{'biblionumber'};

diff --git a/C4/Search.pm b/C4/Search.pm
@@ -144,8 +144,11 @@ sub FindDuplicate {
     my @results;
     if (!defined $error) {
         foreach my $possible_duplicate_record (@{$searchresults}) {
-            my $marcrecord =
-            MARC::Record->new_from_usmarc($possible_duplicate_record);
+            my $marcrecord = new_record_from_zebra(
+                'biblioserver',
+                $possible_duplicate_record
+            );
+
             my $result = TransformMarcToKoha( $dbh, $marcrecord, '' );
 
             # FIXME :: why 2 $biblionumber ?
@@ -289,10 +292,11 @@ sub SimpleSearch {
             }
 
             for my $j ( $first_record .. $last_record ) {
-                my $record =
+                my $record = eval {
                   $tmpresults[ $i - 1 ]->record( $j - 1 )->raw()
                   ;    # 0 indexed
-                push @{$results}, $record;
+                };
+                push @{$results}, $record if defined $record;
             }
         }
     );
@@ -446,6 +450,7 @@ sub getRecords {
                 else {
                     $times = $size;
                 }
+
                 for ( my $j = $offset ; $j < $times ; $j++ ) {
                     my $records_hash;
                     my $record;
@@ -488,7 +493,6 @@ sub getRecords {
                     # not an index scan
                     else {
                         $record = $results[ $i - 1 ]->record($j)->raw();
-
                         # warn "RECORD $j:".$record;
                         $results_hash->{'RECORDS'}[$j] = $record;
                     }
@@ -503,39 +507,37 @@ sub getRecords {
                       $size > $facets_maxrecs ? $facets_maxrecs : $size;
                     for my $facet (@$facets) {
                         for ( my $j = 0 ; $j < $jmax ; $j++ ) {
-                            my $render_record =
-                              $results[ $i - 1 ]->record($j)->render();
+
+                            my $marc_record = new_record_from_zebra (
+                                    'biblioserver',
+                                    $results[ $i - 1 ]->record($j)->raw()
+                            );
+
+                            if ( ! defined $marc_record ) {
+                                warn "ERROR DECODING RECORD - $@: " .
+                                    $results[ $i - 1 ]->record($j)->raw();
+                                next;
+                            }
+
                             my @used_datas = ();
+
                             foreach my $tag ( @{ $facet->{tags} } ) {
 
                                 # avoid first line
                                 my $tag_num = substr( $tag, 0, 3 );
-                                my $letters = substr( $tag, 3 );
-                                my $field_pattern =
-                                  '\n' . $tag_num . ' ([^z][^\n]+)';
-                                $field_pattern = '\n' . $tag_num . ' ([^\n]+)'
-                                  if ( int($tag_num) < 10 );
-                                my @field_tokens =
-                                  ( $render_record =~ /$field_pattern/g );
-                                foreach my $field_token (@field_tokens) {
-                                    my @subf = ( $field_token =~
-                                          /\$([a-zA-Z0-9]) ([^\$]+)/g );
-                                    my @values;
-                                    for ( my $i = 0 ; $i < @subf ; $i += 2 ) {
-                                        if ( $letters =~ $subf[$i] ) {
-                                            my $value = $subf[ $i + 1 ];
-                                            $value =~ s/^ *//;
-                                            $value =~ s/ *$//;
-                                            push @values, $value;
-                                        }
-                                    }
-                                    my $data = join( $facet->{sep}, @values );
+                                my $subfield_letters = substr( $tag, 3 );
+                                # Removed when as_string fixed
+                                my @subfields = $subfield_letters =~ /./sg;
+
+                                my @fields = $marc_record->field($tag_num);
+                                foreach my $field (@fields) {
+                                    my $data = $field->as_string( $subfield_letters, $facet->{sep} );
+
                                     unless ( $data ~~ @used_datas ) {
-                                        $facets_counter->{ $facet->{idx} }
-                                          ->{$data}++;
                                         push @used_datas, $data;
+                                        $facets_counter->{ $facet->{idx} }->{$data}++;
                                     }
-                                }    # fields
+                                } # fields
                             }    # field codes
                         }    # records
                         $facets_info->{ $facet->{idx} }->{label_value} =
@@ -1700,16 +1702,28 @@ sub searchResults {
         $times = $hits;	 # FIXME: if $hits is undefined, why do we want to equal it?
     }
 
-	my $marcflavour = C4::Context->preference("marcflavour");
+    my $marcflavour = C4::Context->preference("marcflavour");
     # We get the biblionumber position in MARC
     my ($bibliotag,$bibliosubf)=GetMarcFromKohaField('biblio.biblionumber','');
 
     # loop through all of the records we've retrieved
     for ( my $i = $offset ; $i <= $times - 1 ; $i++ ) {
-        my $marcrecord = eval { MARC::File::USMARC::decode( $marcresults->[$i] ); };
-        if ( $@ ) {
-            warn "ERROR DECODING RECORD - $@: " . $marcresults->[$i];
-            next;
+
+        my $marcrecord;
+        if ($scan) {
+            # For Scan searches we built USMARC data
+            $marcrecord = MARC::Record->new_from_usmarc( $marcresults->[$i]);
+        } else {
+            # Normal search, render from Zebra's output
+            $marcrecord = new_record_from_zebra(
+                'biblioserver',
+                $marcresults->[$i]
+            );
+
+            if ( ! defined $marcrecord ) {
+                warn "ERROR DECODING RECORD - $@: " . $marcresults->[$i];
+                next;
+            }
         }
 
         my $fw = $scan
@@ -2379,6 +2393,43 @@ sub _ZOOM_event_loop {
     }
 }
 
+=head2 new_record_from_zebra
+
+Given raw data from a Zebra result set, return a MARC::Record object
+
+This helper function is needed to take into account all the involved
+system preferences and configuration variables to properly create the
+MARC::Record object.
+
+If we are using GRS-1, then the raw data we get from Zebra should be USMARC
+data. If we are using DOM, then it has to be MARCXML.
+
+=cut
+
+sub new_record_from_zebra {
+
+    my $server   = shift;
+    my $raw_data = shift;
+    # Set the default indexing modes
+    my $index_mode = ( $server eq 'biblioserver' )
+                        ? C4::Context->config('zebra_bib_index_mode') // 'grs1'
+                        : C4::Context->config('zebra_auth_index_mode') // 'dom';
+
+    my $marc_record =  eval {
+        if ( $index_mode eq 'dom' ) {
+            MARC::Record->new_from_xml( $raw_data, 'UTF-8' );
+        } else {
+            MARC::Record->new_from_usmarc( $raw_data );
+        }
+    };
+
+    if ($@) {
+        return;
+    } else {
+        return $marc_record;
+    }
+
+}
 
 END { }    # module clean-up code here (global destructor)
 

diff --git a/C4/XISBN.pm b/C4/XISBN.pm
@@ -67,7 +67,7 @@ sub _get_biblio_from_xisbn {
     my ( $errors, $results, $total_hits ) = C4::Search::SimpleSearch( "nb=$xisbn", 0, 1 );
     return unless ( !$errors && scalar @$results );
 
-    my $record = MARC::Record::new_from_usmarc( $results->[0] );
+    my $record = C4::Search::new_record_from_zebra( 'biblioserver', $results->[0] );
     my $biblionumber = C4::Biblio::get_koha_field_from_marc('biblio', 'biblionumber', $record, '');
     return unless $biblionumber;
 

diff --git a/acqui/neworderbiblio.pl b/acqui/neworderbiblio.pl
@@ -111,7 +111,7 @@ =head1 CGI PARAMETERS
 my @results;
 
 foreach my $result ( @{$marcresults} ) {
-    my $marcrecord = MARC::File::USMARC::decode( $result );
+    my $marcrecord = C4::Search::new_record_from_zebra( 'biblioserver', $result );
     my $biblio = TransformMarcToKoha( C4::Context->dbh, $marcrecord, '' );
 
     $biblio->{booksellerid} = $booksellerid;

diff --git a/catalogue/search.pl b/catalogue/search.pl
@@ -660,7 +660,10 @@ =head3 Additional Notes
     elsif ($server =~/authorityserver/) { # this is the local authority server
         my @inner_sup_results_array;
         for my $sup_record ( @{$results_hashref->{$server}->{"RECORDS"}} ) {
-            my $marc_record_object = MARC::Record->new_from_usmarc($sup_record);
+            my $marc_record_object = C4::Search::new_record_from_zebra(
+                'authorityserver',
+                $sup_record
+            );
             # warn "Authority Found: ".$marc_record_object->as_formatted();
             push @inner_sup_results_array, {
                 'title' => $marc_record_object->field(100)->subfield('a'),