Skip to content

Commit

Permalink
Bug 11096: support the retrieval of large MARCXML records
Browse files Browse the repository at this point in the history
This patch makes Koha <-> Zebra use MARCXML for the serialization when
using DOM, and USMARC for GRS-1.

* The following functions are modified to set the Zebra record syntax
according to the current sysprefs and configuration:

- C4::Context->Zconn
- C4::Context-_new_Zconn

* A new function 'new_record_from_zebra' is introduced, which checks the
context we are in, and creates the MARC::Record object using the right
constructor.

The following packages get touched to make use of the new function:
- C4::Search
- C4::AuthoritiesMarc

and the same happens to the UI scripts that make use of them (both in
the OPAC and STAFF interfaces).

* Calls to the unsafe ZOOM::Record->render()[1] method are removed.

Due to this last change the code for building facets was rewritten. And
for performance on the facets creation I pushed higher version
dependencies for MARC::File::XML and MARC::Record (we rely on
MARC::Field->as_string).

* Calls to MARC::Record->new_from_xml and MARC::Record->new_from_usmarc
are wrapped with eval for catching problems [2].

* As of bug 3087, UNIMARC uses the 'unimarc' record syntax. this case is
  correctly handled.
* As of bug 7818 misc/migration_tools/rebuild_zebra.pl behaves like:

- bib_index_mode (defaults to 'grs1' if not specified)
- auth_index_mode (defaults to 'dom')

here we do exactly the same.

To test:
 - prove t/db_dependent/Search.t should pass.
 - Searching should remain functional.
 - Indexing and searching for a big record should work (that's what the
   unit tests do).
 - Test an index scan search (on the staff interface):
    Search > More options > Check "Scan indexes".
 - Enable 'itemBarcodeFallbackSearch' and try to circulate any word, it
   shouldn't break.
 - Searching for a biblio in a new subscription shouldn't break.
 - Running bulkmarcimport.pl shouldn't break.
 - And so on... for the rest of the .pl files.

[1] http://search.cpan.org/~mirk/Net-Z3950-ZOOM/lib/ZOOM.pod#render()
[2] a record that cannot be parsed by MARC::Record is simply skipped (bug 10684)

Sponsored-by: Universidad Nacional de Cordoba
Signed-off-by: Kyle M Hall <[email protected]>
Signed-off-by: Martin Renvoize <[email protected]>
Signed-off-by: Galen Charlton <[email protected]>
  • Loading branch information
tomascohen authored and gmcharlt committed Feb 28, 2014
1 parent 5159a43 commit daf2ebc
Show file tree
Hide file tree
Showing 15 changed files with 196 additions and 86 deletions.
40 changes: 26 additions & 14 deletions C4/AuthoritiesMarc.pm
Original file line number Diff line number Diff line change
Expand Up @@ -266,10 +266,17 @@ sub SearchAuthorities {

##Here we have to extract MARC record and $authid from ZEBRA AUTHORITIES
my $rec=$oAResult->record($counter);
my $marcdata=$rec->raw();
my $authrecord;
my $separator=C4::Context->preference('authoritysep');
$authrecord = MARC::File::USMARC::decode($marcdata);
my $authrecord = C4::Search::new_record_from_zebra(
'authorityserver',
$rec->raw()
);

if ( !defined $authrecord or !defined $authrecord->field('001') ) {
$counter++;
next;
}

my $authid=$authrecord->field('001')->data();
my %newline;
$newline{authid} = $authid;
Expand Down Expand Up @@ -847,15 +854,18 @@ sub FindDuplicateAuthority {
my $query='at:'.$authtypecode.' ';
my $filtervalues=qr([\001-\040\!\'\"\`\#\$\%\&\*\+,\-\./:;<=>\?\@\(\)\{\[\]\}_\|\~]);
if ($record->field($auth_tag_to_report)) {
foreach ($record->field($auth_tag_to_report)->subfields()) {
$_->[1]=~s/$filtervalues/ /g; $query.= " $op he:\"".$_->[1]."\"" if ($_->[0]=~/[A-z]/);
}
foreach ($record->field($auth_tag_to_report)->subfields()) {
$_->[1]=~s/$filtervalues/ /g; $query.= " $op he:\"".$_->[1]."\"" if ($_->[0]=~/[A-z]/);
}
}
my ($error, $results, $total_hits) = C4::Search::SimpleSearch( $query, 0, 1, [ "authorityserver" ] );
# there is at least 1 result => return the 1st one
if (!defined $error && @{$results} ) {
my $marcrecord = MARC::File::USMARC::decode($results->[0]);
return $marcrecord->field('001')->data,BuildSummary($marcrecord,$marcrecord->field('001')->data,$authtypecode);
my $marcrecord = C4::Search::new_record_from_zebra(
'authorityserver',
$results->[0]
);
return $marcrecord->field('001')->data,BuildSummary($marcrecord,$marcrecord->field('001')->data,$authtypecode);
}
# no result, returns nothing
return;
Expand Down Expand Up @@ -1452,13 +1462,15 @@ sub merge {
}
my $z=0;
while ( $z<$count ) {
my $rec;
$rec=$oResult->record($z);
my $marcdata = $rec->raw();
my $marcrecordzebra= MARC::Record->new_from_usmarc($marcdata);
my $marcrecordzebra = C4::Search::new_record_from_zebra(
'biblioserver',
$oResult->record($z)->raw()
);
my ( $biblionumbertagfield, $biblionumbertagsubfield ) = &GetMarcFromKohaField( "biblio.biblionumber", '' );
my $i = ($biblionumbertagfield < 10) ? $marcrecordzebra->field($biblionumbertagfield)->data : $marcrecordzebra->subfield($biblionumbertagfield, $biblionumbertagsubfield);
my $marcrecorddb=GetMarcBiblio($i);
my $i = ($biblionumbertagfield < 10)
? $marcrecordzebra->field( $biblionumbertagfield )->data
: $marcrecordzebra->subfield( $biblionumbertagfield, $biblionumbertagsubfield );
my $marcrecorddb = GetMarcBiblio($i);
push @reccache, $marcrecorddb;
$z++;
}
Expand Down
50 changes: 27 additions & 23 deletions C4/Context.pm
Original file line number Diff line number Diff line change
Expand Up @@ -706,8 +706,6 @@ sub Zconn {
$context->{"Zconn"}->{$server}->destroy() if defined($context->{"Zconn"}->{$server});

$context->{"Zconn"}->{$server} = &_new_Zconn($server,$async,$auth,$piggyback,$syntax);
$context->{ Zconn }->{ $server }->option(
preferredRecordSyntax => C4::Context->preference("marcflavour") );
return $context->{"Zconn"}->{$server};
}
}
Expand All @@ -731,14 +729,36 @@ sub _new_Zconn {

my $tried=0; # first attempt
my $Zconn; # connection object
$server = "biblioserver" unless $server;
$syntax = "usmarc" unless $syntax;
my $elementSetName;
my $index_mode;

$server //= "biblioserver";
$syntax //= "XML";

if ( $server eq 'biblioserver' ) {
$index_mode = $context->{'config'}->{'zebra_bib_index_mode'} // 'grs1';
} elsif ( $server eq 'authorityserver' ) {
$index_mode = $context->{'config'}->{'zebra_auth_index_mode'} // 'dom';
}

if ( $index_mode eq 'grs1' ) {

$elementSetName = 'F';
$syntax = ( $context->preference("marcflavour") eq 'UNIMARC' )
? 'unimarc'
: 'usmarc';

} else {

$elementSetName = 'marcxml';
$syntax = 'XML';
}

my $host = $context->{'listen'}->{$server}->{'content'};
my $servername = $context->{"config"}->{$server};
my $user = $context->{"serverinfo"}->{$server}->{"user"};
my $password = $context->{"serverinfo"}->{$server}->{"password"};
$auth = 1 if($user && $password);
$auth = 1 if($user && $password);
retry:
eval {
# set options
Expand All @@ -750,7 +770,7 @@ sub _new_Zconn {
$o->option(cqlfile=> $context->{"server"}->{$server}->{"cql2rpn"});
$o->option(cclfile=> $context->{"serverinfo"}->{$server}->{"ccl2rpn"});
$o->option(preferredRecordSyntax => $syntax);
$o->option(elementSetName => "F"); # F for 'full' as opposed to B for 'brief'
$o->option(elementSetName => $elementSetName);
$o->option(databaseName => ($servername?$servername:"biblios"));

# create a new connection object
Expand All @@ -765,23 +785,7 @@ sub _new_Zconn {
}

};
# if ($@) {
# # Koha manages the Zebra server -- this doesn't work currently for me because of permissions issues
# # Also, I'm skeptical about whether it's the best approach
# warn "problem with Zebra";
# if ( C4::Context->preference("ManageZebra") ) {
# if ($@->code==10000 && $tried==0) { ##No connection try restarting Zebra
# $tried=1;
# warn "trying to restart Zebra";
# my $res=system("zebrasrv -f $ENV{'KOHA_CONF'} >/koha/log/zebra-error.log");
# goto "retry";
# } else {
# warn "Error ", $@->code(), ": ", $@->message(), "\n";
# $Zconn="error";
# return $Zconn;
# }
# }
# }

return $Zconn;
}

Expand Down
4 changes: 2 additions & 2 deletions C4/Installer/PerlDependencies.pm
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ our $PERL_DEPS = {
'MARC::Record' => {
'usage' => 'Core',
'required' => '1',
'min_ver' => '2'
'min_ver' => '2.0.6'
},
'Locale::Currency::Format' => {
'usage' => 'Core',
Expand All @@ -482,7 +482,7 @@ our $PERL_DEPS = {
'MARC::File::XML' => {
'usage' => 'Core',
'required' => '1',
'min_ver' => '0.88'
'min_ver' => '1.0.1'
},
'XML::SAX::Writer' => {
'usage' => 'Core',
Expand Down
2 changes: 1 addition & 1 deletion C4/Matcher.pm
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,7 @@ sub get_matches {
if ($self->{'record_type'} eq 'biblio') {
require C4::Biblio;
foreach my $marcblob (keys %matches) {
my $target_record = MARC::Record->new_from_usmarc($marcblob);
my $target_record = C4::Search::new_record_from_zebra('biblioserver',$marcblob);
my $record_number;
my $result = C4::Biblio::TransformMarcToKoha(C4::Context->dbh, $target_record, '');
$record_number = $result->{'biblionumber'};
Expand Down
121 changes: 86 additions & 35 deletions C4/Search.pm
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,11 @@ sub FindDuplicate {
my @results;
if (!defined $error) {
foreach my $possible_duplicate_record (@{$searchresults}) {
my $marcrecord =
MARC::Record->new_from_usmarc($possible_duplicate_record);
my $marcrecord = new_record_from_zebra(
'biblioserver',
$possible_duplicate_record
);

my $result = TransformMarcToKoha( $dbh, $marcrecord, '' );

# FIXME :: why 2 $biblionumber ?
Expand Down Expand Up @@ -289,10 +292,11 @@ sub SimpleSearch {
}

for my $j ( $first_record .. $last_record ) {
my $record =
my $record = eval {
$tmpresults[ $i - 1 ]->record( $j - 1 )->raw()
; # 0 indexed
push @{$results}, $record;
};
push @{$results}, $record if defined $record;
}
}
);
Expand Down Expand Up @@ -446,6 +450,7 @@ sub getRecords {
else {
$times = $size;
}

for ( my $j = $offset ; $j < $times ; $j++ ) {
my $records_hash;
my $record;
Expand Down Expand Up @@ -488,7 +493,6 @@ sub getRecords {
# not an index scan
else {
$record = $results[ $i - 1 ]->record($j)->raw();

# warn "RECORD $j:".$record;
$results_hash->{'RECORDS'}[$j] = $record;
}
Expand All @@ -503,39 +507,37 @@ sub getRecords {
$size > $facets_maxrecs ? $facets_maxrecs : $size;
for my $facet (@$facets) {
for ( my $j = 0 ; $j < $jmax ; $j++ ) {
my $render_record =
$results[ $i - 1 ]->record($j)->render();

my $marc_record = new_record_from_zebra (
'biblioserver',
$results[ $i - 1 ]->record($j)->raw()
);

if ( ! defined $marc_record ) {
warn "ERROR DECODING RECORD - $@: " .
$results[ $i - 1 ]->record($j)->raw();
next;
}

my @used_datas = ();

foreach my $tag ( @{ $facet->{tags} } ) {

# avoid first line
my $tag_num = substr( $tag, 0, 3 );
my $letters = substr( $tag, 3 );
my $field_pattern =
'\n' . $tag_num . ' ([^z][^\n]+)';
$field_pattern = '\n' . $tag_num . ' ([^\n]+)'
if ( int($tag_num) < 10 );
my @field_tokens =
( $render_record =~ /$field_pattern/g );
foreach my $field_token (@field_tokens) {
my @subf = ( $field_token =~
/\$([a-zA-Z0-9]) ([^\$]+)/g );
my @values;
for ( my $i = 0 ; $i < @subf ; $i += 2 ) {
if ( $letters =~ $subf[$i] ) {
my $value = $subf[ $i + 1 ];
$value =~ s/^ *//;
$value =~ s/ *$//;
push @values, $value;
}
}
my $data = join( $facet->{sep}, @values );
my $subfield_letters = substr( $tag, 3 );
# Removed when as_string fixed
my @subfields = $subfield_letters =~ /./sg;

my @fields = $marc_record->field($tag_num);
foreach my $field (@fields) {
my $data = $field->as_string( $subfield_letters, $facet->{sep} );

unless ( $data ~~ @used_datas ) {
$facets_counter->{ $facet->{idx} }
->{$data}++;
push @used_datas, $data;
$facets_counter->{ $facet->{idx} }->{$data}++;
}
} # fields
} # fields
} # field codes
} # records
$facets_info->{ $facet->{idx} }->{label_value} =
Expand Down Expand Up @@ -1700,16 +1702,28 @@ sub searchResults {
$times = $hits; # FIXME: if $hits is undefined, why do we want to equal it?
}

my $marcflavour = C4::Context->preference("marcflavour");
my $marcflavour = C4::Context->preference("marcflavour");
# We get the biblionumber position in MARC
my ($bibliotag,$bibliosubf)=GetMarcFromKohaField('biblio.biblionumber','');

# loop through all of the records we've retrieved
for ( my $i = $offset ; $i <= $times - 1 ; $i++ ) {
my $marcrecord = eval { MARC::File::USMARC::decode( $marcresults->[$i] ); };
if ( $@ ) {
warn "ERROR DECODING RECORD - $@: " . $marcresults->[$i];
next;

my $marcrecord;
if ($scan) {
# For Scan searches we built USMARC data
$marcrecord = MARC::Record->new_from_usmarc( $marcresults->[$i]);
} else {
# Normal search, render from Zebra's output
$marcrecord = new_record_from_zebra(
'biblioserver',
$marcresults->[$i]
);

if ( ! defined $marcrecord ) {
warn "ERROR DECODING RECORD - $@: " . $marcresults->[$i];
next;
}
}

my $fw = $scan
Expand Down Expand Up @@ -2379,6 +2393,43 @@ sub _ZOOM_event_loop {
}
}

=head2 new_record_from_zebra
Given raw data from a Zebra result set, return a MARC::Record object
This helper function is needed to take into account all the involved
system preferences and configuration variables to properly create the
MARC::Record object.
If we are using GRS-1, then the raw data we get from Zebra should be USMARC
data. If we are using DOM, then it has to be MARCXML.
=cut

sub new_record_from_zebra {

my $server = shift;
my $raw_data = shift;
# Set the default indexing modes
my $index_mode = ( $server eq 'biblioserver' )
? C4::Context->config('zebra_bib_index_mode') // 'grs1'
: C4::Context->config('zebra_auth_index_mode') // 'dom';

my $marc_record = eval {
if ( $index_mode eq 'dom' ) {
MARC::Record->new_from_xml( $raw_data, 'UTF-8' );
} else {
MARC::Record->new_from_usmarc( $raw_data );
}
};

if ($@) {
return;
} else {
return $marc_record;
}

}

END { } # module clean-up code here (global destructor)

Expand Down
2 changes: 1 addition & 1 deletion C4/XISBN.pm
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ sub _get_biblio_from_xisbn {
my ( $errors, $results, $total_hits ) = C4::Search::SimpleSearch( "nb=$xisbn", 0, 1 );
return unless ( !$errors && scalar @$results );

my $record = MARC::Record::new_from_usmarc( $results->[0] );
my $record = C4::Search::new_record_from_zebra( 'biblioserver', $results->[0] );
my $biblionumber = C4::Biblio::get_koha_field_from_marc('biblio', 'biblionumber', $record, '');
return unless $biblionumber;

Expand Down
2 changes: 1 addition & 1 deletion acqui/neworderbiblio.pl
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ =head1 CGI PARAMETERS
my @results;

foreach my $result ( @{$marcresults} ) {
my $marcrecord = MARC::File::USMARC::decode( $result );
my $marcrecord = C4::Search::new_record_from_zebra( 'biblioserver', $result );
my $biblio = TransformMarcToKoha( C4::Context->dbh, $marcrecord, '' );

$biblio->{booksellerid} = $booksellerid;
Expand Down
5 changes: 4 additions & 1 deletion catalogue/search.pl
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,10 @@ =head3 Additional Notes
elsif ($server =~/authorityserver/) { # this is the local authority server
my @inner_sup_results_array;
for my $sup_record ( @{$results_hashref->{$server}->{"RECORDS"}} ) {
my $marc_record_object = MARC::Record->new_from_usmarc($sup_record);
my $marc_record_object = C4::Search::new_record_from_zebra(
'authorityserver',
$sup_record
);
# warn "Authority Found: ".$marc_record_object->as_formatted();
push @inner_sup_results_array, {
'title' => $marc_record_object->field(100)->subfield('a'),
Expand Down
Loading

0 comments on commit daf2ebc

Please sign in to comment.