From 3ce82c136580b55f705901c22ba1ff0852e1800c Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 5 Mar 2020 14:57:24 +0100 Subject: [PATCH] Loosen expectation of XML structure when finding the pageId I encountered METS embedded in an OAI-PMH response, and while processing the result with OCR-D works somewhat, it fails to find the pageIds for every file in the METS. Example OAI-PMH with METS: https://digital.staatsbibliothek-berlin.de/oai?verb=GetRecord&metadataPrefix=mets&identifier=oai%3Adigital.staatsbibliothek-berlin.de%3APPN719671574 When saving that as mets.xml, ocrd workspace validate reports lots of errors like this one: File 'FILE_0001_PRESENTATION' does not manifest any physical page. Fix this by loosening the expectation of the XML structure when finding the pageId. (There are more XPath strings in the code that could be reviewed, I think.) --- ocrd_models/ocrd_models/ocrd_mets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 758ab8ca8..4f0706467 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -313,7 +313,7 @@ def get_physical_page_for_file(self, ocrd_file): Get the pageId for a ocrd_file """ ret = self._tree.getroot().xpath( - '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % + '//mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % ocrd_file.ID, namespaces=NS) if ret: return ret[0]