Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid duplicated body part in the abstract #486

Merged
merged 8 commits into from
Sep 12, 2019
3 changes: 2 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,8 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

TaggingLabel clusterLabel = cluster.getTaggingLabel();
String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
//String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) {
try {
List<Node> refNodes = formatter.markReferencesTEILuceneBased(
Expand Down
3 changes: 2 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

TaggingLabel clusterLabel = cluster.getTaggingLabel();
String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
//String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) {
try {
List<Node> refNodes = formatter.markReferencesTEILuceneBased(
Expand Down
131 changes: 122 additions & 9 deletions grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -162,18 +162,66 @@ public Document processing(DocumentSource documentSource,
}
}
}

// structure the abstract using the fulltext model
if ( (resHeader.getAbstract() != null) && (resHeader.getAbstract().length() > 0) ) {
List<LayoutToken> abstractTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT);
if ( (abstractTokens != null) && (abstractTokens.size()>0) ) {
if ( (abstractTokens != null) && (abstractTokens.size()>0) ) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a duplicated `if`; I would also rewrite this part directly with CollectionUtils and StringUtils to improve readability:

            // structure the abstract using the fulltext model
            if ( isNotBlank(resHeader.getAbstract())) {
                List<LayoutToken> abstractTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT);
                if (CollectionUtils.isNotEmpty(abstractTokens)) {
                    Pair<String, List<LayoutToken>> abstractProcessed = processShort(abstractTokens, doc);
                    if (abstractProcessed != null) {
                        resHeader.setLabeledAbstract(abstractProcessed.getLeft());
                        resHeader.setLayoutTokensForLabel(abstractProcessed.getRight(), TaggingLabels.HEADER_ABSTRACT);
                    }
                }
            }

Pair<String, List<LayoutToken>> abstractProcessed = processShort(abstractTokens, doc);
if (abstractProcessed != null) {
resHeader.setLabeledAbstract(abstractProcessed.getLeft());
resHeader.setLayoutTokensForLabel(abstractProcessed.getRight(), TaggingLabels.HEADER_ABSTRACT);
}
}
}
}

// structure the abstract using the fulltext model
/*if ( (resHeader.getAbstract() != null) && (resHeader.getAbstract().length() > 0) ) {
List<LayoutToken> abstractTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT);
if ( (abstractTokens != null) && (abstractTokens.size()>0) ) {
SortedSet<DocumentPiece> documentParts = new TreeSet<DocumentPiece>();
int endInd = abstractTokens.size()-1;
int posStartAbstract = getDocIndexToken(doc, abstractTokens.get(0));
int posEndAbstract = getDocIndexToken(doc, abstractTokens.get(endInd));
DocumentPointer dp1 = new DocumentPointer(doc, abstractTokens.get(0).getBlockPtr(), posStartAbstract);
DocumentPointer dp2 = new DocumentPointer(doc, abstractTokens.get(endInd).getBlockPtr(), posEndAbstract);
DocumentPiece piece = new DocumentPiece(dp1, dp2);
documentParts.add(piece);
// identify continuous sequence of layout tokens in the abstract
int posStartAbstractPiece = -1;
int currentOffsetAbstract = -1;
int startBlockPtr = -1;
LayoutToken previousAbstractToken = null;
for(LayoutToken abstractToken : abstractTokens) {
if (currentOffsetAbstract == -1) {
currentOffsetAbstract = abstractToken.getOffset();
posStartAbstractPiece = getDocIndexToken(doc, abstractToken);
startBlockPtr = abstractToken.getBlockPtr();
} else {
if (abstractToken.getOffset() != currentOffsetAbstract + previousAbstractToken.getText().length()) {
// new DocumentPiece to be added
DocumentPointer dp1 = new DocumentPointer(doc, startBlockPtr, posStartAbstractPiece);
DocumentPointer dp2 = new DocumentPointer(doc,
previousAbstractToken.getBlockPtr(),
getDocIndexToken(doc, previousAbstractToken));
DocumentPiece piece = new DocumentPiece(dp1, dp2);
documentParts.add(piece);

// set index for the next DocumentPiece
currentOffsetAbstract = abstractToken.getOffset();
posStartAbstractPiece = getDocIndexToken(doc, abstractToken);
startBlockPtr = abstractToken.getBlockPtr();
}
}
currentOffsetAbstract = abstractToken.getOffset();
previousAbstractToken = abstractToken;
}
// we still need to add the last document piece
// conditional below should always be true because abstract is not null if we reach this part, but paranoia is good when programming
if (posStartAbstractPiece != -1) {
DocumentPointer dp1 = new DocumentPointer(doc, startBlockPtr, posStartAbstractPiece);
DocumentPointer dp2 = new DocumentPointer(doc,
previousAbstractToken.getBlockPtr(),
getDocIndexToken(doc, previousAbstractToken));
DocumentPiece piece = new DocumentPiece(dp1, dp2);
documentParts.add(piece);
}

featSeg = getBodyTextFeatured(doc, documentParts);
String rese2 = null;
List<LayoutToken> tokenizationsAbstract = null;
Expand All @@ -188,7 +236,7 @@ public Document processing(DocumentSource documentSource,
resHeader.setLayoutTokensForLabel(tokenizationsAbstract, TaggingLabels.HEADER_ABSTRACT);
}
}
}
}*/

// citation processing
// consolidation, if selected, is not done individually for each citation but
Expand Down Expand Up @@ -300,8 +348,72 @@ else if (config.getConsolidateCitations() == 2)
}

/**
* Process a simple segment of layout tokens with the full text model
* Process a simple segment of layout tokens with the full text model.
* Return null if provided Layout Tokens is empty or if structuring failed.
*/
/**
 * Process a short, possibly discontinuous segment of layout tokens (e.g. the tokens of the
 * abstract) with the full text model.
 *
 * The tokens are first grouped into maximal runs of contiguous tokens (contiguity is decided
 * from character offsets), each run becoming a {@link DocumentPiece}. The pieces are then
 * featurized and labeled with the full text model.
 *
 * @param tokens the layout tokens to structure; may be null or empty
 * @param doc    the document the tokens belong to
 * @return a pair of (labeled sequence, tokenization) — the labeled sequence is null when the
 *         featurized text is blank; the whole result is null when {@code tokens} is null/empty
 *         or when no body segment could be featurized
 */
public Pair<String, List<LayoutToken>> processShortNew(List<LayoutToken> tokens, Document doc) {
    if (tokens == null || tokens.isEmpty())
        return null;

    SortedSet<DocumentPiece> documentParts = new TreeSet<>();

    // identify maximal continuous sequences of layout tokens: a new piece starts whenever the
    // current token's offset does not immediately follow the previous token's text
    int posStartPiece = -1;
    int startBlockPtr = -1;
    LayoutToken previousToken = null;
    for (LayoutToken token : tokens) {
        if (previousToken == null) {
            // first token opens the first piece
            posStartPiece = getDocIndexToken(doc, token);
            startBlockPtr = token.getBlockPtr();
        } else if (token.getOffset() != previousToken.getOffset() + previousToken.getText().length()) {
            // discontinuity: close the current piece at the previous token...
            DocumentPointer dp1 = new DocumentPointer(doc, startBlockPtr, posStartPiece);
            DocumentPointer dp2 = new DocumentPointer(doc,
                previousToken.getBlockPtr(),
                getDocIndexToken(doc, previousToken));
            documentParts.add(new DocumentPiece(dp1, dp2));

            // ...and open a new piece at the current token
            posStartPiece = getDocIndexToken(doc, token);
            startBlockPtr = token.getBlockPtr();
        }
        previousToken = token;
    }

    // close the last piece; always reached because tokens is non-empty at this point
    DocumentPointer dp1 = new DocumentPointer(doc, startBlockPtr, posStartPiece);
    DocumentPointer dp2 = new DocumentPointer(doc,
        previousToken.getBlockPtr(),
        getDocIndexToken(doc, previousToken));
    documentParts.add(new DocumentPiece(dp1, dp2));

    Pair<String, LayoutTokenization> featSeg = getBodyTextFeatured(doc, documentParts);
    if (featSeg == null) {
        // a null featSeg usually means that no body segment was found in the
        // document segmentation
        return null;
    }

    String processedText = featSeg.getLeft();
    // the returned tokenization should be identical to the provided tokens
    List<LayoutToken> tokenizations = featSeg.getRight().getTokenization();
    String res = null;
    if (isNotEmpty(trim(processedText)))
        res = label(processedText);

    return Pair.of(res, tokenizations);
}


public Pair<String, List<LayoutToken>> processShort(List<LayoutToken> tokens, Document doc) {
SortedSet<DocumentPiece> documentParts = new TreeSet<DocumentPiece>();

Expand Down Expand Up @@ -355,6 +467,7 @@ public Pair<String, List<LayoutToken>> processShort(List<LayoutToken> tokens, Do
return Pair.of(res, layoutTokenization);
}


static public Pair<String, LayoutTokenization> getBodyTextFeatured(Document doc,
SortedSet<DocumentPiece> documentBodyParts) {
if ((documentBodyParts == null) || (documentBodyParts.size() == 0)) {
Expand Down