Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid duplicated body part in the abstract #486

Merged
merged 8 commits into from
Sep 12, 2019
3 changes: 2 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,8 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

TaggingLabel clusterLabel = cluster.getTaggingLabel();
String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
//String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) {
try {
List<Node> refNodes = formatter.markReferencesTEILuceneBased(
Expand Down
3 changes: 2 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

TaggingLabel clusterLabel = cluster.getTaggingLabel();
String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
//String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) {
try {
List<Node> refNodes = formatter.markReferencesTEILuceneBased(
Expand Down
131 changes: 122 additions & 9 deletions grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -162,18 +162,66 @@ public Document processing(DocumentSource documentSource,
}
}
}

// structure the abstract using the fulltext model
if ( (resHeader.getAbstract() != null) && (resHeader.getAbstract().length() > 0) ) {
List<LayoutToken> abstractTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT);
if ( (abstractTokens != null) && (abstractTokens.size()>0) ) {
if ( (abstractTokens != null) && (abstractTokens.size()>0) ) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a duplicated `if`; I would also rewrite this part directly with CollectionUtils and StringUtils to improve readability:

            // structure the abstract using the fulltext model
            if ( isNotBlank(resHeader.getAbstract())) {
                List<LayoutToken> abstractTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT);
                if (CollectionUtils.isNotEmpty(abstractTokens)) {
                    Pair<String, List<LayoutToken>> abstractProcessed = processShort(abstractTokens, doc);
                    if (abstractProcessed != null) {
                        resHeader.setLabeledAbstract(abstractProcessed.getLeft());
                        resHeader.setLayoutTokensForLabel(abstractProcessed.getRight(), TaggingLabels.HEADER_ABSTRACT);
                    }
                }
            }

Pair<String, List<LayoutToken>> abstractProcessed = processShort(abstractTokens, doc);
if (abstractProcessed != null) {
resHeader.setLabeledAbstract(abstractProcessed.getLeft());
resHeader.setLayoutTokensForLabel(abstractProcessed.getRight(), TaggingLabels.HEADER_ABSTRACT);
}
}
}
}

// structure the abstract using the fulltext model
/*if ( (resHeader.getAbstract() != null) && (resHeader.getAbstract().length() > 0) ) {
List<LayoutToken> abstractTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT);
if ( (abstractTokens != null) && (abstractTokens.size()>0) ) {
SortedSet<DocumentPiece> documentParts = new TreeSet<DocumentPiece>();
int endInd = abstractTokens.size()-1;
int posStartAbstract = getDocIndexToken(doc, abstractTokens.get(0));
int posEndAbstract = getDocIndexToken(doc, abstractTokens.get(endInd));
DocumentPointer dp1 = new DocumentPointer(doc, abstractTokens.get(0).getBlockPtr(), posStartAbstract);
DocumentPointer dp2 = new DocumentPointer(doc, abstractTokens.get(endInd).getBlockPtr(), posEndAbstract);
DocumentPiece piece = new DocumentPiece(dp1, dp2);
documentParts.add(piece);
// identify continuous sequence of layout tokens in the abstract
int posStartAbstractPiece = -1;
int currentOffsetAbstract = -1;
int startBlockPtr = -1;
LayoutToken previousAbstractToken = null;
for(LayoutToken abstractToken : abstractTokens) {
if (currentOffsetAbstract == -1) {
currentOffsetAbstract = abstractToken.getOffset();
posStartAbstractPiece = getDocIndexToken(doc, abstractToken);
startBlockPtr = abstractToken.getBlockPtr();
} else {
if (abstractToken.getOffset() != currentOffsetAbstract + previousAbstractToken.getText().length()) {
// new DocumentPiece to be added
DocumentPointer dp1 = new DocumentPointer(doc, startBlockPtr, posStartAbstractPiece);
DocumentPointer dp2 = new DocumentPointer(doc,
previousAbstractToken.getBlockPtr(),
getDocIndexToken(doc, previousAbstractToken));
DocumentPiece piece = new DocumentPiece(dp1, dp2);
documentParts.add(piece);

// set index for the next DocumentPiece
currentOffsetAbstract = abstractToken.getOffset();
posStartAbstractPiece = getDocIndexToken(doc, abstractToken);
startBlockPtr = abstractToken.getBlockPtr();
}
}
currentOffsetAbstract = abstractToken.getOffset();
previousAbstractToken = abstractToken;
}
// we still need to add the last document piece
// conditional below should always be true because abstract is not null if we reach this part, but paranoia is good when programming
if (posStartAbstractPiece != -1) {
DocumentPointer dp1 = new DocumentPointer(doc, startBlockPtr, posStartAbstractPiece);
DocumentPointer dp2 = new DocumentPointer(doc,
previousAbstractToken.getBlockPtr(),
getDocIndexToken(doc, previousAbstractToken));
DocumentPiece piece = new DocumentPiece(dp1, dp2);
documentParts.add(piece);
}

featSeg = getBodyTextFeatured(doc, documentParts);
String rese2 = null;
List<LayoutToken> tokenizationsAbstract = null;
Expand All @@ -188,7 +236,7 @@ public Document processing(DocumentSource documentSource,
resHeader.setLayoutTokensForLabel(tokenizationsAbstract, TaggingLabels.HEADER_ABSTRACT);
}
}
}
}*/

// citation processing
// consolidation, if selected, is not done individually for each citation but
Expand Down Expand Up @@ -300,8 +348,72 @@ else if (config.getConsolidateCitations() == 2)
}

/**
* Process a simple segment of layout tokens with the full text model
* Process a simple segment of layout tokens with the full text model.
* Return null if provided Layout Tokens is empty or if structuring failed.
*/
/**
 * Process a short, possibly discontinuous segment of layout tokens (e.g. the tokens of the
 * abstract) with the full text model.
 *
 * The tokens are first grouped into maximal runs of contiguous tokens (contiguity is decided
 * from character offsets), each run becoming a {@link DocumentPiece}. The pieces are then
 * featurized and labeled with the full text model.
 *
 * @param tokens the layout tokens to structure; may be null or empty
 * @param doc    the document the tokens belong to
 * @return a pair of (labeled sequence, tokenization) — the labeled sequence is null when the
 *         featurized text is blank; the whole result is null when {@code tokens} is null/empty
 *         or when no body segment could be featurized
 */
public Pair<String, List<LayoutToken>> processShortNew(List<LayoutToken> tokens, Document doc) {
    if (tokens == null || tokens.isEmpty())
        return null;

    SortedSet<DocumentPiece> documentParts = new TreeSet<>();

    // identify maximal continuous sequences of layout tokens: a new piece starts whenever the
    // current token's offset does not immediately follow the previous token's text
    int posStartPiece = -1;
    int startBlockPtr = -1;
    LayoutToken previousToken = null;
    for (LayoutToken token : tokens) {
        if (previousToken == null) {
            // first token opens the first piece
            posStartPiece = getDocIndexToken(doc, token);
            startBlockPtr = token.getBlockPtr();
        } else if (token.getOffset() != previousToken.getOffset() + previousToken.getText().length()) {
            // discontinuity: close the current piece at the previous token...
            DocumentPointer dp1 = new DocumentPointer(doc, startBlockPtr, posStartPiece);
            DocumentPointer dp2 = new DocumentPointer(doc,
                previousToken.getBlockPtr(),
                getDocIndexToken(doc, previousToken));
            documentParts.add(new DocumentPiece(dp1, dp2));

            // ...and open a new piece at the current token
            posStartPiece = getDocIndexToken(doc, token);
            startBlockPtr = token.getBlockPtr();
        }
        previousToken = token;
    }

    // close the last piece; always reached because tokens is non-empty at this point
    DocumentPointer dp1 = new DocumentPointer(doc, startBlockPtr, posStartPiece);
    DocumentPointer dp2 = new DocumentPointer(doc,
        previousToken.getBlockPtr(),
        getDocIndexToken(doc, previousToken));
    documentParts.add(new DocumentPiece(dp1, dp2));

    Pair<String, LayoutTokenization> featSeg = getBodyTextFeatured(doc, documentParts);
    if (featSeg == null) {
        // a null featSeg usually means that no body segment was found in the
        // document segmentation
        return null;
    }

    String processedText = featSeg.getLeft();
    // the returned tokenization should be identical to the provided tokens
    List<LayoutToken> tokenizations = featSeg.getRight().getTokenization();
    String res = null;
    if (isNotEmpty(trim(processedText)))
        res = label(processedText);

    return Pair.of(res, tokenizations);
}


public Pair<String, List<LayoutToken>> processShort(List<LayoutToken> tokens, Document doc) {
SortedSet<DocumentPiece> documentParts = new TreeSet<DocumentPiece>();

Expand Down Expand Up @@ -355,6 +467,7 @@ public Pair<String, List<LayoutToken>> processShort(List<LayoutToken> tokens, Do
return Pair.of(res, layoutTokenization);
}


static public Pair<String, LayoutTokenization> getBodyTextFeatured(Document doc,
SortedSet<DocumentPiece> documentBodyParts) {
if ((documentBodyParts == null) || (documentBodyParts.size() == 0)) {
Expand Down