Skip to content

Commit

Permalink
added a string linking stage to ticcl, this adds extra markup informa…
Browse files Browse the repository at this point in the history
…tion (t-str/t-correction) using the foliatextcontent tool, this is in turn needed by FLAT for proper visualisation. #62
  • Loading branch information
proycon committed Dec 7, 2020
1 parent d423fce commit e67b292
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 6 deletions.
2 changes: 1 addition & 1 deletion codemeta.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"@type": "SoftwareSourceCode",
"identifier": "piccl",
"name": "PICCL",
"version": "0.9.4",
"version": "0.9.5",
"description": "A set of workflows for corpus building through OCR, post-correction, and normalisation.",
"license": "https://spdx.org/licenses/GPL-3.0",
"url": "https://github.com/LanguageMachines/PICCL",
Expand Down
57 changes: 54 additions & 3 deletions ticcl.nf
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ if (params.containsKey('help')) {
log.info " --high INT skip entries from the anagram file longer than 'high' characters. (default=35)"
log.info " --chainclean BOOLINT enable chain clean or not (1 = on, 0 = off, default)"
log.info " --nofoliacorrect skip the FoLiA correct step"
log.info " --nostringlinking skip the final string linking step"
exit 2
}

Expand Down Expand Up @@ -508,7 +509,7 @@ if (!params.containsKey('nofoliacorrect')) {
Correct the input documents using the ranked list, produces final output documents with <str>, using FoLiA-correct
*/

publishDir params.outputdir, mode: 'copy', overwrite: true //publish the output for the end-user to see (this is the final output)
publishDir params.outputdir, mode: 'copy', overwrite: true
label "multicore"

input:
Expand All @@ -522,7 +523,7 @@ if (!params.containsKey('nofoliacorrect')) {
val virtualenv from params.virtualenv

output:
file "*.ticcl.folia.xml" into folia_ticcl_documents
file "*.foliacorrect.folia.xml" into foliacorrect_documents

script:
"""
Expand All @@ -540,13 +541,16 @@ if (!params.containsKey('nofoliacorrect')) {
FoLiA-correct --inputclass "${inputclass}" --outputclass "${outputclass}" --nums 10 -e ${extension} -O outputdir/ --unk "${unknownfreqlist}" --punct "${punctuationmap}" --rank "${rankedlist}" -t ${task.cpus} . || exit 1
cd outputdir
echo "output files:"
ls
#rename files so they have *.ticcl.folia.xml as extension (rather than .ticcl.xml which FoLiA-correct produces)
for f in *.xml; do
if [[ \$f != "*.xml" ]]; then
if [[ \${f%.ticcl.xml} != \$f ]]; then
newf="\${f%.ticcl.xml}.ticcl.folia.xml"
newf="\${f%.ticcl.xml}.foliacorrect.folia.xml" #old folia-correc
elif [[ \${f%.ticcl.folia.xml} != \$f ]]; then
newf="\${f%.ticcl.folia.xml}.foliacorrect.folia.xml" #new folia-correct
else
newf="\$f"
fi
Expand All @@ -557,6 +561,53 @@ if (!params.containsKey('nofoliacorrect')) {
"""
}

if (!params.containsKey('nostringlinking')) {
process linkstrings {
    /*
        Runs foliatextcontent -M on every document produced by the FoLiA-correct step.
        This adds text markup information (t-str and t-correction) linking to the substrings;
        it adds a level of redundancy that is needed for proper visualisation in FLAT.

        Input:  *.foliacorrect.folia.xml documents (channel foliacorrect_documents)
        Output: *.ticcl.folia.xml documents (channel folia_ticcl_documents), published to params.outputdir
    */

    publishDir params.outputdir, mode: 'copy', overwrite: true //publish the output for the end-user to see (this is the final output)

    input:
    file foliadoc from foliacorrect_documents
    val virtualenv from params.virtualenv

    output:
    file "*.ticcl.folia.xml" into folia_ticcl_documents

    script:
    //Strip only the known suffix added by the previous step. Using simpleName here would cut the
    //name at the FIRST dot, so "book.vol1.foliacorrect.folia.xml" would become "book.ticcl.folia.xml"
    //(losing part of the document name and risking collisions between documents sharing a prefix).
    def docbasename = foliadoc.name.replaceAll(/\.foliacorrect\.folia\.xml$/, '')
    """
    #!/bin/bash
    #activate the Python virtual environment, if one was configured (set +u because activate scripts may reference unset variables)
    set +u
    if [ ! -z "${virtualenv}" ]; then
        source ${virtualenv}/bin/activate
    fi
    set -u
    foliatextcontent -M ${foliadoc} > ${docbasename}.ticcl.folia.xml || exit 1
    """
}

} else {
process nolinkstrings {
    /*
        Simple file rename step, used when string linking is skipped (--nostringlinking):
        copies each FoLiA-correct output document to its final *.ticcl.folia.xml name without
        adding any extra markup.

        Input:  *.foliacorrect.folia.xml documents (channel foliacorrect_documents)
        Output: *.ticcl.folia.xml documents (channel folia_ticcl_documents), published to params.outputdir
    */

    publishDir params.outputdir, mode: 'copy', overwrite: true //publish the output for the end-user to see (this is the final output)

    input:
    file foliadoc from foliacorrect_documents

    output:
    file "*.ticcl.folia.xml" into folia_ticcl_documents

    script:
    //Strip only the known suffix added by the previous step. Using simpleName here would cut the
    //name at the FIRST dot, so "book.vol1.foliacorrect.folia.xml" would become "book.ticcl.folia.xml"
    //(losing part of the document name and risking collisions between documents sharing a prefix).
    def docbasename = foliadoc.name.replaceAll(/\.foliacorrect\.folia\.xml$/, '')
    """
    cp ${foliadoc} ${docbasename}.ticcl.folia.xml || exit 1
    """
}
}

//report each final document to stdout, giving its location in the output directory
folia_ticcl_documents.subscribe { finaldoc ->
    println "TICCL output document written to " + params.outputdir + "/" + finaldoc.name
}
}
2 changes: 1 addition & 1 deletion webservice/picclservice/picclservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
#An informative description for this system (this should be fairly short, about one paragraph, and may not contain HTML)
SYSTEM_DESCRIPTION = "PICCL offers a workflow for corpus building and builds on a variety of tools. The primary component of PICCL is TICCL; a Text-induced Corpus Clean-up system, which performs spelling correction and OCR post-correction (normalisation of spelling variants etc)."

SYSTEM_VERSION = "0.9.4" #also change in codemeta.json and setup.py
SYSTEM_VERSION = "0.9.5" #also change in codemeta.json and setup.py

SYSTEM_AUTHOR = "Martin Reynaert, Maarten van Gompel, Ko van der Sloot"

Expand Down
2 changes: 1 addition & 1 deletion webservice/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name = "PICCL",
version = "0.9.4", #also change in codemeta.json and picclservice.py
version = "0.9.5", #also change in codemeta.json and picclservice.py
author = "Martin Reynaert, Maarten van Gompel",
author_email = "[email protected]",
description = ("Webservice for PICCL; a set of workflows for corpus building through OCR, post-correction, modernization of historic language and Natural Language Processing"),
Expand Down

0 comments on commit e67b292

Please sign in to comment.