Skip to content

Commit

Permalink
start working on issue #86
Browse files Browse the repository at this point in the history
  • Loading branch information
kosloot committed Mar 30, 2022
1 parent c2ddc65 commit 697d0fc
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 9 deletions.
6 changes: 5 additions & 1 deletion include/ucto/tokenize.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ namespace Tokenizer {
const UnicodeString&,
TokenRole role = NOROLE,
const std::string& = "" );
Token( const UnicodeString&,
const UnicodeString&,
const std::string& = "" );
std::string lang_code; // ISO 639-3 language code
std::string texttostring();
std::string typetostring();
Expand Down Expand Up @@ -261,7 +264,7 @@ namespace Tokenizer {

// Switch XML input mode on or off.
// Returns the setting that was in effect before the call.
bool setXMLInput( bool b ) {
  const bool previous = xmlin;
  xmlin = b;
  return previous;
}
// Report whether XML input mode is currently enabled.
bool getXMLInput() const { return xmlin; }

// Enable or disable special handling of unknown ("unk") languages.
// Returns the previous value of the flag.
bool setUnkLang( bool b ){
  const bool previous = unk_language;
  unk_language = b;
  return previous;
}

// Accessor for the text class used when reading input (member 'inputclass').
const std::string getInputClass( ) const { return inputclass; }
const std::string setInputClass( const std::string& cls) {
Expand Down Expand Up @@ -373,6 +376,7 @@ namespace Tokenizer {
std::set<UnicodeString> norm_set;
TiCC::LogStream *theErrLog;

bool unk_language;
std::string default_language;
std::string document_language; // in case of an input FoLiA document
std::map<std::string,Setting*> settings;
Expand Down
46 changes: 38 additions & 8 deletions src/tokenize.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,12 @@ namespace Tokenizer {
type(_type), us(_s), role(_role), lang_code(_lang_code) {
}

// Construct a Token with an explicit type, text and language code; the
// role is fixed to NOROLE.
// NOTE(review): together with the 4-argument constructor (which defaults
// both its TokenRole and its language-code parameter), a plain 2-argument
// Token(type, word) call is ambiguous between the two overloads — the diff
// rewrites the visible call sites to pass "unk" explicitly, but verify no
// other 2-argument call sites remain.
Token::Token( const UnicodeString& _type,
const UnicodeString& _s,
const string& _lang_code ):
type(_type), us(_s), role(NOROLE), lang_code(_lang_code) {
}


// The token's text, converted to a UTF-8 encoded std::string.
std::string Token::texttostring() {
  return TiCC::UnicodeToUTF8( us );
}
// The token's type label, converted to a UTF-8 encoded std::string.
std::string Token::typetostring() {
  return TiCC::UnicodeToUTF8( type );
}
Expand Down Expand Up @@ -174,6 +180,7 @@ namespace Tokenizer {
inputEncoding( "UTF-8" ),
space_separated(true),
eosmark("<utt>"),
unk_language(false),
tokDebug(0),
verbose(false),
detectQuotes(false),
Expand Down Expand Up @@ -547,6 +554,9 @@ namespace Tokenizer {
add_provenance_passthru( doc );
}
else {
if ( unk_language ){
add_provenance_passthru( doc );
}
add_provenance_setting( doc );
}
folia::KWargs args;
Expand Down Expand Up @@ -586,7 +596,18 @@ namespace Tokenizer {
if ( tokDebug > 3 ){
LOG << "found an unsupported language: " << language << endl;
}
language = "default";
if ( unk_language ){
language = "unk";
passthru = true;
cerr << "passthrueline UNk" << endl;
passthruLine( input_line, bos );
cerr << "passthrueline Done UNk" << endl;
passthru = false;
return;
}
else {
language = "default";
}
}
}
}
Expand Down Expand Up @@ -872,7 +893,10 @@ namespace Tokenizer {
}
else {
string tc_lc = get_language( toks );
if ( tc_lc != "default" ){
if ( tc_lc == "unk" ){
tok_set = "passthru";
}
else if ( tc_lc != "default" ){
tok_set = "tokconfig-" + tc_lc;
set_language( sent, tc_lc );
}
Expand Down Expand Up @@ -1789,8 +1813,10 @@ namespace Tokenizer {
tokens.erase( tokens.begin(), tokens.begin()+end+1 );
if ( !passthru ){
string lang = get_language( outToks );
if ( !settings[lang]->quotes.emptyStack() ) {
settings[lang]->quotes.flushStack( end+1 );
if ( lang != "unk" ){
if ( !settings[lang]->quotes.emptyStack() ) {
settings[lang]->quotes.flushStack( end+1 );
}
}
}
// we are done...
Expand Down Expand Up @@ -2266,11 +2292,11 @@ namespace Tokenizer {
word = "{{" + type + "}}";
}
if (bos) {
tokens.push_back( Token( type, word , BEGINOFSENTENCE ) );
tokens.push_back( Token( type, word , BEGINOFSENTENCE, "unk" ) );
bos = false;
}
else {
tokens.push_back( Token( type, word ) );
tokens.push_back( Token( type, word, "unk" ) );
}
}
alpha = false;
Expand Down Expand Up @@ -2329,11 +2355,11 @@ namespace Tokenizer {
word = "{{" + type + "}}";
}
if (bos) {
tokens.push_back( Token( type, word , BEGINOFSENTENCE ) );
tokens.push_back( Token( type, word , BEGINOFSENTENCE, "unk" ) );
bos = false;
}
else {
tokens.push_back( Token( type, word ) );
tokens.push_back( Token( type, word, "unk" ) );
}
}
}
Expand Down Expand Up @@ -2973,6 +2999,10 @@ namespace Tokenizer {
data_version = get_data_version();
Setting *default_set = 0;
for ( const auto& lang : languages ){
if ( lang == "unk" ){
settings["unk"] = 0;
continue;
}
if ( tokDebug > 0 ){
LOG << "init language=" << lang << endl;
}
Expand Down
10 changes: 10 additions & 0 deletions src/ucto.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <cstdlib>
#include <cstring>
#include <string>
#include <algorithm>
#include <vector>
#include <map>
#include <set>
Expand Down Expand Up @@ -158,6 +159,7 @@ int main( int argc, char *argv[] ){
bool xmlout = false;
bool verbose = false;
bool docorrectwords = false;
bool do_unk_lang = false;
string redundancy = "minimal";
string eosmarker = "<utt>";
string docid = "untitleddoc";
Expand Down Expand Up @@ -418,6 +420,13 @@ int main( int argc, char *argv[] ){
return EXIT_FAILURE;
}
else {
auto it = std::find( language_list.begin(),
language_list.end(),
string("unk") );
if ( it != language_list.end() ){
language_list.erase( it );
do_unk_lang = true;
}
for ( const auto& l : language_list ){
if ( available_languages.find(l) == available_languages.end() ){
cerr << "ucto: unsupported language '" << l << "'" << endl;
Expand Down Expand Up @@ -508,6 +517,7 @@ int main( int argc, char *argv[] ){
tokenizer.setXMLInput(xmlin);
tokenizer.setTextRedundancy(redundancy);
tokenizer.setSeparators(separators); // IMPORTANT: AFTER setNormalization
tokenizer.setUnkLang( do_unk_lang );
if ( ignore_tags ){
tokenizer.setNoTags( true );
}
Expand Down

0 comments on commit 697d0fc

Please sign in to comment.