Add support for Ukrainian language
I have to admit this may be pretty lame because
I don't know anything about the Ukrainian language.
I hope that if someone really uses it they will provide
kind feedback on how to improve it.
miso-belica committed Mar 11, 2022
1 parent 1b998d9 commit 2dedd0c
Showing 5 changed files with 155 additions and 10 deletions.
19 changes: 10 additions & 9 deletions README.md
@@ -2,7 +2,7 @@


[![image](https://github.com/miso-belica/sumy/actions/workflows/run-tests.yml/badge.svg)](https://github.com/miso-belica/sumy/actions/workflows/run-tests.yml)
-[![Gitpod Ready-to-Code](https://img.shields.io/badge/Gitpod-Ready--to--Code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/miso-belica/sumy)
+[![GitPod Ready-to-Code](https://img.shields.io/badge/Gitpod-Ready--to--Code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/miso-belica/sumy)

Simple library and command line utility for extracting summary from HTML
pages or plain texts. The package also contains simple evaluation
@@ -29,26 +29,27 @@ $ [sudo] pip install git+git://github.com/miso-belica/sumy.git # for the fresh
Sumy contains command line utility for quick summarization of documents.

```sh
-$ sumy lex-rank --length=10 --url=http://en.wikipedia.org/wiki/Automatic_summarization # what's summarization?
-$ sumy luhn --language=czech --url=http://www.zdrojak.cz/clanky/automaticke-zabezpeceni/
-$ sumy edmundson --language=czech --length=3% --url=http://cs.wikipedia.org/wiki/Bitva_u_Lipan
+$ sumy lex-rank --length=10 --url=https://en.wikipedia.org/wiki/Automatic_summarization # what's summarization?
+$ sumy lex-rank --language=uk --length=30 --url=https://uk.wikipedia.org/wiki/Україна
+$ sumy luhn --language=czech --url=https://www.zdrojak.cz/clanky/automaticke-zabezpeceni/
+$ sumy edmundson --language=czech --length=3% --url=https://cs.wikipedia.org/wiki/Bitva_u_Lipan
$ sumy --help # for more info
```

Various evaluation methods for some summarization method can be executed
by commands below:

```sh
-$ sumy_eval lex-rank reference_summary.txt --url=http://en.wikipedia.org/wiki/Automatic_summarization
-$ sumy_eval lsa reference_summary.txt --language=czech --url=http://www.zdrojak.cz/clanky/automaticke-zabezpeceni/
-$ sumy_eval edmundson reference_summary.txt --language=czech --url=http://cs.wikipedia.org/wiki/Bitva_u_Lipan
+$ sumy_eval lex-rank reference_summary.txt --url=https://en.wikipedia.org/wiki/Automatic_summarization
+$ sumy_eval lsa reference_summary.txt --language=czech --url=https://www.zdrojak.cz/clanky/automaticke-zabezpeceni/
+$ sumy_eval edmundson reference_summary.txt --language=czech --url=https://cs.wikipedia.org/wiki/Bitva_u_Lipan
$ sumy_eval --help # for more info
```

If you don't want to bother by the installation, you can try it as a container.

```sh
-$ docker run --rm misobelica/sumy lex-rank --length=10 --url=http://en.wikipedia.org/wiki/Automatic_summarization
+$ docker run --rm misobelica/sumy lex-rank --length=10 --url=https://en.wikipedia.org/wiki/Automatic_summarization
```

## Python API
@@ -90,7 +91,7 @@ if __name__ == "__main__":

## Interesting projects using sumy

-I found some interesting projects while browsing the interner or sometimes people wrote me an e-mail with questions and I was curious how they use the sumy :)
+I found some interesting projects while browsing the internet or sometimes people wrote me an e-mail with questions, and I was curious how they use the sumy :)

* [Learning to generate questions from text](https://software.intel.com/en-us/articles/using-natural-language-processing-for-smart-question-generation) - https://github.com/adityasarvaiya/Automatic_Question_Generation
* Summarize your video to any duration - https://github.com/aswanthkoleri/VideoMash and similar https://github.com/OpenGenus/vidsum
77 changes: 77 additions & 0 deletions sumy/data/stopwords/ukrainian.txt
@@ -0,0 +1,77 @@
Із
Інших
авжеж
адже
але
б
без
був
була
були
було
бути
більш
вІн
вам
вас
весь
вздовж
ви
вниз
внизу
вона
вони
воно
все
всередині
всіх
від
він
да
давай
давати
де
дещо
для
до
з
завжди
замість
й
коли
ледве
майже
ми
навколо
навіть
нам
от
отже
отож
поза
про
під
та
так
такий
також
те
ти
тобто
тож
тощо
хоча
це
цей
чи
чого
що
як
який
яко
якої
є
із
інших
їх
її
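Once loaded, a stopword list like the new ukrainian.txt is used as a plain membership test. A minimal sketch of that filtering — the set below copies only a few entries from the list above, while sumy itself reads the complete file from sumy/data/stopwords/ukrainian.txt:

```python
# A handful of entries from the new stopword list above; sumy loads the
# complete file from sumy/data/stopwords/ukrainian.txt at runtime.
stopwords = {"та", "що", "це", "він", "вона", "й"}

words = "це новий текст та він короткий".split()
# Drop the function words; only content-bearing words remain.
content_words = [w for w in words if w not in stopwords]
print(content_words)
```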
4 changes: 3 additions & 1 deletion sumy/nlp/stemmers/__init__.py
@@ -6,13 +6,14 @@
import nltk.stem.snowball as nltk_stemmers_module

from .czech import stem_word as czech_stemmer
+from .ukrainian import stem_word as ukrainian_stemmer

from ..._compat import to_unicode
from ...utils import normalize_language


def null_stemmer(object):
-    "Converts given object to unicode with lower letters."
+    """Converts given object to unicode with lower letters."""
    return to_unicode(object).lower()


@@ -24,6 +25,7 @@ class Stemmer(object):
        'chinese': null_stemmer,
        'japanese': null_stemmer,
        'korean': null_stemmer,
+       'ukrainian': ukrainian_stemmer,
    }

    def __init__(self, language):
63 changes: 63 additions & 0 deletions sumy/nlp/stemmers/ukrainian.py
@@ -0,0 +1,63 @@
import re

_PERFECTIVE_GROUND = r'(ив|ивши|ившись|ыв|ывши|ывшись((?<=[ая])(в|вши|вшись)))$'
# http://uk.wikipedia.org/wiki/Рефлексивне_дієслово
_REFLEXIVE = r'(с[яьи])$'
# http://uk.wikipedia.org/wiki/Прикметник + http://wapedia.mobi/uk/Прикметник
_ADJECTIVE = r'(ими|ій|ий|а|е|ова|ове|ів|є|їй|єє|еє|я|ім|ем|им|ім|их|іх|ою|йми|іми|у|ю|ого|ому|ої)$'
# http://uk.wikipedia.org/wiki/Дієприкметник
_PARTICIPLE = r'(ий|ого|ому|им|ім|а|ій|у|ою|ій|і|их|йми|их)$'
# http://uk.wikipedia.org/wiki/Дієслово
_VERB = r'(сь|ся|ив|ать|ять|у|ю|ав|али|учи|ячи|вши|ши|е|ме|ати|яти|є)$'
# http://uk.wikipedia.org/wiki/Іменник
_NOUN = r'(а|ев|ов|е|ями|ами|еи|и|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я|і|ові|ї|ею|єю|ою|є|еві|ем|єм|ів|їв|ю)$'
_RVRE = r'[аеиоуюяіїє]'
_DERIVATIONAL = r'[^аеиоуюяіїє][аеиоуюяіїє]+[^аеиоуюяіїє]+[аеиоуюяіїє].*(?<=о)сть?$'


def stem_word(word):
    """
    Based on https://drupal.org/project/ukstemmer and ported to Python https://github.com/Amice13/ukr_stemmer
    """
    word = _preprocess(word)
    if not re.search('[аеиоуюяіїє]', word):
        return word

    p = re.search(_RVRE, word)
    start = word[0:p.span()[1]]
    suffix = word[p.span()[1]:]

    # Step 1
    updated, suffix = _update_suffix(suffix, _PERFECTIVE_GROUND, '')
    if not updated:
        _, suffix = _update_suffix(suffix, _REFLEXIVE, '')
        updated, suffix = _update_suffix(suffix, _ADJECTIVE, '')
        if updated:
            updated, suffix = _update_suffix(suffix, _PARTICIPLE, '')
        else:
            updated, suffix = _update_suffix(suffix, _VERB, '')
            if not updated:
                _, suffix = _update_suffix(suffix, _NOUN, '')
    # Step 2
    updated, suffix = _update_suffix(suffix, 'и$', '')

    # Step 3
    if re.search(_DERIVATIONAL, suffix):
        updated, suffix = _update_suffix(suffix, 'ость$', '')

    # Step 4
    updated, suffix = _update_suffix(suffix, 'ь$', '')
    if updated:
        _, suffix = _update_suffix(suffix, 'ейше?$', '')
        _, suffix = _update_suffix(suffix, 'нн$', u'н')

    return start + suffix


def _preprocess(word):
    return word.lower().replace("'", '').replace('ё', 'е').replace('ъ', 'ї')


def _update_suffix(suffix, pattern, replacement):
    result = re.sub(pattern, replacement, suffix)
    return suffix != result, result
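The whole stemmer leans on `_update_suffix`, which substitutes a regex pattern and reports whether it actually matched, so each step can branch on what the previous one stripped. A self-contained sketch of that helper, re-declared here so the snippet runs without sumy installed:

```python
import re

def update_suffix(suffix, pattern, replacement):
    # Substitute the pattern and report whether anything changed,
    # mirroring _update_suffix from the new stemmer module.
    result = re.sub(pattern, replacement, suffix)
    return suffix != result, result

# Step 4 above rewrites a trailing double 'нн' to a single 'н'.
print(update_suffix("читанн", r"нн$", "н"))
print(update_suffix("кіт", r"нн$", "н"))  # no match: flag stays False
```

The `(changed, result)` pair is what lets Step 1 fall through to the noun suffixes only when the verb patterns found nothing.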
2 changes: 2 additions & 0 deletions sumy/nlp/tokenizers.py
@@ -91,9 +91,11 @@ class Tokenizer(object):
    LANGUAGE_EXTRA_ABREVS = {
        "english": ["e.g", "al", "i.e"],
        "german": ["al", "z.B", "Inc", "engl", "z. B", "vgl", "lat", "bzw", "S"],
+       "ukrainian": ["ім.", "о.", "вул.", "просп.", "бул.", "пров.", "пл.", "г.", "р.", "див.", "п.", "с.", "м."],
    }

    SPECIAL_SENTENCE_TOKENIZERS = {
+       'ukrainian': nltk.RegexpTokenizer(r'[.!?…»]', gaps=True),
        'hebrew': nltk.RegexpTokenizer(r'\.\s+', gaps=True),
        'japanese': nltk.RegexpTokenizer('[^ !?。]*[!?。]'),
        'chinese': nltk.RegexpTokenizer('[^ !?。]*[!?。]'),
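The new Ukrainian sentence tokenizer uses `gaps=True`, so the character class `[.!?…»]` acts as a delimiter between sentences rather than a token pattern. A rough standard-library approximation of that behaviour — the sample text is made up for illustration:

```python
import re

text = "Перше речення. Друге речення! А це вже третє?"
# With gaps=True, nltk.RegexpTokenizer splits on the pattern; re.split
# over the same character class behaves much the same way.
sentences = [s.strip() for s in re.split(r"[.!?…»]", text) if s.strip()]
print(sentences)
```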
