Ngapi is a semantic chunker based on fastText word embeddings for the Myanmar language. This approach can also be applied to other languages.
If you have used the Ngapi semantic chunker, please cite it as follows:
(Ngapi Semantic Chunker ကို သုံးဖြစ်ကြရင် အောက်ပါ citation လုပ်ပေးပါ။ ကျေးဇူးပါ။)
@misc{ngapi_2024,
  author      = {Ye Kyaw Thu},
  title       = {{NgaPi} Semantic Chunker for {Burmese}, Version 1.0},
  month       = dec,
  year        = {2024},
  url         = {https://github.com/ye-kyaw-thu/NgaPi},
  note        = {Accessed: 2024-12-25},
  institution = {LU Lab., Myanmar}
}
[1]. A 19th-century Burmese watercolor depicting a ngapi hawker: https://en.wikipedia.org/wiki/Ngapi
[2]. The 5 Levels Of Text Splitting For Retrieval by Greg Kamradt: https://www.youtube.com/watch?v=8OJC21T2SL4&t=2112s
[3] P. Bojanowski*, E. Grave*, A. Joulin, T. Mikolov, Enriching Word Vectors with Subword Information
@article{bojanowski2016enriching,
  author  = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  title   = {Enriching Word Vectors with Subword Information},
  journal = {arXiv preprint arXiv:1607.04606},
  year    = {2016},
}
[4] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, Bag of Tricks for Efficient Text Classification
@article{joulin2016bag,
  author  = {Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
  title   = {Bag of Tricks for Efficient Text Classification},
  journal = {arXiv preprint arXiv:1607.01759},
  year    = {2016},
}
[5] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, FastText.zip: Compressing text classification models
@article{joulin2016fasttext,
  author  = {Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, Herv{\'e} and Mikolov, Tomas},
  title   = {{FastText.zip}: Compressing Text Classification Models},
  journal = {arXiv preprint arXiv:1612.03651},
  year    = {2016}
}
(* These authors contributed equally.)
[6]. Regular-expression (RE) based syllable-breaking tool for Burmese: https://github.com/ye-kyaw-thu/sylbreak