Skip to content

Commit

Permalink
ported some tests
Browse files Browse the repository at this point in the history
  • Loading branch information
aafrecct committed Jul 22, 2024
1 parent fc273f5 commit dc652e5
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 14 deletions.
15 changes: 14 additions & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ on:
push:
branches:
- main
- master
tags:
- '*'
pull_request:
Expand All @@ -19,8 +18,18 @@ permissions:
contents: read

jobs:
# Runs the Rust test suite. The linux, musllinux, windows, macos and sdist
# jobs all declare `needs: [test]`, so they wait for this job to succeed.
test:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    - name: Run tests
      # `uses` must be a full `{owner}/{repo}@{ref}` reference; a bare
      # `actions-rs` cannot be resolved to an action.
      uses: actions-rs/cargo@v1
      with:
        command: test

linux:
runs-on: ${{ matrix.platform.runner }}
needs: [test]
strategy:
matrix:
platform:
Expand Down Expand Up @@ -56,6 +65,7 @@ jobs:

musllinux:
runs-on: ${{ matrix.platform.runner }}
needs: [test]
strategy:
matrix:
platform:
Expand Down Expand Up @@ -87,6 +97,7 @@ jobs:

windows:
runs-on: ${{ matrix.platform.runner }}
needs: [test]
strategy:
matrix:
platform:
Expand Down Expand Up @@ -114,6 +125,7 @@ jobs:

macos:
runs-on: ${{ matrix.platform.runner }}
needs: [test]
strategy:
matrix:
platform:
Expand All @@ -140,6 +152,7 @@ jobs:

sdist:
runs-on: ubuntu-latest
needs: [test]
steps:
- uses: actions/checkout@v4
- name: Build sdist
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "simple_unicode_normalization_forms"
version = "0.2.0"
version = "0.2.1"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
2 changes: 1 addition & 1 deletion src/emoji.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ pub const CHAR_TO_AVOID: &'static [(char, char)] = &[
('\u{20D0}', '\u{20FF}'), // Combining Diacritical Marks for Symbols
('\u{2800}', '\u{28FF}'), // Braille Patterns
// ('\u{D800}', '\u{F8FF}'), // High Surrogates, High Private Use Surrogates, Low Surrogates and Private Use Area blocks
('\u{E000}', '\u{F8FF}'), // Private Use Area blocks
('\u{E000}', '\u{F8FF}'), // Private Use Area blocks
('\u{10000}', '\u{10FFFF}'), // Extra planes
];

Expand Down
82 changes: 72 additions & 10 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,13 @@ fn custom_character_normalization(

#[pyfunction]
#[pyo3(signature = (value, allow_tab=false, allow_eol=true, collapse_whitespace=false, remove_emojis=false))]
fn basic_string_clean(value: String, allow_tab: bool, allow_eol: bool, collapse_whitespace: bool, remove_emojis: bool) -> PyResult<String> {
fn basic_string_clean(
value: String,
allow_tab: bool,
allow_eol: bool,
collapse_whitespace: bool,
remove_emojis: bool,
) -> PyResult<String> {
let mut allowed_chars = vec!['º', 'ª'];
if allow_tab {
allowed_chars.push('\t');
Expand All @@ -82,9 +88,11 @@ fn basic_string_clean(value: String, allow_tab: bool, allow_eol: bool, collapse_
allowed_chars.push('\r');
}

Ok(custom_normalization(value, allowed_chars, collapse_whitespace, remove_emojis)
.trim()
.to_string())
Ok(
custom_normalization(value, allowed_chars, collapse_whitespace, remove_emojis)
.trim()
.to_string(),
)
}

#[pyfunction]
Expand All @@ -104,13 +112,67 @@ fn simple_unicode_normalization_forms(m: &Bound<'_, PyModule>) -> PyResult<()> {
#[cfg(test)]
mod tests {
use super::remove_emojis;
use std::time::Instant;
use std::time::{Duration, Instant};

/// Table-driven check of `remove_emojis`.
///
/// Every row pairs an input with the output it should produce. A `None`
/// expectation means the input must come back unchanged.
#[test]
fn correctness() {
    let table: [(&str, Option<&str>); 18] = [
        // Plain Spanish text with accents, digits and common symbols.
        (
            "Este es un texto de prueba. Contiene todas las letras del alfabeto español: á, é, í, ó, ú, ü, ñ y Ñ. También incluye números (123) y otros símbolos habituales (-*#@€©) .",
            None,
        ),
        // Surrounding whitespace and an embedded line break.
        (
            " dirección con\nvarias líneas y muchos espacios en blanco ",
            Some("dirección con varias líneas y muchos espacios en blanco"),
        ),
        // Control characters only.
        ("\u{0000}\u{0008}\u{009F}\u{009E}", Some("")),
        // Emoji, keycap sequences and skin-tone modifiers.
        ("Lui Ángel🪽🪽🪽🪽🪽🪽🫀🔂", Some("Lui Ángel")),
        (
            " a\t name with ❤️✳️0️⃣#️⃣ #©*1 ",
            Some("a name with ❤✳0# #©*1"),
        ),
        ("👍🏽👍🏻👍🏿", Some("")),
        ("🦰..🦳", Some("..")),
        // Characters outside the Basic Multilingual Plane.
        ("𓃵𓀂𓆏𓍊𓋼𓍊🂡🀷🀉𐆔", Some("")),
        // Mathematical italic letters.
        ("𝑝𝑖𝑒𝑑𝑎𝑑 𝑖𝑛𝑚𝑎𝑐𝑢𝑙𝑎𝑑𝑎", Some("piedad inmaculada")),
        (
            "𝑐𝑎𝑙𝑙𝑒 𝑞𝑢𝑒𝑣𝑒𝑑𝑜 𝑛𝑢𝑚𝑒𝑟𝑜 1 𝑐𝑎𝑠𝑎",
            Some("calle quevedo numero 1 casa"),
        ),
        // Text in several scripts that must pass through untouched.
        ("Rua nossa senhora de Belém n16", None),
        ("Vordere Zollamtsstraße 11", None),
        ("GLUMSØ", None),
        ("Bård Skolemesters vei 14, 1.", None),
        ("45 شارع النهضة", None),
        ("女子学院中学校", None),
        // Half-width katakana in, full-width katakana out.
        ("アイウエオ", Some("アイウエオ")),
        ("北京海洋馆", None),
    ];

    for (input, expectation) in table {
        // With no explicit expectation the input itself is the expected output.
        let wanted = expectation.unwrap_or(input).to_string();
        let produced = remove_emojis(input.to_string()).unwrap();
        assert_eq!(wanted, produced)
    }
}

#[test]
fn timeit() {
let t1 = Instant::now();
remove_emojis(" a\t name with ❤️✳️0️⃣#️⃣ #©*1 ".to_string());
let t2 = Instant::now();
println!("{:?}", t2 - t1);
#[allow(unused)]
fn performance() {
let mut total: Duration = Duration::new(0, 0);

for _ in 0..10000 {
let t1 = Instant::now();
remove_emojis(
"𝑐𝑎𝑙𝑙𝑒 𝑞𝑢𝑒𝑣𝑒𝑑𝑜 𝑛𝑢𝑚𝑒𝑟𝑜 1 𝑐𝑎𝑠𝑎 a\t name with ❤️✳️0️⃣#️⃣ #©*1👍🏽👍🏻👍🏿 "
.to_string(),
);
let t2 = Instant::now();
total += t2 - t1;
}

println!("{:?}", total / 10000);
}
}

0 comments on commit dc652e5

Please sign in to comment.