* Generalize tokenization rules to capitals

2025-10-25 05:01:02 +03:00 · 2014-07-07 05:07:21 +02:00 · 2014-07-07 05:07:21 +02:00 · 25849fc926
commit 25849fc926
parent df0458001d
2 changed files with 15 additions and 0 deletions
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -36,6 +36,11 @@ def read_tokenization(lang):
            assert chunk not in seen, chunk
            seen.add(chunk)
            entries.append((chunk, lex, pieces))
            if chunk[0].isalpha() and chunk[0].islower():
                chunk = chunk[0].title() + chunk[1:]
                lex = lex[0].title() + lex[1:]
                seen.add(chunk)
                entries.append((chunk, lex, pieces))
    return entries
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -32,3 +32,13 @@ def test_aint():
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[0])) == "are"
    assert unhash(lex_of(tokens[1])) == "not"
 def test_capitalized():
    """Check that contractions split into two tokens in capitalized form too.

    Looks up "can't", "Can't" and "Ain't" in turn and expects each to expand
    to exactly two tokens; for "Ain't" the first token's lexeme must keep the
    leading capital ("Are").
    """
    for contraction in ("can't", "Can't", "Ain't"):
        tokens = expand_chunk(lookup(contraction))
        assert len(tokens) == 2
    # `tokens` still holds the expansion of the last entry, "Ain't".
    assert unhash(lex_of(tokens[0])) == "Are"