Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)

commit 25849fc926
parent df0458001d

Generalize tokenization rules to capitals
@@ -36,6 +36,11 @@ def read_tokenization(lang):
         assert chunk not in seen, chunk
         seen.add(chunk)
         entries.append((chunk, lex, pieces))
+        if chunk[0].isalpha() and chunk[0].islower():
+            chunk = chunk[0].title() + chunk[1:]
+            lex = lex[0].title() + lex[1:]
+            seen.add(chunk)
+            entries.append((chunk, lex, pieces))
     return entries
 
 
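The change folds capitalized variants into the special-case table as it is read: any rule whose chunk starts with a lowercase letter also registers a title-cased twin, with the lex of the first token re-cased to match. A minimal standalone sketch of that logic follows; the helper name and the sample rules are hypothetical, and only the (chunk, lex, pieces) tuple shape and the if-block are taken from the diff.

# Minimal sketch, not spaCy's actual API. generalize_to_capitals and the
# sample rules are hypothetical; the variant-generation logic mirrors the
# diff above.

def generalize_to_capitals(rules):
    entries = []
    seen = set()
    for chunk, lex, pieces in rules:
        assert chunk not in seen, chunk
        seen.add(chunk)
        entries.append((chunk, lex, pieces))
        # Only alphabetic, lowercase-initial chunks gain a title-cased twin;
        # rules led by punctuation (e.g. "'s") are left alone.
        if chunk[0].isalpha() and chunk[0].islower():
            chunk = chunk[0].title() + chunk[1:]
            lex = lex[0].title() + lex[1:]
            seen.add(chunk)
            entries.append((chunk, lex, pieces))
    return entries


if __name__ == '__main__':
    rules = [("ain't", "are", ["not"]), ("can't", "can", ["not"])]
    for entry in generalize_to_capitals(rules):
        print(entry)
    # ("ain't", 'are', ['not'])
    # ("Ain't", 'Are', ['not'])
    # ("can't", 'can', ['not'])
    # ("Can't", 'Can', ['not'])
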
@@ -32,3 +32,13 @@ def test_aint():
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "are"
     assert unhash(lex_of(tokens[1])) == "not"
+
+
+def test_capitalized():
+    tokens = expand_chunk(lookup("can't"))
+    assert len(tokens) == 2
+    tokens = expand_chunk(lookup("Can't"))
+    assert len(tokens) == 2
+    tokens = expand_chunk(lookup("Ain't"))
+    assert len(tokens) == 2
+    assert unhash(lex_of(tokens[0])) == "Are"
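Note that the rule re-cases only chunk and lex, not pieces: per the new test, "Ain't" expands to "Are" followed by "not", so only the first token of the expansion is capitalized and later pieces keep their lowercase form.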