mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Rename stop_words.py to word_sets.py and include more sets
NUM_WORDS and ORDINAL_WORDS are currently not used, but the hard-coded list should be removed from orth.pyx and replaced with language-specific functions. This will later allow other languages to use their own functions to set those flags. (In English, this is easier because a token only needs to be checked against a set – in German, for example, this requires a more complex function, as most number words are written as one word.)
This commit is contained in:
parent
f24f9b4b7b
commit
f9e603903b
|
@ -6,7 +6,7 @@ from ..language_data import update_exc, strings_to_exc, expand_exc
|
||||||
from ..symbols import ORTH, LEMMA
|
from ..symbols import ORTH, LEMMA
|
||||||
|
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .stop_words import STOP_WORDS
|
from .word_sets import STOP_WORDS, NUM_WORDS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
|
||||||
from .morph_rules import MORPH_RULES
|
from .morph_rules import MORPH_RULES
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
# encoding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
# Stop words
|
||||||
|
|
||||||
STOP_WORDS = set("""
|
STOP_WORDS = set("""
|
||||||
a about above across after afterwards again against all almost alone along
|
a about above across after afterwards again against all almost alone along
|
||||||
already also although always am among amongst amount an and another any anyhow
|
already also although always am among amongst amount an and another any anyhow
|
||||||
|
@ -65,3 +67,24 @@ whither who whoever whole whom whose why will with within without would
|
||||||
|
|
||||||
yet you your yours yourself yourselves
|
yet you your yours yourself yourselves
|
||||||
""".split())
|
""".split())
|
||||||
|
|
||||||
|
|
||||||
|
# Number words
|
||||||
|
|
||||||
|
# Spelled-out English cardinal number words, grouped by magnitude. The last
# row holds large and informal quantities.
NUM_WORDS = {
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten",
    "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
    "seventeen", "eighteen", "nineteen",
    "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty",
    "ninety",
    "hundred", "thousand", "million", "billion", "trillion",
    "quadrillion", "gajillion", "bazillion",
}
|
||||||
|
|
||||||
|
|
||||||
|
# Ordinal words
|
||||||
|
|
||||||
|
ORDINAL_WORDS = set("""
|
||||||
|
first second third fourth fifth sixth seventh eigth ninth tenth eleventh twelveth
|
||||||
|
thirteenth fourteenth fifteenth sixteenth sventeenth eighteenth nineteenth
|
||||||
|
twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
|
||||||
|
hundreth thousandth millionth billionth trillionth quadrillionth gajillionth
|
||||||
|
bazillionth
|
||||||
|
""".split())
|
Loading…
Reference in New Issue
Block a user