mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Rename stop_words.py to word_sets.py and include more sets
NUM_WORDS and ORDINAL_WORDS are currently not used, but the hard-coded list should be removed from orth.pyx and replaced with language-specific functions. This will later allow other languages to use their own functions to set those flags. (In English, this is easier because a word only needs to be checked against a set – in German, for example, this requires a more complex function, as most number words are written as one word.)
This commit is contained in:
parent
f24f9b4b7b
commit
f9e603903b
|
@ -6,7 +6,7 @@ from ..language_data import update_exc, strings_to_exc, expand_exc
|
|||
from ..symbols import ORTH, LEMMA
|
||||
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .word_sets import STOP_WORDS, NUM_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
|
||||
from .morph_rules import MORPH_RULES
|
||||
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
# encoding: utf8
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Stop words
|
||||
|
||||
STOP_WORDS = set("""
|
||||
a about above across after afterwards again against all almost alone along
|
||||
already also although always am among amongst amount an and another any anyhow
|
||||
|
@ -65,3 +67,24 @@ whither who whoever whole whom whose why will with within without would
|
|||
|
||||
yet you your yours yourself yourselves
|
||||
""".split())
|
||||
|
||||
|
||||
# Number words
#
# English cardinal number words, one entry per word. The literal is split on
# whitespace, so every member is a single whitespace-free string. The informal
# "gajillion" and "bazillion" are listed alongside the regular magnitudes.

NUM_WORDS = set(
    # zero through nine
    "zero one two three four five six seven eight nine "
    # ten through nineteen
    "ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen "
    "nineteen "
    # multiples of ten
    "twenty thirty forty fifty sixty seventy eighty ninety "
    # larger magnitudes
    "hundred thousand million billion trillion quadrillion "
    # informal "very large number" words
    "gajillion bazillion".split()
)
|
||||
|
||||
|
||||
# Ordinal words
#
# English ordinal number words, one entry per word. The literal is split on
# whitespace, so every member is a single whitespace-free string. Spellings
# must be exact, because lookups are plain set-membership tests: a misspelled
# entry silently fails to match the real word.

ORDINAL_WORDS = set("""
first second third fourth fifth sixth seventh eighth ninth tenth eleventh
twelfth thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth
nineteenth twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth
ninetieth hundredth thousandth millionth billionth trillionth quadrillionth
gajillionth bazillionth
""".split())
|
Loading…
Reference in New Issue
Block a user