Rename stop_words.py to word_sets.py and include more sets

NUM_WORDS and ORDINAL_WORDS are currently not used, but the hard-coded list should be removed from orth.pyx and replaced with language-specific functions. This will later allow other languages to use their own functions to set those flags. (In English, this is easy because a word only needs to be checked against a set – in German, for example, it requires a more complex function, as most number words are written as a single word.)
2025-10-28 14:41:14 +03:00 · 2017-03-12 13:53:46 +01:00 · 2017-03-12 13:53:46 +01:00 · f9e603903b
commit f9e603903b
parent f24f9b4b7b
2 changed files with 25 additions and 2 deletions
--- a/spacy/en/language_data.py
+++ b/spacy/en/language_data.py
@ -6,7 +6,7 @@ from ..language_data import update_exc, strings_to_exc, expand_exc
 from ..symbols import ORTH, LEMMA
 from .tag_map import TAG_MAP
-from .stop_words import STOP_WORDS
+from .word_sets import STOP_WORDS, NUM_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
 from .morph_rules import MORPH_RULES
--- a/spacy/en/stop_words.py
+++ b/spacy/en/stop_words.py
@ -1,7 +1,9 @@
-# encoding: utf8
+# coding: utf8
 from __future__ import unicode_literals
 # Stop words
 STOP_WORDS = set("""
 a about above across after afterwards again against all almost alone along
 already also although always am among amongst amount an and another any anyhow
@ -65,3 +67,24 @@ whither who whoever whole whom whose why will with within without would
 yet you your yours yourself yourselves
 """.split())
 # Number words
 # English cardinal-number words, grouped by magnitude. The adjacent string
 # literals are concatenated into one whitespace-separated listing, which is
 # then split into the individual words that make up the set.
 NUM_WORDS = set((
     "zero one two three four five six seven eight nine "
     "ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen "
     "nineteen "
     "twenty thirty forty fifty sixty seventy eighty ninety "
     "hundred thousand million billion trillion quadrillion "
     "gajillion bazillion"
 ).split())
 # Ordinal words
 # English ordinal-number words, one per entry of NUM_WORDS-style magnitudes
 # (first..nineteenth, the tens, and the large powers). The listing is split
 # on whitespace, so line breaks and spacing inside the literal are irrelevant.
 # Spellings matter: a misspelled entry can never match a correctly spelled
 # token, so "eighth", "twelfth", "seventeenth" and "hundredth" are spelled out
 # in their standard forms here.
 ORDINAL_WORDS = set("""
 first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth
 thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth
 twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth
 hundredth thousandth millionth billionth trillionth quadrillionth gajillionth
 bazillionth
 """.split())