diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py index 22f90fb34..971d998f5 100644 --- a/spacy/en/language_data.py +++ b/spacy/en/language_data.py @@ -6,7 +6,7 @@ from ..language_data import update_exc, strings_to_exc, expand_exc from ..symbols import ORTH, LEMMA from .tag_map import TAG_MAP -from .stop_words import STOP_WORDS +from .word_sets import STOP_WORDS, NUM_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY from .morph_rules import MORPH_RULES diff --git a/spacy/en/stop_words.py b/spacy/en/word_sets.py similarity index 73% rename from spacy/en/stop_words.py rename to spacy/en/word_sets.py index 1b00eb974..deb5dc44b 100644 --- a/spacy/en/stop_words.py +++ b/spacy/en/word_sets.py @@ -1,7 +1,9 @@ -# encoding: utf8 +# coding: utf8 from __future__ import unicode_literals +# Stop words + STOP_WORDS = set(""" a about above across after afterwards again against all almost alone along already also although always am among amongst amount an and another any anyhow @@ -65,3 +67,24 @@ whither who whoever whole whom whose why will with within without would yet you your yours yourself yourselves """.split()) + + +# Number words + +NUM_WORDS = set(""" +zero one two three four five six seven eight nine ten eleven twelve thirteen +fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty +sixty seventy eighty ninety hundred thousand million billion trillion +quadrillion gajillion bazillion +""".split()) + + +# Ordinal words + +ORDINAL_WORDS = set(""" +first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth +thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth +twentieth thirtieth fortieth fiftieth sixtieth seventieth eightieth ninetieth +hundredth thousandth millionth billionth trillionth quadrillionth gajillionth +bazillionth +""".split())