mirror of https://github.com/explosion/spaCy.git
Break language data components into their own files
commit 704c7442e0
parent 62655fd36f
spacy/en/stop_words.py (new file, 67 lines)
@@ -0,0 +1,67 @@
# encoding: utf8
from __future__ import unicode_literals


STOP_WORDS = set("""
a about above across after afterwards again against all almost alone along
already also although always am among amongst amount an and another any anyhow
anyone anything anyway anywhere are around as at

back be became because become becomes becoming been before beforehand behind
being below beside besides between beyond both bottom but by

call can cannot ca could

did do does doing done down due during

each eight either eleven else elsewhere empty enough etc even ever every
everyone everything everywhere except

few fifteen fifty first five for former formerly forty four from front full
further

get give go

had has have he hence her here hereafter hereby herein hereupon hers herself
him himself his how however hundred

i if in inc indeed into is it its itself

keep

last latter latterly least less

just

made make many may me meanwhile might mine more moreover most mostly move much
must my myself

name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere

of off often on once one only onto or other others otherwise our ours ourselves
out over own

part per perhaps please put

quite

rather re really regarding

same say see seem seemed seeming seems serious several she should show side
since six sixty so some somehow someone something sometime sometimes somewhere
still such

take ten than that the their them themselves then thence there thereafter
thereby therefore therein thereupon these they third this those though three
through throughout thru thus to together too top toward towards twelve twenty
two

under until up unless upon us used using

various very via was we well were what whatever when whence whenever where
whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would

yet you your yours yourself yourselves
""".split())
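
STOP_WORDS is a plain Python set of lowercase strings, so membership tests are constant-time. A minimal usage sketch (the is_stop helper and the lowercasing step are illustrative assumptions, not part of this commit):

    # encoding: utf8
    from __future__ import unicode_literals

    from spacy.en.stop_words import STOP_WORDS


    def is_stop(word):
        # The set contains lowercase forms only, so normalise before the lookup.
        return word.lower() in STOP_WORDS


    assert is_stop("About")
    assert not is_stop("tokenizer")
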
spacy/en/tag_map.py (new file, 64 lines)
@@ -0,0 +1,64 @@
# encoding: utf8
from __future__ import unicode_literals

from ..symbols import *


TAG_MAP = {
    ".": {POS: PUNCT, "PunctType": "peri"},
    ",": {POS: PUNCT, "PunctType": "comm"},
    "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
    "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
    "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
    "\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    ":": {POS: PUNCT},
    "$": {POS: SYM, "Other": {"SymType": "currency"}},
    "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
    "AFX": {POS: ADJ, "Hyph": "yes"},
    "CC": {POS: CONJ, "ConjType": "coor"},
    "CD": {POS: NUM, "NumType": "card"},
    "DT": {POS: DET},
    "EX": {POS: ADV, "AdvType": "ex"},
    "FW": {POS: X, "Foreign": "yes"},
    "HYPH": {POS: PUNCT, "PunctType": "dash"},
    "IN": {POS: ADP},
    "JJ": {POS: ADJ, "Degree": "pos"},
    "JJR": {POS: ADJ, "Degree": "comp"},
    "JJS": {POS: ADJ, "Degree": "sup"},
    "LS": {POS: PUNCT, "NumType": "ord"},
    "MD": {POS: VERB, "VerbType": "mod"},
    "NIL": {POS: ""},
    "NN": {POS: NOUN, "Number": "sing"},
    "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
    "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
    "NNS": {POS: NOUN, "Number": "plur"},
    "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
    "POS": {POS: PART, "Poss": "yes"},
    "PRP": {POS: PRON, "PronType": "prs"},
    "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
    "RB": {POS: ADV, "Degree": "pos"},
    "RBR": {POS: ADV, "Degree": "comp"},
    "RBS": {POS: ADV, "Degree": "sup"},
    "RP": {POS: PART},
    "SYM": {POS: SYM},
    "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
    "UH": {POS: INTJ},
    "VB": {POS: VERB, "VerbForm": "inf"},
    "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
    "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
    "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
    "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
    "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
    "WDT": {POS: ADJ, "PronType": "int|rel"},
    "WP": {POS: NOUN, "PronType": "int|rel"},
    "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
    "WRB": {POS: ADV, "PronType": "int|rel"},
    "SP": {POS: SPACE},
    "ADD": {POS: X},
    "NFP": {POS: PUNCT},
    "GW": {POS: X},
    "XX": {POS: X},
    "BES": {POS: VERB},
    "HVS": {POS: VERB}
}
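
Each TAG_MAP entry maps a fine-grained Penn Treebank tag to the coarse part-of-speech symbol stored under the POS key, plus any morphological features. A minimal lookup sketch (the coarse_pos_and_feats helper is an illustrative assumption; POS and VERB come from spacy.symbols, as in the file above):

    # encoding: utf8
    from __future__ import unicode_literals

    from spacy.en.tag_map import TAG_MAP
    from spacy.symbols import POS, VERB


    def coarse_pos_and_feats(tag):
        # Copy the entry so the shared TAG_MAP dict is never mutated.
        props = dict(TAG_MAP[tag])
        return props.pop(POS), props


    pos, feats = coarse_pos_and_feats("VBZ")
    assert pos == VERB
    assert feats == {"VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3}
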
@@ -3,159 +3,6 @@ from __future__ import unicode_literals
 
 from ..symbols import *
 from ..language_data import PRON_LEMMA
-from ..language_data import TOKENIZER_PREFIXES
-from ..language_data import TOKENIZER_SUFFIXES
-from ..language_data import TOKENIZER_INFIXES
-
-
-def get_time_exc(hours):
-    exc = {}
-    for hour in hours:
-        exc["%da.m." % hour] = [
-            {ORTH: hour},
-            {ORTH: "a.m."}
-        ]
-
-        exc["%dp.m." % hour] = [
-            {ORTH: hour},
-            {ORTH: "p.m."}
-        ]
-
-        exc["%dam" % hour] = [
-            {ORTH: hour},
-            {ORTH: "am", LEMMA: "a.m."}
-        ]
-
-        exc["%dpm" % hour] = [
-            {ORTH: hour},
-            {ORTH: "pm", LEMMA: "p.m."}
-        ]
-    return exc
-
-
-TAG_MAP = {
-    ".": {POS: PUNCT, "PunctType": "peri"},
-    ",": {POS: PUNCT, "PunctType": "comm"},
-    "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
-    "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
-    "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
-    "\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
-    "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
-    ":": {POS: PUNCT},
-    "$": {POS: SYM, "Other": {"SymType": "currency"}},
-    "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
-    "AFX": {POS: ADJ, "Hyph": "yes"},
-    "CC": {POS: CONJ, "ConjType": "coor"},
-    "CD": {POS: NUM, "NumType": "card"},
-    "DT": {POS: DET},
-    "EX": {POS: ADV, "AdvType": "ex"},
-    "FW": {POS: X, "Foreign": "yes"},
-    "HYPH": {POS: PUNCT, "PunctType": "dash"},
-    "IN": {POS: ADP},
-    "JJ": {POS: ADJ, "Degree": "pos"},
-    "JJR": {POS: ADJ, "Degree": "comp"},
-    "JJS": {POS: ADJ, "Degree": "sup"},
-    "LS": {POS: PUNCT, "NumType": "ord"},
-    "MD": {POS: VERB, "VerbType": "mod"},
-    "NIL": {POS: ""},
-    "NN": {POS: NOUN, "Number": "sing"},
-    "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
-    "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
-    "NNS": {POS: NOUN, "Number": "plur"},
-    "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
-    "POS": {POS: PART, "Poss": "yes"},
-    "PRP": {POS: PRON, "PronType": "prs"},
-    "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
-    "RB": {POS: ADV, "Degree": "pos"},
-    "RBR": {POS: ADV, "Degree": "comp"},
-    "RBS": {POS: ADV, "Degree": "sup"},
-    "RP": {POS: PART},
-    "SYM": {POS: SYM},
-    "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
-    "UH": {POS: INTJ},
-    "VB": {POS: VERB, "VerbForm": "inf"},
-    "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
-    "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
-    "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
-    "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
-    "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
-    "WDT": {POS: ADJ, "PronType": "int|rel"},
-    "WP": {POS: NOUN, "PronType": "int|rel"},
-    "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
-    "WRB": {POS: ADV, "PronType": "int|rel"},
-    "SP": {POS: SPACE},
-    "ADD": {POS: X},
-    "NFP": {POS: PUNCT},
-    "GW": {POS: X},
-    "XX": {POS: X},
-    "BES": {POS: VERB},
-    "HVS": {POS: VERB}
-}
-
-
-STOP_WORDS = set("""
-a about above across after afterwards again against all almost alone along
-already also although always am among amongst amount an and another any anyhow
-anyone anything anyway anywhere are around as at
-
-back be became because become becomes becoming been before beforehand behind
-being below beside besides between beyond both bottom but by
-
-call can cannot ca could
-
-did do does doing done down due during
-
-each eight either eleven else elsewhere empty enough etc even ever every
-everyone everything everywhere except
-
-few fifteen fifty first five for former formerly forty four from front full
-further
-
-get give go
-
-had has have he hence her here hereafter hereby herein hereupon hers herself
-him himself his how however hundred
-
-i if in inc indeed into is it its itself
-
-keep
-
-last latter latterly least less
-
-just
-
-made make many may me meanwhile might mine more moreover most mostly move much
-must my myself
-
-name namely neither never nevertheless next nine no nobody none noone nor not
-nothing now nowhere
-
-of off often on once one only onto or other others otherwise our ours ourselves
-out over own
-
-part per perhaps please put
-
-quite
-
-rather re really regarding
-
-same say see seem seemed seeming seems serious several she should show side
-since six sixty so some somehow someone something sometime sometimes somewhere
-still such
-
-take ten than that the their them themselves then thence there thereafter
-thereby therefore therein thereupon these they third this those though three
-through throughout thru thus to together too top toward towards twelve twenty
-two
-
-under until up unless upon us used using
-
-various very via was we well were what whatever when whence whenever where
-whereafter whereas whereby wherein whereupon wherever whether which while
-whither who whoever whole whom whose why will with within without would
-
-yet you your yours yourself yourselves
-""".split())
 
 
 TOKENIZER_EXCEPTIONS = {
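
The get_time_exc helper removed above expands hour numbers into tokenizer exceptions, so that surface forms like "3am" are split into a number token and a clock token whose lemma is normalised to "a.m." or "p.m.". A compacted restatement, for illustration only (the commit message suggests the helper moved into its own file; ORTH and LEMMA are the symbols the original code imports):

    # encoding: utf8
    from __future__ import unicode_literals

    from spacy.symbols import ORTH, LEMMA


    def get_time_exc(hours):
        # Four surface forms per hour: "1a.m.", "1p.m.", "1am" and "1pm",
        # each analysed as two tokens.
        exc = {}
        for hour in hours:
            exc["%da.m." % hour] = [{ORTH: hour}, {ORTH: "a.m."}]
            exc["%dp.m." % hour] = [{ORTH: hour}, {ORTH: "p.m."}]
            exc["%dam" % hour] = [{ORTH: hour}, {ORTH: "am", LEMMA: "a.m."}]
            exc["%dpm" % hour] = [{ORTH: hour}, {ORTH: "pm", LEMMA: "p.m."}]
        return exc


    exc = get_time_exc(range(1, 13))
    assert exc["3am"] == [{ORTH: 3}, {ORTH: "am", LEMMA: "a.m."}]
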