mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
468ca6c760
|
@ -3,7 +3,7 @@ pathlib
|
|||
numpy>=1.7
|
||||
cymem>=1.30,<1.32
|
||||
preshed>=1.0.0,<2.0.0
|
||||
thinc>=6.7.1,<6.8.0
|
||||
thinc>=6.7.2,<6.8.0
|
||||
murmurhash>=0.28,<0.29
|
||||
plac<1.0.0,>=0.9.6
|
||||
six
|
||||
|
|
2
setup.py
2
setup.py
|
@ -191,7 +191,7 @@ def setup_package():
|
|||
'murmurhash>=0.28,<0.29',
|
||||
'cymem>=1.30,<1.32',
|
||||
'preshed>=1.0.0,<2.0.0',
|
||||
'thinc>=6.7.1,<6.8.0',
|
||||
'thinc>=6.7.2,<6.8.0',
|
||||
'plac<1.0.0,>=0.9.6',
|
||||
'pip>=9.0.0,<10.0.0',
|
||||
'six',
|
||||
|
|
|
@ -28,15 +28,17 @@ from .. import displacy
|
|||
n_iter=("number of iterations", "option", "n", int),
|
||||
n_sents=("number of sentences", "option", "ns", int),
|
||||
use_gpu=("Use GPU", "flag", "G", bool),
|
||||
resume=("Whether to resume training", "flag", "R", bool),
|
||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
||||
no_parser=("Don't train parser", "flag", "P", bool),
|
||||
no_entities=("Don't train NER", "flag", "N", bool)
|
||||
)
|
||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||
use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
|
||||
use_gpu=False, resume=False, no_tagger=False, no_parser=False, no_entities=False):
|
||||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
"""
|
||||
util.set_env_log(True)
|
||||
n_sents = n_sents or None
|
||||
output_path = util.ensure_path(output_dir)
|
||||
train_path = util.ensure_path(train_data)
|
||||
|
@ -66,7 +68,11 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
|||
util.env_opt('batch_to', 64),
|
||||
util.env_opt('batch_compound', 1.001))
|
||||
|
||||
nlp = lang_class(pipeline=pipeline)
|
||||
if resume:
|
||||
prints(output_path / 'model19.pickle', title="Resuming training")
|
||||
nlp = dill.load((output_path / 'model19.pickle').open('rb'))
|
||||
else:
|
||||
nlp = lang_class(pipeline=pipeline)
|
||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
||||
n_train_docs = corpus.count_train()
|
||||
|
||||
|
@ -75,6 +81,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
|||
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
||||
try:
|
||||
for i in range(n_iter):
|
||||
if resume:
|
||||
i += 20
|
||||
with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
|
||||
train_docs = corpus.train_docs(nlp, projectivize=True,
|
||||
gold_preproc=False, max_length=0)
|
||||
|
@ -86,14 +94,18 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
|||
pbar.update(len(docs))
|
||||
|
||||
with nlp.use_params(optimizer.averages):
|
||||
util.set_env_log(False)
|
||||
epoch_model_path = output_path / ('model%d' % i)
|
||||
nlp.to_disk(epoch_model_path)
|
||||
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
|
||||
dill.dump(nlp, file_, -1)
|
||||
with (output_path / ('model%d.bin' % i)).open('wb') as file_:
|
||||
file_.write(nlp.to_bytes())
|
||||
with (output_path / ('model%d.bin' % i)).open('rb') as file_:
|
||||
nlp_loaded = lang_class(pipeline=pipeline)
|
||||
nlp_loaded.from_bytes(file_.read())
|
||||
scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False))
|
||||
nlp_loaded = lang_class(pipeline=pipeline)
|
||||
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
|
||||
scorer = nlp_loaded.evaluate(
|
||||
corpus.dev_docs(
|
||||
nlp_loaded,
|
||||
gold_preproc=False))
|
||||
util.set_env_log(True)
|
||||
print_progress(i, losses, scorer.scores)
|
||||
finally:
|
||||
print("Saving model...")
|
||||
|
|
|
@ -56,7 +56,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
|||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||
httpd = simple_server.make_server('0.0.0.0', port, app)
|
||||
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
|
||||
httpd.serve_forever()
|
||||
try:
|
||||
httpd.serve_forever()
|
||||
except KeyboardInterrupt:
|
||||
prints("Shutting down server on port %d." % port)
|
||||
finally:
|
||||
httpd.server_close()
|
||||
|
||||
|
||||
def app(environ, start_response):
|
||||
|
@ -65,12 +70,13 @@ def app(environ, start_response):
|
|||
return [res]
|
||||
|
||||
|
||||
def parse_deps(doc, options={}):
|
||||
def parse_deps(orig_doc, options={}):
|
||||
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
||||
|
||||
doc (Doc): Document to parse.
|
||||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||
"""
|
||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
||||
if options.get('collapse_punct', True):
|
||||
spans = []
|
||||
for word in doc[:-1]:
|
||||
|
|
|
@ -18,12 +18,11 @@ class DependencyRenderer(object):
|
|||
offset_x, color, bg, font)
|
||||
"""
|
||||
self.compact = options.get('compact', False)
|
||||
distance, arrow_width = (85, 8) if self.compact else (175, 10)
|
||||
self.word_spacing = options.get('word_spacing', 45)
|
||||
self.arrow_spacing = options.get('arrow_spacing', 20)
|
||||
self.arrow_width = options.get('arrow_width', arrow_width)
|
||||
self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
|
||||
self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
|
||||
self.arrow_stroke = options.get('arrow_stroke', 2)
|
||||
self.distance = options.get('distance', distance)
|
||||
self.distance = options.get('distance', 150 if self.compact else 175)
|
||||
self.offset_x = options.get('offset_x', 50)
|
||||
self.color = options.get('color', '#000000')
|
||||
self.bg = options.get('bg', '#ffffff')
|
||||
|
@ -99,6 +98,8 @@ class DependencyRenderer(object):
|
|||
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
|
||||
-self.arrow_spacing*(self.highest_level-level)/4)
|
||||
y_curve = self.offset_y-level*self.distance/2
|
||||
if self.compact:
|
||||
y_curve = self.offset_y-level*self.distance/6
|
||||
if y_curve == 0 and len(self.levels) > 5:
|
||||
y_curve = -self.distance
|
||||
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
|
||||
|
|
|
@ -21,7 +21,7 @@ TPL_DEP_WORDS = """
|
|||
TPL_DEP_ARCS = """
|
||||
<g class="displacy-arrow">
|
||||
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
|
||||
<text dy="1.25em" style="font-size: 0.8em">
|
||||
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
|
||||
<textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
|
||||
</text>
|
||||
<path class="displacy-arrowhead" d="{head}" fill="currentColor"/>
|
||||
|
|
|
@ -212,7 +212,7 @@ class GoldCorpus(object):
|
|||
|
||||
def dev_docs(self, nlp, gold_preproc=False):
|
||||
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
||||
gold_docs = nlp.preprocess_gold(gold_docs)
|
||||
#gold_docs = nlp.preprocess_gold(gold_docs)
|
||||
yield from gold_docs
|
||||
|
||||
@classmethod
|
||||
|
@ -227,7 +227,7 @@ class GoldCorpus(object):
|
|||
gold_preproc)
|
||||
golds = cls._make_golds(docs, paragraph_tuples)
|
||||
for doc, gold in zip(docs, golds):
|
||||
if not max_length or len(doc) < max_length:
|
||||
if (not max_length) or len(doc) < max_length:
|
||||
yield doc, gold
|
||||
|
||||
@classmethod
|
||||
|
@ -235,17 +235,17 @@ class GoldCorpus(object):
|
|||
if raw_text is not None:
|
||||
return [nlp.make_doc(raw_text)]
|
||||
else:
|
||||
return [Doc(nlp.vocab, words=sent_tuples[0][1])
|
||||
for sent_tuples in paragraph_tuples]
|
||||
return [Doc(nlp.vocab, words=sent_tuples[1])
|
||||
for (sent_tuples, brackets) in paragraph_tuples]
|
||||
|
||||
@classmethod
|
||||
def _make_golds(cls, docs, paragraph_tuples):
|
||||
assert len(docs) == len(paragraph_tuples)
|
||||
if len(docs) == 1:
|
||||
return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0])
|
||||
for sent_tuples in paragraph_tuples]
|
||||
return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
|
||||
else:
|
||||
return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
|
||||
for doc, sent_tuples in zip(docs, paragraph_tuples)]
|
||||
return [GoldParse.from_annot_tuples(doc, sent_tuples)
|
||||
for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
|
||||
|
||||
@staticmethod
|
||||
def walk_corpus(path):
|
||||
|
|
|
@ -2,21 +2,25 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lemmatizer import LOOKUP
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class GermanDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'de'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
||||
BASE_NORMS, NORM_EXCEPTIONS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
|
|
17
spacy/lang/de/norm_exceptions.py
Normal file
17
spacy/lang/de/norm_exceptions.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# Here we only want to include the absolute most common words. Otherwise,
|
||||
# this list would get impossibly long for German – especially considering the
|
||||
# old vs. new spelling rules, and all possible cases.
|
||||
|
||||
|
||||
_exc = {
|
||||
"daß": "dass"
|
||||
}
|
||||
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
@ -8,7 +8,7 @@ from ...deprecated import PRON_LEMMA
|
|||
_exc = {
|
||||
"auf'm": [
|
||||
{ORTH: "auf", LEMMA: "auf"},
|
||||
{ORTH: "'m", LEMMA: "der", NORM: "dem" }],
|
||||
{ORTH: "'m", LEMMA: "der", NORM: "dem"}],
|
||||
|
||||
"du's": [
|
||||
{ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
|
@ -53,97 +53,97 @@ _exc = {
|
|||
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
{ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
{ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
{ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
|
||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
|
||||
{ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
|
||||
{ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
|
||||
{ORTH: "'n", LEMMA: "ein", NORM: "ein"},
|
||||
{ORTH: "'ne", LEMMA: "eine", NORM: "eine"},
|
||||
{ORTH: "'nen", LEMMA: "ein", NORM: "einen"},
|
||||
{ORTH: "'nem", LEMMA: "ein", NORM: "einem"},
|
||||
{ORTH: "Abb.", LEMMA: "Abbildung"},
|
||||
{ORTH: "Abk.", LEMMA: "Abkürzung"},
|
||||
{ORTH: "Abt.", LEMMA: "Abteilung"},
|
||||
{ORTH: "Apr.", LEMMA: "April"},
|
||||
{ORTH: "Aug.", LEMMA: "August"},
|
||||
{ORTH: "Bd.", LEMMA: "Band"},
|
||||
{ORTH: "Betr.", LEMMA: "Betreff"},
|
||||
{ORTH: "Bf.", LEMMA: "Bahnhof"},
|
||||
{ORTH: "Bhf.", LEMMA: "Bahnhof"},
|
||||
{ORTH: "Bsp.", LEMMA: "Beispiel"},
|
||||
{ORTH: "Dez.", LEMMA: "Dezember"},
|
||||
{ORTH: "Di.", LEMMA: "Dienstag"},
|
||||
{ORTH: "Do.", LEMMA: "Donnerstag"},
|
||||
{ORTH: "Fa.", LEMMA: "Firma"},
|
||||
{ORTH: "Fam.", LEMMA: "Familie"},
|
||||
{ORTH: "Feb.", LEMMA: "Februar"},
|
||||
{ORTH: "Fr.", LEMMA: "Frau"},
|
||||
{ORTH: "Frl.", LEMMA: "Fräulein"},
|
||||
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof"},
|
||||
{ORTH: "Hr.", LEMMA: "Herr"},
|
||||
{ORTH: "Hrn.", LEMMA: "Herr"},
|
||||
{ORTH: "Jan.", LEMMA: "Januar"},
|
||||
{ORTH: "Jh.", LEMMA: "Jahrhundert"},
|
||||
{ORTH: "Jhd.", LEMMA: "Jahrhundert"},
|
||||
{ORTH: "Jul.", LEMMA: "Juli"},
|
||||
{ORTH: "Jun.", LEMMA: "Juni"},
|
||||
{ORTH: "Mi.", LEMMA: "Mittwoch"},
|
||||
{ORTH: "Mio.", LEMMA: "Million"},
|
||||
{ORTH: "Mo.", LEMMA: "Montag"},
|
||||
{ORTH: "Mrd.", LEMMA: "Milliarde"},
|
||||
{ORTH: "Mrz.", LEMMA: "März"},
|
||||
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"},
|
||||
{ORTH: "Mär.", LEMMA: "März"},
|
||||
{ORTH: "Nov.", LEMMA: "November"},
|
||||
{ORTH: "Nr.", LEMMA: "Nummer"},
|
||||
{ORTH: "Okt.", LEMMA: "Oktober"},
|
||||
{ORTH: "Orig.", LEMMA: "Original"},
|
||||
{ORTH: "Pkt.", LEMMA: "Punkt"},
|
||||
{ORTH: "Prof.", LEMMA: "Professor"},
|
||||
{ORTH: "Red.", LEMMA: "Redaktion"},
|
||||
{ORTH: "Sa.", LEMMA: "Samstag"},
|
||||
{ORTH: "Sep.", LEMMA: "September"},
|
||||
{ORTH: "Sept.", LEMMA: "September"},
|
||||
{ORTH: "So.", LEMMA: "Sonntag"},
|
||||
{ORTH: "Std.", LEMMA: "Stunde"},
|
||||
{ORTH: "Str.", LEMMA: "Straße"},
|
||||
{ORTH: "Tel.", LEMMA: "Telefon"},
|
||||
{ORTH: "Tsd.", LEMMA: "Tausend"},
|
||||
{ORTH: "Univ.", LEMMA: "Universität"},
|
||||
{ORTH: "abzgl.", LEMMA: "abzüglich"},
|
||||
{ORTH: "allg.", LEMMA: "allgemein"},
|
||||
{ORTH: "bspw.", LEMMA: "beispielsweise"},
|
||||
{ORTH: "bzgl.", LEMMA: "bezüglich"},
|
||||
{ORTH: "bzw.", LEMMA: "beziehungsweise"},
|
||||
{ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"},
|
||||
{ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"},
|
||||
{ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"},
|
||||
{ORTH: "Apr.", LEMMA: "April", NORM: "April"},
|
||||
{ORTH: "Aug.", LEMMA: "August", NORM: "August"},
|
||||
{ORTH: "Bd.", LEMMA: "Band", NORM: "Band"},
|
||||
{ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"},
|
||||
{ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
|
||||
{ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
|
||||
{ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"},
|
||||
{ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"},
|
||||
{ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"},
|
||||
{ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"},
|
||||
{ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"},
|
||||
{ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"},
|
||||
{ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"},
|
||||
{ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"},
|
||||
{ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"},
|
||||
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"},
|
||||
{ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"},
|
||||
{ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"},
|
||||
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
|
||||
{ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
|
||||
{ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
|
||||
{ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"},
|
||||
{ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"},
|
||||
{ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"},
|
||||
{ORTH: "Mio.", LEMMA: "Million", NORM: "Million"},
|
||||
{ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"},
|
||||
{ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"},
|
||||
{ORTH: "Mrz.", LEMMA: "März", NORM: "März"},
|
||||
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"},
|
||||
{ORTH: "Mär.", LEMMA: "März", NORM: "März"},
|
||||
{ORTH: "Nov.", LEMMA: "November", NORM: "November"},
|
||||
{ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"},
|
||||
{ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"},
|
||||
{ORTH: "Orig.", LEMMA: "Original", NORM: "Original"},
|
||||
{ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"},
|
||||
{ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"},
|
||||
{ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"},
|
||||
{ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"},
|
||||
{ORTH: "Sep.", LEMMA: "September", NORM: "September"},
|
||||
{ORTH: "Sept.", LEMMA: "September", NORM: "September"},
|
||||
{ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"},
|
||||
{ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"},
|
||||
{ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"},
|
||||
{ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
|
||||
{ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"},
|
||||
{ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"},
|
||||
{ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"},
|
||||
{ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"},
|
||||
{ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"},
|
||||
{ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"},
|
||||
{ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"},
|
||||
{ORTH: "d.h.", LEMMA: "das heißt"},
|
||||
{ORTH: "dgl.", LEMMA: "dergleichen"},
|
||||
{ORTH: "ebd.", LEMMA: "ebenda"},
|
||||
{ORTH: "eigtl.", LEMMA: "eigentlich"},
|
||||
{ORTH: "engl.", LEMMA: "englisch"},
|
||||
{ORTH: "evtl.", LEMMA: "eventuell"},
|
||||
{ORTH: "frz.", LEMMA: "französisch"},
|
||||
{ORTH: "gegr.", LEMMA: "gegründet"},
|
||||
{ORTH: "ggf.", LEMMA: "gegebenenfalls"},
|
||||
{ORTH: "ggfs.", LEMMA: "gegebenenfalls"},
|
||||
{ORTH: "ggü.", LEMMA: "gegenüber"},
|
||||
{ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"},
|
||||
{ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"},
|
||||
{ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"},
|
||||
{ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"},
|
||||
{ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"},
|
||||
{ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"},
|
||||
{ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"},
|
||||
{ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
|
||||
{ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
|
||||
{ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"},
|
||||
{ORTH: "i.O.", LEMMA: "in Ordnung"},
|
||||
{ORTH: "i.d.R.", LEMMA: "in der Regel"},
|
||||
{ORTH: "incl.", LEMMA: "inklusive"},
|
||||
{ORTH: "inkl.", LEMMA: "inklusive"},
|
||||
{ORTH: "insb.", LEMMA: "insbesondere"},
|
||||
{ORTH: "kath.", LEMMA: "katholisch"},
|
||||
{ORTH: "lt.", LEMMA: "laut"},
|
||||
{ORTH: "max.", LEMMA: "maximal"},
|
||||
{ORTH: "min.", LEMMA: "minimal"},
|
||||
{ORTH: "mind.", LEMMA: "mindestens"},
|
||||
{ORTH: "mtl.", LEMMA: "monatlich"},
|
||||
{ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"},
|
||||
{ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"},
|
||||
{ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"},
|
||||
{ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"},
|
||||
{ORTH: "lt.", LEMMA: "laut", NORM: "laut"},
|
||||
{ORTH: "max.", LEMMA: "maximal", NORM: "maximal"},
|
||||
{ORTH: "min.", LEMMA: "minimal", NORM: "minimal"},
|
||||
{ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"},
|
||||
{ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"},
|
||||
{ORTH: "n.Chr.", LEMMA: "nach Christus"},
|
||||
{ORTH: "orig.", LEMMA: "original"},
|
||||
{ORTH: "röm.", LEMMA: "römisch"},
|
||||
{ORTH: "orig.", LEMMA: "original", NORM: "original"},
|
||||
{ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"},
|
||||
{ORTH: "s.o.", LEMMA: "siehe oben"},
|
||||
{ORTH: "sog.", LEMMA: "so genannt"},
|
||||
{ORTH: "stellv.", LEMMA: "stellvertretend"},
|
||||
{ORTH: "tägl.", LEMMA: "täglich"},
|
||||
{ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"},
|
||||
{ORTH: "u.U.", LEMMA: "unter Umständen"},
|
||||
{ORTH: "u.s.w.", LEMMA: "und so weiter"},
|
||||
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"},
|
||||
|
@ -153,9 +153,9 @@ for exc_data in [
|
|||
{ORTH: "v.Chr.", LEMMA: "vor Christus"},
|
||||
{ORTH: "v.a.", LEMMA: "vor allem"},
|
||||
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"},
|
||||
{ORTH: "vgl.", LEMMA: "vergleiche"},
|
||||
{ORTH: "vllt.", LEMMA: "vielleicht"},
|
||||
{ORTH: "vlt.", LEMMA: "vielleicht"},
|
||||
{ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"},
|
||||
{ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"},
|
||||
{ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"},
|
||||
{ORTH: "z.B.", LEMMA: "zum Beispiel"},
|
||||
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"},
|
||||
{ORTH: "z.T.", LEMMA: "zum Teil"},
|
||||
|
@ -163,7 +163,7 @@ for exc_data in [
|
|||
{ORTH: "z.Zt.", LEMMA: "zur Zeit"},
|
||||
{ORTH: "z.b.", LEMMA: "zum Beispiel"},
|
||||
{ORTH: "zzgl.", LEMMA: "zuzüglich"},
|
||||
{ORTH: "österr.", LEMMA: "österreichisch"}]:
|
||||
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
|
||||
_exc[exc_data[ORTH]] = [dict(exc_data)]
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
@ -10,14 +11,17 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
|
|||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class EnglishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'en'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
||||
BASE_NORMS, NORM_EXCEPTIONS)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
|
|
1760
spacy/lang/en/norm_exceptions.py
Normal file
1760
spacy/lang/en/norm_exceptions.py
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
|
|||
for pron in ["i"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'m"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}]
|
||||
|
||||
_exc[orth + "m"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
|
||||
|
||||
_exc[orth + "'ma"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'m", LEMMA: "be", NORM: "am"},
|
||||
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
|
||||
|
||||
_exc[orth + "ma"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "m", LEMMA: "be", NORM: "am"},
|
||||
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
|
||||
|
||||
|
@ -36,72 +36,72 @@ for pron in ["i"]:
|
|||
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'ll"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "ll"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "'ll've"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "llve"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "'d"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'d", LEMMA: "would", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "d"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "d", LEMMA: "would", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "'d've"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "dve"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "d", LEMMA: "would", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
|
||||
for pron in ["i", "you", "we", "they"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'ve"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "ve"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
|
||||
for pron in ["you", "we", "they"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'re"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
|
||||
|
||||
_exc[orth + "re"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
|
||||
|
||||
|
||||
for pron in ["he", "she", "it"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'s"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'s"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'s", NORM: "'s"}]
|
||||
|
||||
_exc[orth + "s"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "s"}]
|
||||
|
||||
|
||||
|
@ -110,111 +110,111 @@ for pron in ["he", "she", "it"]:
|
|||
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
|
||||
for orth in [word, word.title()]:
|
||||
_exc[orth + "'s"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'s"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'s", NORM: "'s"}]
|
||||
|
||||
_exc[orth + "s"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "s"}]
|
||||
|
||||
_exc[orth + "'ll"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "ll"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "'ll've"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "llve"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "'re"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
|
||||
|
||||
_exc[orth + "re"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "re", LEMMA: "be", NORM: "are"}]
|
||||
|
||||
_exc[orth + "'ve"] = [
|
||||
{ORTH: orth},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "ve"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "'d"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'d"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'d", NORM: "'d"}]
|
||||
|
||||
_exc[orth + "d"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "d"}]
|
||||
|
||||
_exc[orth + "'d've"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "dve"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "d", LEMMA: "would", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
|
||||
# Verbs
|
||||
|
||||
for verb_data in [
|
||||
{ORTH: "ca", LEMMA: "can", TAG: "MD"},
|
||||
{ORTH: "could", TAG: "MD"},
|
||||
{ORTH: "do", LEMMA: "do"},
|
||||
{ORTH: "does", LEMMA: "do"},
|
||||
{ORTH: "did", LEMMA: "do", TAG: "VBD"},
|
||||
{ORTH: "had", LEMMA: "have", TAG: "VBD"},
|
||||
{ORTH: "may", TAG: "MD"},
|
||||
{ORTH: "might", TAG: "MD"},
|
||||
{ORTH: "must", TAG: "MD"},
|
||||
{ORTH: "need"},
|
||||
{ORTH: "ought"},
|
||||
{ORTH: "sha", LEMMA: "shall", TAG: "MD"},
|
||||
{ORTH: "should", TAG: "MD"},
|
||||
{ORTH: "wo", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "would", TAG: "MD"}]:
|
||||
{ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"},
|
||||
{ORTH: "could", NORM: "could", TAG: "MD"},
|
||||
{ORTH: "do", LEMMA: "do", NORM: "do"},
|
||||
{ORTH: "does", LEMMA: "do", NORM: "does"},
|
||||
{ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"},
|
||||
{ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"},
|
||||
{ORTH: "may", NORM: "may", TAG: "MD"},
|
||||
{ORTH: "might", NORM: "might", TAG: "MD"},
|
||||
{ORTH: "must", NORM: "must", TAG: "MD"},
|
||||
{ORTH: "need", NORM: "need"},
|
||||
{ORTH: "ought", NORM: "ought", TAG: "MD"},
|
||||
{ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
|
||||
{ORTH: "should", NORM: "should", TAG: "MD"},
|
||||
{ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "would", NORM: "would", TAG: "MD"}]:
|
||||
verb_data_tc = dict(verb_data)
|
||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||
for data in [verb_data, verb_data_tc]:
|
||||
_exc[data[ORTH] + "n't"] = [
|
||||
dict(data),
|
||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
|
||||
_exc[data[ORTH] + "nt"] = [
|
||||
dict(data),
|
||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
|
||||
_exc[data[ORTH] + "n't've"] = [
|
||||
dict(data),
|
||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[data[ORTH] + "ntve"] = [
|
||||
dict(data),
|
||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
|
||||
for verb_data in [
|
||||
{ORTH: "could", TAG: "MD"},
|
||||
{ORTH: "might"},
|
||||
{ORTH: "must"},
|
||||
{ORTH: "should"}]:
|
||||
{ORTH: "could", NORM: "could", TAG: "MD"},
|
||||
{ORTH: "might", NORM: "might", TAG: "MD"},
|
||||
{ORTH: "must", NORM: "must", TAG: "MD"},
|
||||
{ORTH: "should", NORM: "should", TAG: "MD"}]:
|
||||
verb_data_tc = dict(verb_data)
|
||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||
for data in [verb_data, verb_data_tc]:
|
||||
|
@ -228,21 +228,21 @@ for verb_data in [
|
|||
|
||||
|
||||
for verb_data in [
|
||||
{ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"},
|
||||
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
|
||||
{ORTH: "is", LEMMA: "be", TAG: "VBZ"},
|
||||
{ORTH: "was", LEMMA: "be"},
|
||||
{ORTH: "were", LEMMA: "be"}]:
|
||||
{ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2},
|
||||
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
|
||||
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
|
||||
{ORTH: "was", LEMMA: "be", NORM: "was"},
|
||||
{ORTH: "were", LEMMA: "be", NORM: "were"}]:
|
||||
verb_data_tc = dict(verb_data)
|
||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||
for data in [verb_data, verb_data_tc]:
|
||||
_exc[data[ORTH] + "n't"] = [
|
||||
dict(data),
|
||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
|
||||
_exc[data[ORTH] + "nt"] = [
|
||||
dict(data),
|
||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
|
||||
|
||||
# Other contractions with trailing apostrophe
|
||||
|
@ -250,10 +250,10 @@ for verb_data in [
|
|||
for exc_data in [
|
||||
{ORTH: "doin", LEMMA: "do", NORM: "doing"},
|
||||
{ORTH: "goin", LEMMA: "go", NORM: "going"},
|
||||
{ORTH: "nothin", LEMMA: "nothing"},
|
||||
{ORTH: "nuthin", LEMMA: "nothing"},
|
||||
{ORTH: "ol", LEMMA: "old"},
|
||||
{ORTH: "somethin", LEMMA: "something"}]:
|
||||
{ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
|
||||
{ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
|
||||
{ORTH: "ol", LEMMA: "old", NORM: "old"},
|
||||
{ORTH: "somethin", LEMMA: "something", NORM: "something"}]:
|
||||
exc_data_tc = dict(exc_data)
|
||||
exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
|
||||
for data in [exc_data, exc_data_tc]:
|
||||
|
@ -266,10 +266,10 @@ for exc_data in [
|
|||
# Other contractions with leading apostrophe
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: "cause", LEMMA: "because"},
|
||||
{ORTH: "cause", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
|
||||
{ORTH: "ll", LEMMA: "will"},
|
||||
{ORTH: "nuff", LEMMA: "enough"}]:
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will"},
|
||||
{ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]:
|
||||
exc_data_apos = dict(exc_data)
|
||||
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
|
||||
for data in [exc_data, exc_data_apos]:
|
||||
|
@ -282,11 +282,11 @@ for h in range(1, 12 + 1):
|
|||
for period in ["a.m.", "am"]:
|
||||
_exc["%d%s" % (h, period)] = [
|
||||
{ORTH: "%d" % h},
|
||||
{ORTH: period, LEMMA: "a.m."}]
|
||||
{ORTH: period, LEMMA: "a.m.", NORM: "a.m."}]
|
||||
for period in ["p.m.", "pm"]:
|
||||
_exc["%d%s" % (h, period)] = [
|
||||
{ORTH: "%d" % h},
|
||||
{ORTH: period, LEMMA: "p.m."}]
|
||||
{ORTH: period, LEMMA: "p.m.", NORM: "p.m."}]
|
||||
|
||||
|
||||
# Rest
|
||||
|
@ -306,56 +306,56 @@ _other_exc = {
|
|||
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
|
||||
|
||||
"How'd'y": [
|
||||
{ORTH: "How", LEMMA: "how"},
|
||||
{ORTH: "How", LEMMA: "how", NORM: "how"},
|
||||
{ORTH: "'d", LEMMA: "do"},
|
||||
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
|
||||
|
||||
"not've": [
|
||||
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
||||
|
||||
"notve": [
|
||||
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
||||
|
||||
"Not've": [
|
||||
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
|
||||
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
||||
|
||||
"Notve": [
|
||||
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
|
||||
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
||||
|
||||
"cannot": [
|
||||
{ORTH: "can", LEMMA: "can", TAG: "MD"},
|
||||
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
|
||||
|
||||
"Cannot": [
|
||||
{ORTH: "Can", LEMMA: "can", TAG: "MD"},
|
||||
{ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
|
||||
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
|
||||
|
||||
"gonna": [
|
||||
{ORTH: "gon", LEMMA: "go", NORM: "going"},
|
||||
{ORTH: "na", LEMMA: "to"}],
|
||||
{ORTH: "na", LEMMA: "to", NORM: "to"}],
|
||||
|
||||
"Gonna": [
|
||||
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
|
||||
{ORTH: "na", LEMMA: "to"}],
|
||||
{ORTH: "na", LEMMA: "to", NORM: "to"}],
|
||||
|
||||
"gotta": [
|
||||
{ORTH: "got"},
|
||||
{ORTH: "ta", LEMMA: "to"}],
|
||||
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
|
||||
|
||||
"Gotta": [
|
||||
{ORTH: "Got"},
|
||||
{ORTH: "ta", LEMMA: "to"}],
|
||||
{ORTH: "Got", NORM: "got"},
|
||||
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
|
||||
|
||||
"let's": [
|
||||
{ORTH: "let"},
|
||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
|
||||
|
||||
"Let's": [
|
||||
{ORTH: "Let", LEMMA: "let"},
|
||||
{ORTH: "Let", LEMMA: "let", NORM: "let"},
|
||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
|
||||
}
|
||||
|
||||
|
@ -363,72 +363,80 @@ _exc.update(_other_exc)
|
|||
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: "'S", LEMMA: "'s"},
|
||||
{ORTH: "'s", LEMMA: "'s"},
|
||||
{ORTH: "\u2018S", LEMMA: "'s"},
|
||||
{ORTH: "\u2018s", LEMMA: "'s"},
|
||||
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
|
||||
{ORTH: "'S", LEMMA: "'s", NORM: "'s"},
|
||||
{ORTH: "'s", LEMMA: "'s", NORM: "'s"},
|
||||
{ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"},
|
||||
{ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"},
|
||||
{ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"},
|
||||
{ORTH: "w/o", LEMMA: "without", NORM: "without"},
|
||||
{ORTH: "'re", LEMMA: "be", NORM: "are"},
|
||||
{ORTH: "'Cause", LEMMA: "because"},
|
||||
{ORTH: "'cause", LEMMA: "because"},
|
||||
{ORTH: "ma'am", LEMMA: "madam"},
|
||||
{ORTH: "Ma'am", LEMMA: "madam"},
|
||||
{ORTH: "o'clock", LEMMA: "o'clock"},
|
||||
{ORTH: "O'clock", LEMMA: "o'clock"},
|
||||
{ORTH: "'Cause", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'cause", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'cos", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'Cos", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'coz", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'Coz", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'cuz", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'Cuz", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'bout", LEMMA: "about", NORM: "about"},
|
||||
{ORTH: "ma'am", LEMMA: "madam", NORM: "madam"},
|
||||
{ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"},
|
||||
{ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"},
|
||||
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
|
||||
|
||||
{ORTH: "Mt.", LEMMA: "Mount"},
|
||||
{ORTH: "Ak.", LEMMA: "Alaska"},
|
||||
{ORTH: "Ala.", LEMMA: "Alabama"},
|
||||
{ORTH: "Apr.", LEMMA: "April"},
|
||||
{ORTH: "Ariz.", LEMMA: "Arizona"},
|
||||
{ORTH: "Ark.", LEMMA: "Arkansas"},
|
||||
{ORTH: "Aug.", LEMMA: "August"},
|
||||
{ORTH: "Calif.", LEMMA: "California"},
|
||||
{ORTH: "Colo.", LEMMA: "Colorado"},
|
||||
{ORTH: "Conn.", LEMMA: "Connecticut"},
|
||||
{ORTH: "Dec.", LEMMA: "December"},
|
||||
{ORTH: "Del.", LEMMA: "Delaware"},
|
||||
{ORTH: "Feb.", LEMMA: "February"},
|
||||
{ORTH: "Fla.", LEMMA: "Florida"},
|
||||
{ORTH: "Ga.", LEMMA: "Georgia"},
|
||||
{ORTH: "Ia.", LEMMA: "Iowa"},
|
||||
{ORTH: "Id.", LEMMA: "Idaho"},
|
||||
{ORTH: "Ill.", LEMMA: "Illinois"},
|
||||
{ORTH: "Ind.", LEMMA: "Indiana"},
|
||||
{ORTH: "Jan.", LEMMA: "January"},
|
||||
{ORTH: "Jul.", LEMMA: "July"},
|
||||
{ORTH: "Jun.", LEMMA: "June"},
|
||||
{ORTH: "Kan.", LEMMA: "Kansas"},
|
||||
{ORTH: "Kans.", LEMMA: "Kansas"},
|
||||
{ORTH: "Ky.", LEMMA: "Kentucky"},
|
||||
{ORTH: "La.", LEMMA: "Louisiana"},
|
||||
{ORTH: "Mar.", LEMMA: "March"},
|
||||
{ORTH: "Mass.", LEMMA: "Massachusetts"},
|
||||
{ORTH: "May.", LEMMA: "May"},
|
||||
{ORTH: "Mich.", LEMMA: "Michigan"},
|
||||
{ORTH: "Minn.", LEMMA: "Minnesota"},
|
||||
{ORTH: "Miss.", LEMMA: "Mississippi"},
|
||||
{ORTH: "N.C.", LEMMA: "North Carolina"},
|
||||
{ORTH: "N.D.", LEMMA: "North Dakota"},
|
||||
{ORTH: "N.H.", LEMMA: "New Hampshire"},
|
||||
{ORTH: "N.J.", LEMMA: "New Jersey"},
|
||||
{ORTH: "N.M.", LEMMA: "New Mexico"},
|
||||
{ORTH: "N.Y.", LEMMA: "New York"},
|
||||
{ORTH: "Neb.", LEMMA: "Nebraska"},
|
||||
{ORTH: "Nebr.", LEMMA: "Nebraska"},
|
||||
{ORTH: "Nev.", LEMMA: "Nevada"},
|
||||
{ORTH: "Nov.", LEMMA: "November"},
|
||||
{ORTH: "Oct.", LEMMA: "October"},
|
||||
{ORTH: "Okla.", LEMMA: "Oklahoma"},
|
||||
{ORTH: "Ore.", LEMMA: "Oregon"},
|
||||
{ORTH: "Pa.", LEMMA: "Pennsylvania"},
|
||||
{ORTH: "S.C.", LEMMA: "South Carolina"},
|
||||
{ORTH: "Sep.", LEMMA: "September"},
|
||||
{ORTH: "Sept.", LEMMA: "September"},
|
||||
{ORTH: "Tenn.", LEMMA: "Tennessee"},
|
||||
{ORTH: "Va.", LEMMA: "Virginia"},
|
||||
{ORTH: "Wash.", LEMMA: "Washington"},
|
||||
{ORTH: "Wis.", LEMMA: "Wisconsin"}]:
|
||||
{ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
|
||||
{ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
|
||||
{ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
|
||||
{ORTH: "Apr.", LEMMA: "April", NORM: "April"},
|
||||
{ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"},
|
||||
{ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"},
|
||||
{ORTH: "Aug.", LEMMA: "August", NORM: "August"},
|
||||
{ORTH: "Calif.", LEMMA: "California", NORM: "California"},
|
||||
{ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"},
|
||||
{ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"},
|
||||
{ORTH: "Dec.", LEMMA: "December", NORM: "December"},
|
||||
{ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"},
|
||||
{ORTH: "Feb.", LEMMA: "February", NORM: "February"},
|
||||
{ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"},
|
||||
{ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"},
|
||||
{ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"},
|
||||
{ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"},
|
||||
{ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"},
|
||||
{ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"},
|
||||
{ORTH: "Jan.", LEMMA: "January", NORM: "January"},
|
||||
{ORTH: "Jul.", LEMMA: "July", NORM: "July"},
|
||||
{ORTH: "Jun.", LEMMA: "June", NORM: "June"},
|
||||
{ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"},
|
||||
{ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"},
|
||||
{ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"},
|
||||
{ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"},
|
||||
{ORTH: "Mar.", LEMMA: "March", NORM: "March"},
|
||||
{ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"},
|
||||
{ORTH: "May.", LEMMA: "May", NORM: "May"},
|
||||
{ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"},
|
||||
{ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"},
|
||||
{ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"},
|
||||
{ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"},
|
||||
{ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"},
|
||||
{ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"},
|
||||
{ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"},
|
||||
{ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"},
|
||||
{ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"},
|
||||
{ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"},
|
||||
{ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"},
|
||||
{ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"},
|
||||
{ORTH: "Nov.", LEMMA: "November", NORM: "November"},
|
||||
{ORTH: "Oct.", LEMMA: "October", NORM: "October"},
|
||||
{ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"},
|
||||
{ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"},
|
||||
{ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"},
|
||||
{ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"},
|
||||
{ORTH: "Sep.", LEMMA: "September", NORM: "September"},
|
||||
{ORTH: "Sept.", LEMMA: "September", NORM: "September"},
|
||||
{ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
|
||||
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
|
||||
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
|
||||
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
|
||||
_exc[exc_data[ORTH]] = [dict(exc_data)]
|
||||
|
||||
|
||||
|
|
46
spacy/lang/norm_exceptions.py
Normal file
46
spacy/lang/norm_exceptions.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# These exceptions are used to add NORM values based on a token's ORTH value.
|
||||
# Individual languages can also add their own exceptions and overwrite them -
|
||||
# for example, British vs. American spelling in English.
|
||||
|
||||
# Norms are only set if no alternative is provided in the tokenizer exceptions.
|
||||
# Note that this does not change any other token attributes. Its main purpose
|
||||
# is to normalise the word representations so that equivalent tokens receive
|
||||
# similar representations. For example: $ and € are very different, but they're
|
||||
# both currency symbols. By normalising currency symbols to $, all symbols are
|
||||
# seen as similar, no matter how common they are in the training data.
|
||||
|
||||
|
||||
BASE_NORMS = {
|
||||
"'s": "'s",
|
||||
"'S": "'s",
|
||||
"’s": "'s",
|
||||
"’S": "'s",
|
||||
"’": "'",
|
||||
"‘": "'",
|
||||
"´": "'",
|
||||
"`": "'",
|
||||
"”": '"',
|
||||
"“": '"',
|
||||
"''": '"',
|
||||
"``": '"',
|
||||
"´´": '"',
|
||||
"„": '"',
|
||||
"»": '"',
|
||||
"«": '"',
|
||||
"…": "...",
|
||||
"—": "-",
|
||||
"–": "-",
|
||||
"--": "-",
|
||||
"---": "-",
|
||||
"€": "$",
|
||||
"£": "$",
|
||||
"¥": "$",
|
||||
"฿": "$",
|
||||
"US$": "$",
|
||||
"C$": "$",
|
||||
"A$": "$"
|
||||
}
|
|
@ -301,7 +301,7 @@ class Language(object):
|
|||
def evaluate(self, docs_golds):
|
||||
docs, golds = zip(*docs_golds)
|
||||
scorer = Scorer()
|
||||
for doc, gold in zip(self.pipe(docs), golds):
|
||||
for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
|
||||
scorer.score(doc, gold)
|
||||
doc.tensor = None
|
||||
return scorer
|
||||
|
|
|
@ -38,7 +38,7 @@ cdef class Morphology:
|
|||
self.strings = string_store
|
||||
self.tag_map = {}
|
||||
self.lemmatizer = lemmatizer
|
||||
self.n_tags = len(tag_map) + 1
|
||||
self.n_tags = len(tag_map)
|
||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||
self.reverse_index = {}
|
||||
|
||||
|
|
|
@ -8,20 +8,33 @@ import pytest
|
|||
|
||||
|
||||
@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
|
||||
def test_tokenizer_splits_contractions(de_tokenizer, text):
|
||||
def test_de_tokenizer_splits_contractions(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
|
||||
def test_tokenizer_handles_abbr(de_tokenizer, text):
|
||||
def test_de_tokenizer_handles_abbr(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
def test_tokenizer_handles_exc_in_text(de_tokenizer):
|
||||
def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
|
||||
text = "Ich bin z.Zt. im Urlaub."
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 6
|
||||
assert tokens[2].text == "z.Zt."
|
||||
assert tokens[2].lemma_ == "zur Zeit"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])])
|
||||
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
|
||||
tokens = de_tokenizer(text)
|
||||
assert [token.norm_ for token in tokens] == norms
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text,norm', [("daß", "dass")])
|
||||
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
|
||||
tokens = de_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
|
@ -102,3 +102,16 @@ def test_en_tokenizer_handles_times(en_tokenizer, text):
|
|||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[1].lemma_ in ["a.m.", "p.m."]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norms', [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])])
|
||||
def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
|
||||
tokens = en_tokenizer(text)
|
||||
assert [token.norm_ for token in tokens] == norms
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")])
|
||||
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
33
spacy/tests/serialize/test_serialize_tokenizer.py
Normal file
33
spacy/tests/serialize/test_serialize_tokenizer.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...util import get_lang_class
|
||||
from ..util import make_tempdir, assert_packed_msg_equal
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def load_tokenizer(b):
|
||||
tok = get_lang_class('en').Defaults.create_tokenizer()
|
||||
tok.from_bytes(b)
|
||||
return tok
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["I💜you", "they’re", "“hello”"])
|
||||
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
|
||||
tokenizer = en_tokenizer
|
||||
new_tokenizer = load_tokenizer(tokenizer.to_bytes())
|
||||
assert_packed_msg_equal(new_tokenizer.to_bytes(), tokenizer.to_bytes())
|
||||
# assert new_tokenizer.to_bytes() == tokenizer.to_bytes()
|
||||
doc1 = tokenizer(text)
|
||||
doc2 = new_tokenizer(text)
|
||||
assert [token.text for token in doc1] == [token.text for token in doc2]
|
||||
|
||||
|
||||
def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
|
||||
tokenizer = en_tokenizer
|
||||
with make_tempdir() as d:
|
||||
file_path = d / 'tokenizer'
|
||||
tokenizer.to_disk(file_path)
|
||||
tokenizer_d = en_tokenizer.from_disk(file_path)
|
||||
assert tokenizer.to_bytes() == tokenizer_d.to_bytes()
|
|
@ -10,6 +10,7 @@ import numpy
|
|||
import tempfile
|
||||
import shutil
|
||||
import contextlib
|
||||
import msgpack
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
|
@ -105,3 +106,13 @@ def assert_docs_equal(doc1, doc2):
|
|||
assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ]
|
||||
assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ]
|
||||
assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ]
|
||||
|
||||
|
||||
def assert_packed_msg_equal(b1, b2):
|
||||
"""Assert that two packed msgpack messages are equal."""
|
||||
msg1 = msgpack.loads(b1, encoding='utf8')
|
||||
msg2 = msgpack.loads(b2, encoding='utf8')
|
||||
assert sorted(msg1.keys()) == sorted(msg2.keys())
|
||||
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
|
||||
assert k1 == k2
|
||||
assert v1 == v2
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from collections import OrderedDict
|
||||
from cython.operator cimport dereference as deref
|
||||
from cython.operator cimport preincrement as preinc
|
||||
from cymem.cymem cimport Pool
|
||||
|
@ -355,14 +356,14 @@ cdef class Tokenizer:
|
|||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `Tokenizer` object.
|
||||
"""
|
||||
serializers = {
|
||||
'vocab': lambda: self.vocab.to_bytes(),
|
||||
'prefix_search': lambda: self.prefix_search.__self__.pattern,
|
||||
'suffix_search': lambda: self.suffix_search.__self__.pattern,
|
||||
'infix_finditer': lambda: self.infix_finditer.__self__.pattern,
|
||||
'token_match': lambda: self.token_match.__self__.pattern,
|
||||
'exceptions': lambda: self._rules
|
||||
}
|
||||
serializers = OrderedDict((
|
||||
('vocab', lambda: self.vocab.to_bytes()),
|
||||
('prefix_search', lambda: self.prefix_search.__self__.pattern),
|
||||
('suffix_search', lambda: self.suffix_search.__self__.pattern),
|
||||
('infix_finditer', lambda: self.infix_finditer.__self__.pattern),
|
||||
('token_match', lambda: self.token_match.__self__.pattern),
|
||||
('exceptions', lambda: OrderedDict(sorted(self._rules.items())))
|
||||
))
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
|
@ -372,15 +373,15 @@ cdef class Tokenizer:
|
|||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (Tokenizer): The `Tokenizer` object.
|
||||
"""
|
||||
data = {}
|
||||
deserializers = {
|
||||
'vocab': lambda b: self.vocab.from_bytes(b),
|
||||
'prefix_search': lambda b: data.setdefault('prefix', b),
|
||||
'suffix_search': lambda b: data.setdefault('suffix_search', b),
|
||||
'infix_finditer': lambda b: data.setdefault('infix_finditer', b),
|
||||
'token_match': lambda b: data.setdefault('token_match', b),
|
||||
'exceptions': lambda b: data.setdefault('rules', b)
|
||||
}
|
||||
data = OrderedDict()
|
||||
deserializers = OrderedDict((
|
||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||
('prefix_search', lambda b: data.setdefault('prefix', b)),
|
||||
('suffix_search', lambda b: data.setdefault('suffix_search', b)),
|
||||
('infix_finditer', lambda b: data.setdefault('infix_finditer', b)),
|
||||
('token_match', lambda b: data.setdefault('token_match', b)),
|
||||
('exceptions', lambda b: data.setdefault('rules', b))
|
||||
))
|
||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
if 'prefix_search' in data:
|
||||
self.prefix_search = re.compile(data['prefix_search']).search
|
||||
|
@ -392,3 +393,4 @@ cdef class Tokenizer:
|
|||
self.token_match = re.compile(data['token_match']).search
|
||||
for string, substrings in data.get('rules', {}).items():
|
||||
self.add_special_case(string, substrings)
|
||||
return self
|
||||
|
|
|
@ -437,7 +437,8 @@ cdef class Doc:
|
|||
"""
|
||||
def __get__(self):
|
||||
if 'sents' in self.user_hooks:
|
||||
return self.user_hooks['sents'](self)
|
||||
yield from self.user_hooks['sents'](self)
|
||||
return
|
||||
|
||||
if not self.is_parsed:
|
||||
raise ValueError(
|
||||
|
@ -740,7 +741,7 @@ cdef class Doc:
|
|||
token.spacy = self.c[end-1].spacy
|
||||
for attr_name, attr_value in attributes.items():
|
||||
if attr_name == TAG:
|
||||
self.vocab.morphology.assign_tag(token, attr_value)
|
||||
self.vocab.morphology.assign_tag(token, attr_value)
|
||||
else:
|
||||
Token.set_struct_attr(token, attr_name, attr_value)
|
||||
# Begin by setting all the head indices to absolute token positions
|
||||
|
|
|
@ -299,6 +299,22 @@ def compile_infix_regex(entries):
|
|||
return re.compile(expression)
|
||||
|
||||
|
||||
def add_lookups(default_func, *lookups):
|
||||
"""Extend an attribute function with special cases. If a word is in the
|
||||
lookups, the value is returned. Otherwise the previous function is used.
|
||||
|
||||
default_func (callable): The default function to execute.
|
||||
*lookups (dict): Lookup dictionary mapping string to attribute value.
|
||||
RETURNS (callable): Lexical attribute getter.
|
||||
"""
|
||||
def get_attr(string):
|
||||
for lookup in lookups:
|
||||
if string in lookup:
|
||||
return lookup[string]
|
||||
return default_func(string)
|
||||
return get_attr
|
||||
|
||||
|
||||
def update_exc(base_exceptions, *addition_dicts):
|
||||
"""Update and validate tokenizer exceptions. Will overwrite exceptions.
|
||||
|
||||
|
|
|
@ -231,11 +231,13 @@ cdef class Vocab:
|
|||
props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
|
||||
token = &tokens[i]
|
||||
# Set the special tokens up to have arbitrary attributes
|
||||
token.lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
|
||||
lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
|
||||
token.lex = lex
|
||||
if attrs.TAG in props:
|
||||
self.morphology.assign_tag(token, props[attrs.TAG])
|
||||
for attr_id, value in props.items():
|
||||
Token.set_struct_attr(token, attr_id, value)
|
||||
Lexeme.set_struct_attr(lex, attr_id, value)
|
||||
return tokens
|
||||
|
||||
@property
|
||||
|
|
|
@ -205,7 +205,7 @@ p
|
|||
+cell #[code arrow_spacing]
|
||||
+cell int
|
||||
+cell Spacing between arrows in px to avoid overlaps.
|
||||
+cell #[code 20]
|
||||
+cell #[code 20] / #[code 12] (compact)
|
||||
|
||||
+row
|
||||
+cell #[code word_spacing]
|
||||
|
|
|
@ -64,7 +64,7 @@ p
|
|||
doc = nlp(u'Give it back! He pleaded.')
|
||||
assert doc[0].text == 'Give'
|
||||
assert doc[-1].text == '.'
|
||||
span = doc[1:1]
|
||||
span = doc[1:3]
|
||||
assert span.text == 'it back'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
|
|
@ -141,7 +141,7 @@ p
|
|||
else:
|
||||
tokens.append(substring)
|
||||
substring = ''
|
||||
tokens.extend(suffixes)
|
||||
tokens.extend(reversed(suffixes))
|
||||
return tokens
|
||||
|
||||
p
|
||||
|
|
|
@ -59,9 +59,11 @@ p
|
|||
| to customise the layout, for example:
|
||||
|
||||
+aside("Important note")
|
||||
| There's currently a known issue with the #[code compact] mode for long
|
||||
| sentences with arrow spacing. If the spacing is larger than the arc
|
||||
| itself, it'll cause the arc and its label to flip.
|
||||
| There's currently a known issue with the #[code compact] mode for
|
||||
| sentences with short arrows and long dependency labels, that causes labels
|
||||
| longer than the arrow to wrap. So if you come across this problem,
|
||||
| especially when using custom labels, you'll have to increase the
|
||||
| #[code distance] setting in the #[code options] to allow longer arcs.
|
||||
|
||||
+table(["Name", "Type", "Description", "Default"])
|
||||
+row
|
||||
|
|
Loading…
Reference in New Issue
Block a user