Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-06-03 14:33:51 -05:00
commit 468ca6c760
28 changed files with 2275 additions and 324 deletions

View File

@@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.7.1,<6.8.0
+thinc>=6.7.2,<6.8.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six

View File

@@ -191,7 +191,7 @@ def setup_package():
     'murmurhash>=0.28,<0.29',
     'cymem>=1.30,<1.32',
     'preshed>=1.0.0,<2.0.0',
-    'thinc>=6.7.1,<6.8.0',
+    'thinc>=6.7.2,<6.8.0',
     'plac<1.0.0,>=0.9.6',
     'pip>=9.0.0,<10.0.0',
     'six',

View File

@@ -28,15 +28,17 @@ from .. import displacy
     n_iter=("number of iterations", "option", "n", int),
     n_sents=("number of sentences", "option", "ns", int),
     use_gpu=("Use GPU", "flag", "G", bool),
+    resume=("Whether to resume training", "flag", "R", bool),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool)
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
+          use_gpu=False, resume=False, no_tagger=False, no_parser=False, no_entities=False):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
+    util.set_env_log(True)
     n_sents = n_sents or None
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
@@ -66,7 +68,11 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                                    util.env_opt('batch_to', 64),
                                    util.env_opt('batch_compound', 1.001))
-    nlp = lang_class(pipeline=pipeline)
+    if resume:
+        prints(output_path / 'model19.pickle', title="Resuming training")
+        nlp = dill.load((output_path / 'model19.pickle').open('rb'))
+    else:
+        nlp = lang_class(pipeline=pipeline)
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
     n_train_docs = corpus.count_train()
@@ -75,6 +81,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
     try:
         for i in range(n_iter):
+            if resume:
+                i += 20
             with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
                 train_docs = corpus.train_docs(nlp, projectivize=True,
                                                gold_preproc=False, max_length=0)
@@ -86,14 +94,18 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                     pbar.update(len(docs))
             with nlp.use_params(optimizer.averages):
+                util.set_env_log(False)
+                epoch_model_path = output_path / ('model%d' % i)
+                nlp.to_disk(epoch_model_path)
                 with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
                     dill.dump(nlp, file_, -1)
-                with (output_path / ('model%d.bin' % i)).open('wb') as file_:
-                    file_.write(nlp.to_bytes())
-                with (output_path / ('model%d.bin' % i)).open('rb') as file_:
-                    nlp_loaded = lang_class(pipeline=pipeline)
-                    nlp_loaded.from_bytes(file_.read())
-                scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False))
+                nlp_loaded = lang_class(pipeline=pipeline)
+                nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
+                scorer = nlp_loaded.evaluate(
+                    corpus.dev_docs(
+                        nlp_loaded,
+                        gold_preproc=False))
+                util.set_env_log(True)
                 print_progress(i, losses, scorer.scores)
     finally:
         print("Saving model...")
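For context, a minimal sketch of reloading one of the per-epoch directories the updated loop writes out. The pipeline names, output path and epoch number are illustrative assumptions, not values taken from this diff:

from spacy.util import get_lang_class

lang_class = get_lang_class('en')
nlp_loaded = lang_class(pipeline=['tagger', 'parser', 'ner'])   # hypothetical pipeline
nlp_loaded = nlp_loaded.from_disk('/output/model3')             # hypothetical epoch directory
# scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False))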

View File

@@ -56,7 +56,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
     render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
     httpd = simple_server.make_server('0.0.0.0', port, app)
     prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
-    httpd.serve_forever()
+    try:
+        httpd.serve_forever()
+    except KeyboardInterrupt:
+        prints("Shutting down server on port %d." % port)
+    finally:
+        httpd.server_close()

 def app(environ, start_response):
@@ -65,12 +70,13 @@ def app(environ, start_response):
     return [res]

-def parse_deps(doc, options={}):
+def parse_deps(orig_doc, options={}):
     """Generate dependency parse in {'words': [], 'arcs': []} format.
     doc (Doc): Document do parse.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
+    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
     if options.get('collapse_punct', True):
         spans = []
         for word in doc[:-1]:
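Since parse_deps now operates on a byte-copy of the document, rendering no longer mutates the caller's Doc (for example by collapsing punctuation). A rough sketch, assuming an installed 'en' model:

import spacy
from spacy import displacy

nlp = spacy.load('en')                    # assumes the 'en' model is installed
doc = nlp(u'This is a sentence.')
n_tokens = len(doc)
html = displacy.render(doc, style='dep', page=True)
assert len(doc) == n_tokens               # the original doc is left untouched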

View File

@@ -18,12 +18,11 @@ class DependencyRenderer(object):
             offset_x, color, bg, font)
         """
         self.compact = options.get('compact', False)
-        distance, arrow_width = (85, 8) if self.compact else (175, 10)
         self.word_spacing = options.get('word_spacing', 45)
-        self.arrow_spacing = options.get('arrow_spacing', 20)
-        self.arrow_width = options.get('arrow_width', arrow_width)
+        self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
+        self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
         self.arrow_stroke = options.get('arrow_stroke', 2)
-        self.distance = options.get('distance', distance)
+        self.distance = options.get('distance', 150 if self.compact else 175)
         self.offset_x = options.get('offset_x', 50)
         self.color = options.get('color', '#000000')
         self.bg = options.get('bg', '#ffffff')
@@ -99,6 +98,8 @@ class DependencyRenderer(object):
         x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
                  -self.arrow_spacing*(self.highest_level-level)/4)
         y_curve = self.offset_y-level*self.distance/2
+        if self.compact:
+            y_curve = self.offset_y-level*self.distance/6
         if y_curve == 0 and len(self.levels) > 5:
             y_curve = -self.distance
         arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
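Compact mode now implies tighter spacing defaults (arrow_spacing 12, arrow_width 6, distance 150) and flatter arcs, but every value can still be overridden through the options dict. A small sketch using manual mode so no model is required; the words/arcs payload is made up:

from spacy import displacy

parsed = {'words': [{'text': 'This', 'tag': 'DT'},
                    {'text': 'works', 'tag': 'VBZ'}],
          'arcs': [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'}]}
html = displacy.render([parsed], style='dep', manual=True,
                       options={'compact': True, 'distance': 120})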

View File

@@ -21,7 +21,7 @@ TPL_DEP_WORDS = """
 TPL_DEP_ARCS = """
 <g class="displacy-arrow">
     <path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
-    <text dy="1.25em" style="font-size: 0.8em">
+    <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
         <textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
     </text>
     <path class="displacy-arrowhead" d="{head}" fill="currentColor"/>

View File

@@ -212,7 +212,7 @@ class GoldCorpus(object):
     def dev_docs(self, nlp, gold_preproc=False):
         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
-        gold_docs = nlp.preprocess_gold(gold_docs)
+        #gold_docs = nlp.preprocess_gold(gold_docs)
         yield from gold_docs

     @classmethod
@@ -227,7 +227,7 @@ class GoldCorpus(object):
                                                 gold_preproc)
             golds = cls._make_golds(docs, paragraph_tuples)
             for doc, gold in zip(docs, golds):
-                if not max_length or len(doc) < max_length:
+                if (not max_length) or len(doc) < max_length:
                     yield doc, gold

     @classmethod
@@ -235,17 +235,17 @@ class GoldCorpus(object):
         if raw_text is not None:
             return [nlp.make_doc(raw_text)]
         else:
-            return [Doc(nlp.vocab, words=sent_tuples[0][1])
-                    for sent_tuples in paragraph_tuples]
+            return [Doc(nlp.vocab, words=sent_tuples[1])
+                    for (sent_tuples, brackets) in paragraph_tuples]

     @classmethod
     def _make_golds(cls, docs, paragraph_tuples):
+        assert len(docs) == len(paragraph_tuples)
         if len(docs) == 1:
-            return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0])
-                    for sent_tuples in paragraph_tuples]
+            return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
         else:
-            return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
-                    for doc, sent_tuples in zip(docs, paragraph_tuples)]
+            return [GoldParse.from_annot_tuples(doc, sent_tuples)
+                    for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]

     @staticmethod
     def walk_corpus(path):

View File

@@ -2,21 +2,25 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lemmatizerlookup import Lemmatizer
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups

 class GermanDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'de'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         BASE_NORMS, NORM_EXCEPTIONS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = dict(TAG_MAP)

View File

@@ -0,0 +1,17 @@
# coding: utf8
from __future__ import unicode_literals

# Here we only want to include the absolute most common words. Otherwise,
# this list would get impossibly long for German especially considering the
# old vs. new spelling rules, and all possible cases.

_exc = {
    "daß": "dass"
}

NORM_EXCEPTIONS = {}

for string, norm in _exc.items():
    NORM_EXCEPTIONS[string.title()] = norm

View File

@ -8,7 +8,7 @@ from ...deprecated import PRON_LEMMA
_exc = { _exc = {
"auf'm": [ "auf'm": [
{ORTH: "auf", LEMMA: "auf"}, {ORTH: "auf", LEMMA: "auf"},
{ORTH: "'m", LEMMA: "der", NORM: "dem" }], {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
"du's": [ "du's": [
{ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
@ -53,97 +53,97 @@ _exc = {
for exc_data in [ for exc_data in [
{ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "'n", LEMMA: "ein", NORM: "ein"}, {ORTH: "'n", LEMMA: "ein", NORM: "ein"},
{ORTH: "'ne", LEMMA: "eine", NORM: "eine"}, {ORTH: "'ne", LEMMA: "eine", NORM: "eine"},
{ORTH: "'nen", LEMMA: "ein", NORM: "einen"}, {ORTH: "'nen", LEMMA: "ein", NORM: "einen"},
{ORTH: "'nem", LEMMA: "ein", NORM: "einem"}, {ORTH: "'nem", LEMMA: "ein", NORM: "einem"},
{ORTH: "Abb.", LEMMA: "Abbildung"}, {ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"},
{ORTH: "Abk.", LEMMA: "Abkürzung"}, {ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"},
{ORTH: "Abt.", LEMMA: "Abteilung"}, {ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"},
{ORTH: "Apr.", LEMMA: "April"}, {ORTH: "Apr.", LEMMA: "April", NORM: "April"},
{ORTH: "Aug.", LEMMA: "August"}, {ORTH: "Aug.", LEMMA: "August", NORM: "August"},
{ORTH: "Bd.", LEMMA: "Band"}, {ORTH: "Bd.", LEMMA: "Band", NORM: "Band"},
{ORTH: "Betr.", LEMMA: "Betreff"}, {ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"},
{ORTH: "Bf.", LEMMA: "Bahnhof"}, {ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
{ORTH: "Bhf.", LEMMA: "Bahnhof"}, {ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
{ORTH: "Bsp.", LEMMA: "Beispiel"}, {ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"},
{ORTH: "Dez.", LEMMA: "Dezember"}, {ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"},
{ORTH: "Di.", LEMMA: "Dienstag"}, {ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"},
{ORTH: "Do.", LEMMA: "Donnerstag"}, {ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"},
{ORTH: "Fa.", LEMMA: "Firma"}, {ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"},
{ORTH: "Fam.", LEMMA: "Familie"}, {ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"},
{ORTH: "Feb.", LEMMA: "Februar"}, {ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"},
{ORTH: "Fr.", LEMMA: "Frau"}, {ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"},
{ORTH: "Frl.", LEMMA: "Fräulein"}, {ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"},
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof"}, {ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"},
{ORTH: "Hr.", LEMMA: "Herr"}, {ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"},
{ORTH: "Hrn.", LEMMA: "Herr"}, {ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"},
{ORTH: "Jan.", LEMMA: "Januar"}, {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
{ORTH: "Jh.", LEMMA: "Jahrhundert"}, {ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
{ORTH: "Jhd.", LEMMA: "Jahrhundert"}, {ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
{ORTH: "Jul.", LEMMA: "Juli"}, {ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"},
{ORTH: "Jun.", LEMMA: "Juni"}, {ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"},
{ORTH: "Mi.", LEMMA: "Mittwoch"}, {ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"},
{ORTH: "Mio.", LEMMA: "Million"}, {ORTH: "Mio.", LEMMA: "Million", NORM: "Million"},
{ORTH: "Mo.", LEMMA: "Montag"}, {ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"},
{ORTH: "Mrd.", LEMMA: "Milliarde"}, {ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"},
{ORTH: "Mrz.", LEMMA: "März"}, {ORTH: "Mrz.", LEMMA: "März", NORM: "März"},
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"}, {ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"},
{ORTH: "Mär.", LEMMA: "März"}, {ORTH: "Mär.", LEMMA: "März", NORM: "März"},
{ORTH: "Nov.", LEMMA: "November"}, {ORTH: "Nov.", LEMMA: "November", NORM: "November"},
{ORTH: "Nr.", LEMMA: "Nummer"}, {ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"},
{ORTH: "Okt.", LEMMA: "Oktober"}, {ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"},
{ORTH: "Orig.", LEMMA: "Original"}, {ORTH: "Orig.", LEMMA: "Original", NORM: "Original"},
{ORTH: "Pkt.", LEMMA: "Punkt"}, {ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"},
{ORTH: "Prof.", LEMMA: "Professor"}, {ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"},
{ORTH: "Red.", LEMMA: "Redaktion"}, {ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"},
{ORTH: "Sa.", LEMMA: "Samstag"}, {ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"},
{ORTH: "Sep.", LEMMA: "September"}, {ORTH: "Sep.", LEMMA: "September", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September"}, {ORTH: "Sept.", LEMMA: "September", NORM: "September"},
{ORTH: "So.", LEMMA: "Sonntag"}, {ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"},
{ORTH: "Std.", LEMMA: "Stunde"}, {ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"},
{ORTH: "Str.", LEMMA: "Straße"}, {ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"},
{ORTH: "Tel.", LEMMA: "Telefon"}, {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
{ORTH: "Tsd.", LEMMA: "Tausend"}, {ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"},
{ORTH: "Univ.", LEMMA: "Universität"}, {ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"},
{ORTH: "abzgl.", LEMMA: "abzüglich"}, {ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"},
{ORTH: "allg.", LEMMA: "allgemein"}, {ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"},
{ORTH: "bspw.", LEMMA: "beispielsweise"}, {ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"},
{ORTH: "bzgl.", LEMMA: "bezüglich"}, {ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"},
{ORTH: "bzw.", LEMMA: "beziehungsweise"}, {ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"},
{ORTH: "d.h.", LEMMA: "das heißt"}, {ORTH: "d.h.", LEMMA: "das heißt"},
{ORTH: "dgl.", LEMMA: "dergleichen"}, {ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"},
{ORTH: "ebd.", LEMMA: "ebenda"}, {ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"},
{ORTH: "eigtl.", LEMMA: "eigentlich"}, {ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"},
{ORTH: "engl.", LEMMA: "englisch"}, {ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"},
{ORTH: "evtl.", LEMMA: "eventuell"}, {ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"},
{ORTH: "frz.", LEMMA: "französisch"}, {ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"},
{ORTH: "gegr.", LEMMA: "gegründet"}, {ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"},
{ORTH: "ggf.", LEMMA: "gegebenenfalls"}, {ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
{ORTH: "ggfs.", LEMMA: "gegebenenfalls"}, {ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
{ORTH: "ggü.", LEMMA: "gegenüber"}, {ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"},
{ORTH: "i.O.", LEMMA: "in Ordnung"}, {ORTH: "i.O.", LEMMA: "in Ordnung"},
{ORTH: "i.d.R.", LEMMA: "in der Regel"}, {ORTH: "i.d.R.", LEMMA: "in der Regel"},
{ORTH: "incl.", LEMMA: "inklusive"}, {ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"},
{ORTH: "inkl.", LEMMA: "inklusive"}, {ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"},
{ORTH: "insb.", LEMMA: "insbesondere"}, {ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"},
{ORTH: "kath.", LEMMA: "katholisch"}, {ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"},
{ORTH: "lt.", LEMMA: "laut"}, {ORTH: "lt.", LEMMA: "laut", NORM: "laut"},
{ORTH: "max.", LEMMA: "maximal"}, {ORTH: "max.", LEMMA: "maximal", NORM: "maximal"},
{ORTH: "min.", LEMMA: "minimal"}, {ORTH: "min.", LEMMA: "minimal", NORM: "minimal"},
{ORTH: "mind.", LEMMA: "mindestens"}, {ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"},
{ORTH: "mtl.", LEMMA: "monatlich"}, {ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"},
{ORTH: "n.Chr.", LEMMA: "nach Christus"}, {ORTH: "n.Chr.", LEMMA: "nach Christus"},
{ORTH: "orig.", LEMMA: "original"}, {ORTH: "orig.", LEMMA: "original", NORM: "original"},
{ORTH: "röm.", LEMMA: "römisch"}, {ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"},
{ORTH: "s.o.", LEMMA: "siehe oben"}, {ORTH: "s.o.", LEMMA: "siehe oben"},
{ORTH: "sog.", LEMMA: "so genannt"}, {ORTH: "sog.", LEMMA: "so genannt"},
{ORTH: "stellv.", LEMMA: "stellvertretend"}, {ORTH: "stellv.", LEMMA: "stellvertretend"},
{ORTH: "tägl.", LEMMA: "täglich"}, {ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"},
{ORTH: "u.U.", LEMMA: "unter Umständen"}, {ORTH: "u.U.", LEMMA: "unter Umständen"},
{ORTH: "u.s.w.", LEMMA: "und so weiter"}, {ORTH: "u.s.w.", LEMMA: "und so weiter"},
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"}, {ORTH: "u.v.m.", LEMMA: "und vieles mehr"},
@ -153,9 +153,9 @@ for exc_data in [
{ORTH: "v.Chr.", LEMMA: "vor Christus"}, {ORTH: "v.Chr.", LEMMA: "vor Christus"},
{ORTH: "v.a.", LEMMA: "vor allem"}, {ORTH: "v.a.", LEMMA: "vor allem"},
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"}, {ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"},
{ORTH: "vgl.", LEMMA: "vergleiche"}, {ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"},
{ORTH: "vllt.", LEMMA: "vielleicht"}, {ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"},
{ORTH: "vlt.", LEMMA: "vielleicht"}, {ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"},
{ORTH: "z.B.", LEMMA: "zum Beispiel"}, {ORTH: "z.B.", LEMMA: "zum Beispiel"},
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"}, {ORTH: "z.Bsp.", LEMMA: "zum Beispiel"},
{ORTH: "z.T.", LEMMA: "zum Teil"}, {ORTH: "z.T.", LEMMA: "zum Teil"},
@ -163,7 +163,7 @@ for exc_data in [
{ORTH: "z.Zt.", LEMMA: "zur Zeit"}, {ORTH: "z.Zt.", LEMMA: "zur Zeit"},
{ORTH: "z.b.", LEMMA: "zum Beispiel"}, {ORTH: "z.b.", LEMMA: "zum Beispiel"},
{ORTH: "zzgl.", LEMMA: "zuzüglich"}, {ORTH: "zzgl.", LEMMA: "zuzüglich"},
{ORTH: "österr.", LEMMA: "österreichisch"}]: {ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [dict(exc_data)]

View File

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -10,14 +11,17 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups

 class EnglishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'en'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         BASE_NORMS, NORM_EXCEPTIONS)
     lex_attr_getters.update(LEX_ATTRS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)

File diff suppressed because it is too large

View File

@ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
for pron in ["i"]: for pron in ["i"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'m"] = [ _exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}] {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}]
_exc[orth + "m"] = [ _exc[orth + "m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }] {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
_exc[orth + "'ma"] = [ _exc[orth + "'ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", NORM: "am"}, {ORTH: "'m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}] {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
_exc[orth + "ma"] = [ _exc[orth + "ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", NORM: "am"}, {ORTH: "m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}] {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
@ -36,72 +36,72 @@ for pron in ["i"]:
for pron in ["i", "you", "he", "she", "it", "we", "they"]: for pron in ["i", "you", "he", "she", "it", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}] {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [ _exc[orth + "ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}] {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [ _exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}, {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [ _exc[orth + "llve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}, {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}] {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}] {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [ _exc[orth + "dve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["i", "you", "we", "they"]: for pron in ["i", "you", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["you", "we", "they"]: for pron in ["you", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}] {ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}] {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
for pron in ["he", "she", "it"]: for pron in ["he", "she", "it"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'s"}] {ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [ _exc[orth + "s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "s"}] {ORTH: "s"}]
@ -110,111 +110,111 @@ for pron in ["he", "she", "it"]:
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
for orth in [word, word.title()]: for orth in [word, word.title()]:
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'s"}] {ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [ _exc[orth + "s"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "s"}] {ORTH: "s"}]
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}] {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [ _exc[orth + "ll"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}] {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [ _exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}, {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [ _exc[orth + "llve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}, {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'re", LEMMA: "be", NORM: "are"}] {ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "re", LEMMA: "be", NORM: "are"}] {ORTH: "re", LEMMA: "be", NORM: "are"}]
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d"}] {ORTH: "'d", NORM: "'d"}]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d"}] {ORTH: "d"}]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [ _exc[orth + "dve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
# Verbs # Verbs
for verb_data in [ for verb_data in [
{ORTH: "ca", LEMMA: "can", TAG: "MD"}, {ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "could", TAG: "MD"}, {ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "do", LEMMA: "do"}, {ORTH: "do", LEMMA: "do", NORM: "do"},
{ORTH: "does", LEMMA: "do"}, {ORTH: "does", LEMMA: "do", NORM: "does"},
{ORTH: "did", LEMMA: "do", TAG: "VBD"}, {ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"},
{ORTH: "had", LEMMA: "have", TAG: "VBD"}, {ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"},
{ORTH: "may", TAG: "MD"}, {ORTH: "may", NORM: "may", TAG: "MD"},
{ORTH: "might", TAG: "MD"}, {ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must", TAG: "MD"}, {ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "need"}, {ORTH: "need", NORM: "need"},
{ORTH: "ought"}, {ORTH: "ought", NORM: "ought", TAG: "MD"},
{ORTH: "sha", LEMMA: "shall", TAG: "MD"}, {ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
{ORTH: "should", TAG: "MD"}, {ORTH: "should", NORM: "should", TAG: "MD"},
{ORTH: "wo", LEMMA: "will", TAG: "MD"}, {ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "would", TAG: "MD"}]: {ORTH: "would", NORM: "would", TAG: "MD"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [ _exc[data[ORTH] + "n't"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}] {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [ _exc[data[ORTH] + "nt"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}] {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "n't've"] = [ _exc[data[ORTH] + "n't've"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}, {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[data[ORTH] + "ntve"] = [ _exc[data[ORTH] + "ntve"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}, {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for verb_data in [ for verb_data in [
{ORTH: "could", TAG: "MD"}, {ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "might"}, {ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must"}, {ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "should"}]: {ORTH: "should", NORM: "should", TAG: "MD"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
@ -228,21 +228,21 @@ for verb_data in [
for verb_data in [ for verb_data in [
{ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, {ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2},
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", TAG: "VBZ"}, {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be"}, {ORTH: "was", LEMMA: "be", NORM: "was"},
{ORTH: "were", LEMMA: "be"}]: {ORTH: "were", LEMMA: "be", NORM: "were"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [ _exc[data[ORTH] + "n't"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}] {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [ _exc[data[ORTH] + "nt"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}] {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
# Other contractions with trailing apostrophe # Other contractions with trailing apostrophe
@ -250,10 +250,10 @@ for verb_data in [
for exc_data in [ for exc_data in [
{ORTH: "doin", LEMMA: "do", NORM: "doing"}, {ORTH: "doin", LEMMA: "do", NORM: "doing"},
{ORTH: "goin", LEMMA: "go", NORM: "going"}, {ORTH: "goin", LEMMA: "go", NORM: "going"},
{ORTH: "nothin", LEMMA: "nothing"}, {ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing"}, {ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "ol", LEMMA: "old"}, {ORTH: "ol", LEMMA: "old", NORM: "old"},
{ORTH: "somethin", LEMMA: "something"}]: {ORTH: "somethin", LEMMA: "something", NORM: "something"}]:
exc_data_tc = dict(exc_data) exc_data_tc = dict(exc_data)
exc_data_tc[ORTH] = exc_data_tc[ORTH].title() exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
for data in [exc_data, exc_data_tc]: for data in [exc_data, exc_data_tc]:
@ -266,10 +266,10 @@ for exc_data in [
# Other contractions with leading apostrophe # Other contractions with leading apostrophe
for exc_data in [ for exc_data in [
{ORTH: "cause", LEMMA: "because"}, {ORTH: "cause", LEMMA: "because", NORM: "because"},
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
{ORTH: "ll", LEMMA: "will"}, {ORTH: "ll", LEMMA: "will", NORM: "will"},
{ORTH: "nuff", LEMMA: "enough"}]: {ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]:
exc_data_apos = dict(exc_data) exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]: for data in [exc_data, exc_data_apos]:
@ -282,11 +282,11 @@ for h in range(1, 12 + 1):
for period in ["a.m.", "am"]: for period in ["a.m.", "am"]:
_exc["%d%s" % (h, period)] = [ _exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h}, {ORTH: "%d" % h},
{ORTH: period, LEMMA: "a.m."}] {ORTH: period, LEMMA: "a.m.", NORM: "a.m."}]
for period in ["p.m.", "pm"]: for period in ["p.m.", "pm"]:
_exc["%d%s" % (h, period)] = [ _exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h}, {ORTH: "%d" % h},
{ORTH: period, LEMMA: "p.m."}] {ORTH: period, LEMMA: "p.m.", NORM: "p.m."}]
# Rest # Rest
@ -306,56 +306,56 @@ _other_exc = {
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"How'd'y": [ "How'd'y": [
{ORTH: "How", LEMMA: "how"}, {ORTH: "How", LEMMA: "how", NORM: "how"},
{ORTH: "'d", LEMMA: "do"}, {ORTH: "'d", LEMMA: "do"},
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"not've": [ "not've": [
{ORTH: "not", LEMMA: "not", TAG: "RB"}, {ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}], {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"notve": [ "notve": [
{ORTH: "not", LEMMA: "not", TAG: "RB"}, {ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}], {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Not've": [ "Not've": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"}, {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}], {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Notve": [ "Notve": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"}, {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}], {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"cannot": [ "cannot": [
{ORTH: "can", LEMMA: "can", TAG: "MD"}, {ORTH: "can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}], {ORTH: "not", LEMMA: "not", TAG: "RB"}],
"Cannot": [ "Cannot": [
{ORTH: "Can", LEMMA: "can", TAG: "MD"}, {ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}], {ORTH: "not", LEMMA: "not", TAG: "RB"}],
"gonna": [ "gonna": [
{ORTH: "gon", LEMMA: "go", NORM: "going"}, {ORTH: "gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}], {ORTH: "na", LEMMA: "to", NORM: "to"}],
"Gonna": [ "Gonna": [
{ORTH: "Gon", LEMMA: "go", NORM: "going"}, {ORTH: "Gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}], {ORTH: "na", LEMMA: "to", NORM: "to"}],
"gotta": [ "gotta": [
{ORTH: "got"}, {ORTH: "got"},
{ORTH: "ta", LEMMA: "to"}], {ORTH: "ta", LEMMA: "to", NORM: "to"}],
"Gotta": [ "Gotta": [
{ORTH: "Got"}, {ORTH: "Got", NORM: "got"},
{ORTH: "ta", LEMMA: "to"}], {ORTH: "ta", LEMMA: "to", NORM: "to"}],
"let's": [ "let's": [
{ORTH: "let"}, {ORTH: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}], {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
"Let's": [ "Let's": [
{ORTH: "Let", LEMMA: "let"}, {ORTH: "Let", LEMMA: "let", NORM: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}] {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
} }
@ -363,72 +363,80 @@ _exc.update(_other_exc)
for exc_data in [ for exc_data in [
{ORTH: "'S", LEMMA: "'s"}, {ORTH: "'S", LEMMA: "'s", NORM: "'s"},
{ORTH: "'s", LEMMA: "'s"}, {ORTH: "'s", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018S", LEMMA: "'s"}, {ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018s", LEMMA: "'s"}, {ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"},
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}, {ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"},
{ORTH: "w/o", LEMMA: "without", NORM: "without"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}, {ORTH: "'re", LEMMA: "be", NORM: "are"},
{ORTH: "'Cause", LEMMA: "because"}, {ORTH: "'Cause", LEMMA: "because", NORM: "because"},
{ORTH: "'cause", LEMMA: "because"}, {ORTH: "'cause", LEMMA: "because", NORM: "because"},
{ORTH: "ma'am", LEMMA: "madam"}, {ORTH: "'cos", LEMMA: "because", NORM: "because"},
{ORTH: "Ma'am", LEMMA: "madam"}, {ORTH: "'Cos", LEMMA: "because", NORM: "because"},
{ORTH: "o'clock", LEMMA: "o'clock"}, {ORTH: "'coz", LEMMA: "because", NORM: "because"},
{ORTH: "O'clock", LEMMA: "o'clock"}, {ORTH: "'Coz", LEMMA: "because", NORM: "because"},
{ORTH: "'cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'Cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'bout", LEMMA: "about", NORM: "about"},
{ORTH: "ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "Mt.", LEMMA: "Mount"}, {ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska"}, {ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama"}, {ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
{ORTH: "Apr.", LEMMA: "April"}, {ORTH: "Apr.", LEMMA: "April", NORM: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona"}, {ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas"}, {ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August"}, {ORTH: "Aug.", LEMMA: "August", NORM: "August"},
{ORTH: "Calif.", LEMMA: "California"}, {ORTH: "Calif.", LEMMA: "California", NORM: "California"},
{ORTH: "Colo.", LEMMA: "Colorado"}, {ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut"}, {ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December"}, {ORTH: "Dec.", LEMMA: "December", NORM: "December"},
{ORTH: "Del.", LEMMA: "Delaware"}, {ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"},
{ORTH: "Feb.", LEMMA: "February"}, {ORTH: "Feb.", LEMMA: "February", NORM: "February"},
{ORTH: "Fla.", LEMMA: "Florida"}, {ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia"}, {ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa"}, {ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho"}, {ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois"}, {ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana"}, {ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"},
{ORTH: "Jan.", LEMMA: "January"}, {ORTH: "Jan.", LEMMA: "January", NORM: "January"},
{ORTH: "Jul.", LEMMA: "July"}, {ORTH: "Jul.", LEMMA: "July", NORM: "July"},
{ORTH: "Jun.", LEMMA: "June"}, {ORTH: "Jun.", LEMMA: "June", NORM: "June"},
{ORTH: "Kan.", LEMMA: "Kansas"}, {ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas"}, {ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky"}, {ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana"}, {ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March"}, {ORTH: "Mar.", LEMMA: "March", NORM: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts"}, {ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"},
{ORTH: "May.", LEMMA: "May"}, {ORTH: "May.", LEMMA: "May", NORM: "May"},
{ORTH: "Mich.", LEMMA: "Michigan"}, {ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota"}, {ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi"}, {ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina"}, {ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota"}, {ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire"}, {ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey"}, {ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico"}, {ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York"}, {ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska"}, {ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska"}, {ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada"}, {ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"},
{ORTH: "Nov.", LEMMA: "November"}, {ORTH: "Nov.", LEMMA: "November", NORM: "November"},
{ORTH: "Oct.", LEMMA: "October"}, {ORTH: "Oct.", LEMMA: "October", NORM: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma"}, {ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon"}, {ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania"}, {ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina"}, {ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September"}, {ORTH: "Sep.", LEMMA: "September", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September"}, {ORTH: "Sept.", LEMMA: "September", NORM: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee"}, {ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia"}, {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington"}, {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin"}]: {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [dict(exc_data)]

View File

@@ -0,0 +1,46 @@
# coding: utf8
from __future__ import unicode_literals

# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.

# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.

BASE_NORMS = {
    "'s": "'s",
    "'S": "'s",
    "’s": "'s",
    "’S": "'s",
    "’": "'",
    "‘": "'",
    "´": "'",
    "`": "'",
    "“": '"',
    "”": '"',
    "''": '"',
    "``": '"',
    "´´": '"',
    "„": '"',
    "»": '"',
    "«": '"',
    "…": "...",
    "—": "-",
    "–": "-",
    "--": "-",
    "---": "-",
    "€": "$",
    "£": "$",
    "¥": "$",
    "฿": "$",
    "US$": "$",
    "C$": "$",
    "A$": "$"
}
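A quick sketch of the intended effect on the lexeme-level NORM attribute, built with the tokenizer only so no model is needed; the printed output is approximate:

from spacy.util import get_lang_class

tokenizer = get_lang_class('en').Defaults.create_tokenizer()
doc = tokenizer(u'“Hello” costs €5')
print([t.norm_ for t in doc])
# roughly: ['"', 'hello', '"', 'costs', '$', '5']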

View File

@@ -301,7 +301,7 @@ class Language(object):
     def evaluate(self, docs_golds):
         docs, golds = zip(*docs_golds)
         scorer = Scorer()
-        for doc, gold in zip(self.pipe(docs), golds):
+        for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
             scorer.score(doc, gold)
             doc.tensor = None
         return scorer

View File

@@ -38,7 +38,7 @@ cdef class Morphology:
         self.strings = string_store
         self.tag_map = {}
         self.lemmatizer = lemmatizer
-        self.n_tags = len(tag_map) + 1
+        self.n_tags = len(tag_map)
         self.tag_names = tuple(sorted(tag_map.keys()))
         self.reverse_index = {}

View File

@@ -8,20 +8,33 @@ import pytest
 @pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
-def test_tokenizer_splits_contractions(de_tokenizer, text):
+def test_de_tokenizer_splits_contractions(de_tokenizer, text):
     tokens = de_tokenizer(text)
     assert len(tokens) == 2

 @pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
-def test_tokenizer_handles_abbr(de_tokenizer, text):
+def test_de_tokenizer_handles_abbr(de_tokenizer, text):
     tokens = de_tokenizer(text)
     assert len(tokens) == 1

-def test_tokenizer_handles_exc_in_text(de_tokenizer):
+def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
     text = "Ich bin z.Zt. im Urlaub."
     tokens = de_tokenizer(text)
     assert len(tokens) == 6
     assert tokens[2].text == "z.Zt."
     assert tokens[2].lemma_ == "zur Zeit"

+@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])])
+def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
+    tokens = de_tokenizer(text)
+    assert [token.norm_ for token in tokens] == norms

+@pytest.mark.xfail
+@pytest.mark.parametrize('text,norm', [("daß", "dass")])
+def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
+    tokens = de_tokenizer(text)
+    assert tokens[0].norm_ == norm

View File

@@ -102,3 +102,16 @@ def test_en_tokenizer_handles_times(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 2
     assert tokens[1].lemma_ in ["a.m.", "p.m."]

+@pytest.mark.parametrize('text,norms', [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])])
+def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
+    tokens = en_tokenizer(text)
+    assert [token.norm_ for token in tokens] == norms

+@pytest.mark.xfail
+@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")])
+def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
+    tokens = en_tokenizer(text)
+    assert tokens[0].norm_ == norm

View File

@@ -0,0 +1,33 @@
# coding: utf-8
from __future__ import unicode_literals

from ...util import get_lang_class
from ..util import make_tempdir, assert_packed_msg_equal

import pytest


def load_tokenizer(b):
    tok = get_lang_class('en').Defaults.create_tokenizer()
    tok.from_bytes(b)
    return tok


@pytest.mark.parametrize('text', ["I💜you", "theyre", "“hello”"])
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
    tokenizer = en_tokenizer
    new_tokenizer = load_tokenizer(tokenizer.to_bytes())
    assert_packed_msg_equal(new_tokenizer.to_bytes(), tokenizer.to_bytes())
    # assert new_tokenizer.to_bytes() == tokenizer.to_bytes()
    doc1 = tokenizer(text)
    doc2 = new_tokenizer(text)
    assert [token.text for token in doc1] == [token.text for token in doc2]


def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
    tokenizer = en_tokenizer
    with make_tempdir() as d:
        file_path = d / 'tokenizer'
        tokenizer.to_disk(file_path)
        tokenizer_d = en_tokenizer.from_disk(file_path)
        assert tokenizer.to_bytes() == tokenizer_d.to_bytes()

View File

@@ -10,6 +10,7 @@ import numpy
 import tempfile
 import shutil
 import contextlib
+import msgpack
 from pathlib import Path
@@ -105,3 +106,13 @@ def assert_docs_equal(doc1, doc2):
     assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ]
     assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ]
     assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ]

+def assert_packed_msg_equal(b1, b2):
+    """Assert that two packed msgpack messages are equal."""
+    msg1 = msgpack.loads(b1, encoding='utf8')
+    msg2 = msgpack.loads(b2, encoding='utf8')
+    assert sorted(msg1.keys()) == sorted(msg2.keys())
+    for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
+        assert k1 == k2
+        assert v1 == v2
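The helper exists because two msgpack payloads can differ byte-for-byte purely through key order. A tiny illustration, assuming the helper is importable as spacy.tests.util:

import msgpack
from spacy.tests.util import assert_packed_msg_equal

b1 = msgpack.dumps({'a': 1, 'b': 2})
b2 = msgpack.dumps({'b': 2, 'a': 1})
assert_packed_msg_equal(b1, b2)   # passes even if b1 != b2 as raw bytes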

View File

@@ -2,6 +2,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+from collections import OrderedDict
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 from cymem.cymem cimport Pool
@@ -355,14 +356,14 @@ cdef class Tokenizer:
         **exclude: Named attributes to prevent from being serialized.
         RETURNS (bytes): The serialized form of the `Tokenizer` object.
         """
-        serializers = {
-            'vocab': lambda: self.vocab.to_bytes(),
-            'prefix_search': lambda: self.prefix_search.__self__.pattern,
-            'suffix_search': lambda: self.suffix_search.__self__.pattern,
-            'infix_finditer': lambda: self.infix_finditer.__self__.pattern,
-            'token_match': lambda: self.token_match.__self__.pattern,
-            'exceptions': lambda: self._rules
-        }
+        serializers = OrderedDict((
+            ('vocab', lambda: self.vocab.to_bytes()),
+            ('prefix_search', lambda: self.prefix_search.__self__.pattern),
+            ('suffix_search', lambda: self.suffix_search.__self__.pattern),
+            ('infix_finditer', lambda: self.infix_finditer.__self__.pattern),
+            ('token_match', lambda: self.token_match.__self__.pattern),
+            ('exceptions', lambda: OrderedDict(sorted(self._rules.items())))
+        ))
         return util.to_bytes(serializers, exclude)

     def from_bytes(self, bytes_data, **exclude):
@@ -372,15 +373,15 @@ cdef class Tokenizer:
         **exclude: Named attributes to prevent from being loaded.
         RETURNS (Tokenizer): The `Tokenizer` object.
         """
-        data = {}
-        deserializers = {
-            'vocab': lambda b: self.vocab.from_bytes(b),
-            'prefix_search': lambda b: data.setdefault('prefix', b),
-            'suffix_search': lambda b: data.setdefault('suffix_search', b),
-            'infix_finditer': lambda b: data.setdefault('infix_finditer', b),
-            'token_match': lambda b: data.setdefault('token_match', b),
-            'exceptions': lambda b: data.setdefault('rules', b)
-        }
+        data = OrderedDict()
+        deserializers = OrderedDict((
+            ('vocab', lambda b: self.vocab.from_bytes(b)),
+            ('prefix_search', lambda b: data.setdefault('prefix', b)),
+            ('suffix_search', lambda b: data.setdefault('suffix_search', b)),
+            ('infix_finditer', lambda b: data.setdefault('infix_finditer', b)),
+            ('token_match', lambda b: data.setdefault('token_match', b)),
+            ('exceptions', lambda b: data.setdefault('rules', b))
+        ))
         msg = util.from_bytes(bytes_data, deserializers, exclude)
         if 'prefix_search' in data:
             self.prefix_search = re.compile(data['prefix_search']).search
@@ -392,3 +393,4 @@ cdef class Tokenizer:
             self.token_match = re.compile(data['token_match']).search
         for string, substrings in data.get('rules', {}).items():
             self.add_special_case(string, substrings)
+        return self
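With from_bytes now returning self, a serialization round-trip can be chained in one expression. A rough sketch mirroring the new serializer test earlier in this diff:

from spacy.util import get_lang_class

create_tokenizer = get_lang_class('en').Defaults.create_tokenizer
tokenizer = create_tokenizer()
new_tokenizer = create_tokenizer().from_bytes(tokenizer.to_bytes())   # chaining now works
doc1 = tokenizer(u'theyre here')
doc2 = new_tokenizer(u'theyre here')
assert [t.text for t in doc1] == [t.text for t in doc2]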

View File

@@ -437,7 +437,8 @@ cdef class Doc:
         """
         def __get__(self):
             if 'sents' in self.user_hooks:
-                return self.user_hooks['sents'](self)
+                yield from self.user_hooks['sents'](self)
+                return

             if not self.is_parsed:
                 raise ValueError(
@@ -740,7 +741,7 @@ cdef class Doc:
         token.spacy = self.c[end-1].spacy
         for attr_name, attr_value in attributes.items():
             if attr_name == TAG:
                 self.vocab.morphology.assign_tag(token, attr_value)
             else:
                 Token.set_struct_attr(token, attr_name, attr_value)
         # Begin by setting all the head indices to absolute token positions
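Because Doc.sents now yields from the 'sents' user hook rather than returning it, the property behaves as a generator whether or not a hook is installed. A hedged sketch on a tokenizer-only doc (no parser required):

from spacy.util import get_lang_class

tokenizer = get_lang_class('en').Defaults.create_tokenizer()
doc = tokenizer(u'One sentence. Another one.')

def single_sentence(doc):
    yield doc[:]                      # treat the whole text as one span

doc.user_hooks['sents'] = single_sentence
assert len(list(doc.sents)) == 1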

View File

@@ -299,6 +299,22 @@ def compile_infix_regex(entries):
     return re.compile(expression)

+def add_lookups(default_func, *lookups):
+    """Extend an attribute function with special cases. If a word is in the
+    lookups, the value is returned. Otherwise the previous function is used.
+
+    default_func (callable): The default function to execute.
+    *lookups (dict): Lookup dictionary mapping string to attribute value.
+    RETURNS (callable): Lexical attribute getter.
+    """
+    def get_attr(string):
+        for lookup in lookups:
+            if string in lookup:
+                return lookup[string]
+        return default_func(string)
+    return get_attr

 def update_exc(base_exceptions, *addition_dicts):
     """Update and validate tokenizer exceptions. Will overwrite exceptions.

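A rough usage sketch of the new helper, importing it from spacy.util where this hunk adds it; the lookup tables are made up:

from spacy.util import add_lookups

lower = lambda string: string.lower()            # stand-in default getter
get_norm = add_lookups(lower, {'daß': 'dass'}, {'€': '$'})
assert get_norm('daß') == 'dass'                 # found in a lookup table
assert get_norm('Haus') == 'haus'                # falls back to the default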
View File

@@ -231,11 +231,13 @@ cdef class Vocab:
             props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
-            token.lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
+            lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
+            token.lex = lex
             if attrs.TAG in props:
                 self.morphology.assign_tag(token, props[attrs.TAG])
             for attr_id, value in props.items():
                 Token.set_struct_attr(token, attr_id, value)
+                Lexeme.set_struct_attr(lex, attr_id, value)
         return tokens

     @property
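Since the fused token's attributes are now also written to the underlying lexeme, values such as NORM defined on a tokenizer special case should show up on the resulting tokens. A hedged sketch; the special case is invented and overrides any existing rule for that string:

from spacy.attrs import ORTH, NORM
from spacy.util import get_lang_class

tokenizer = get_lang_class('en').Defaults.create_tokenizer()
tokenizer.add_special_case(u'lemme', [{ORTH: u'lem', NORM: u'let'}, {ORTH: u'me'}])
doc = tokenizer(u'lemme go')
print(doc[0].text, doc[0].norm_)   # expected: lem let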

View File

@@ -205,7 +205,7 @@ p
     +cell #[code arrow_spacing]
     +cell int
     +cell Spacing between arrows in px to avoid overlaps.
-    +cell #[code 20]
+    +cell #[code 20] / #[code 12] (compact)

 +row
     +cell #[code word_spacing]

View File

@@ -64,7 +64,7 @@ p
     doc = nlp(u'Give it back! He pleaded.')
     assert doc[0].text == 'Give'
     assert doc[-1].text == '.'
-    span = doc[1:1]
+    span = doc[1:3]
     assert span.text == 'it back'

 +table(["Name", "Type", "Description"])

View File

@@ -141,7 +141,7 @@ p
             else:
                 tokens.append(substring)
                 substring = ''
-        tokens.extend(suffixes)
+        tokens.extend(reversed(suffixes))
         return tokens

 p
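A toy illustration of why the snippet now reverses the collected suffixes: they are peeled off the end of the substring, so they come out in back-to-front order.

substring = u'word).'
suffixes = []
while substring and substring[-1] in (u')', u'.'):
    suffixes.append(substring[-1])      # collected as ['.', ')']
    substring = substring[:-1]

tokens = [substring]
tokens.extend(reversed(suffixes))
assert tokens == [u'word', u')', u'.']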

View File

@@ -59,9 +59,11 @@ p
     | to customise the layout, for example:

 +aside("Important note")
-    | There's currently a known issue with the #[code compact] mode for long
-    | sentences with arrow spacing. If the spacing is larger than the arc
-    | itself, it'll cause the arc and its label to flip.
+    | There's currently a known issue with the #[code compact] mode for
+    | sentences with short arrows and long dependency labels, that causes labels
+    | longer than the arrow to wrap. So if you come across this problem,
+    | especially when using custom labels, you'll have to increase the
+    | #[code distance] setting in the #[code options] to allow longer arcs.

 +table(["Name", "Type", "Description", "Default"])
 +row