Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-06-03 13:31:40 -05:00
commit 94e063ae2a
19 changed files with 2109 additions and 216 deletions

View File

@ -3,7 +3,7 @@ pathlib
numpy>=1.7
cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0
thinc>=6.7.1,<6.8.0
thinc>=6.7.2,<6.8.0
murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6
six

View File

@ -191,7 +191,7 @@ def setup_package():
'murmurhash>=0.28,<0.29',
'cymem>=1.30,<1.32',
'preshed>=1.0.0,<2.0.0',
'thinc>=6.7.1,<6.8.0',
'thinc>=6.7.2,<6.8.0',
'plac<1.0.0,>=0.9.6',
'pip>=9.0.0,<10.0.0',
'six',

View File

@ -56,7 +56,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server('0.0.0.0', port, app)
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
try:
httpd.serve_forever()
except KeyboardInterrupt:
prints("Shutting down server on port %d." % port)
finally:
httpd.server_close()
def app(environ, start_response):
@ -65,12 +70,13 @@ def app(environ, start_response):
return [res]
def parse_deps(doc, options={}):
def parse_deps(orig_doc, options={}):
"""Generate dependency parse in {'words': [], 'arcs': []} format.
doc (Doc): Document do parse.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
if options.get('collapse_punct', True):
spans = []
for word in doc[:-1]:

View File

@ -18,12 +18,11 @@ class DependencyRenderer(object):
offset_x, color, bg, font)
"""
self.compact = options.get('compact', False)
distance, arrow_width = (85, 8) if self.compact else (175, 10)
self.word_spacing = options.get('word_spacing', 45)
self.arrow_spacing = options.get('arrow_spacing', 20)
self.arrow_width = options.get('arrow_width', arrow_width)
self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
self.arrow_stroke = options.get('arrow_stroke', 2)
self.distance = options.get('distance', distance)
self.distance = options.get('distance', 150 if self.compact else 175)
self.offset_x = options.get('offset_x', 50)
self.color = options.get('color', '#000000')
self.bg = options.get('bg', '#ffffff')
@ -99,6 +98,8 @@ class DependencyRenderer(object):
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
-self.arrow_spacing*(self.highest_level-level)/4)
y_curve = self.offset_y-level*self.distance/2
if self.compact:
y_curve = self.offset_y-level*self.distance/6
if y_curve == 0 and len(self.levels) > 5:
y_curve = -self.distance
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)

View File

@ -21,7 +21,7 @@ TPL_DEP_WORDS = """
TPL_DEP_ARCS = """
<g class="displacy-arrow">
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
<text dy="1.25em" style="font-size: 0.8em">
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
<textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
</text>
<path class="displacy-arrowhead" d="{head}" fill="currentColor"/>

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
@ -10,14 +11,17 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class EnglishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'en'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
BASE_NORMS, NORM_EXCEPTIONS)
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)

File diff suppressed because it is too large Load Diff

View File

@ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
for pron in ["i"]:
for orth in [pron, pron.title()]:
_exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}]
_exc[orth + "m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
_exc[orth + "'ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
_exc[orth + "ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
@ -36,72 +36,72 @@ for pron in ["i"]:
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
for orth in [pron, pron.title()]:
_exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["i", "you", "we", "they"]:
for orth in [pron, pron.title()]:
_exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["you", "we", "they"]:
for orth in [pron, pron.title()]:
_exc[orth + "'re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
for pron in ["he", "she", "it"]:
for orth in [pron, pron.title()]:
_exc[orth + "'s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'s"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "s"}]
@ -110,111 +110,111 @@ for pron in ["he", "she", "it"]:
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
for orth in [word, word.title()]:
_exc[orth + "'s"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'s"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [
{ORTH: orth, LEMMA: word},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "s"}]
_exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'re"] = [
{ORTH: orth, LEMMA: word},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [
{ORTH: orth, LEMMA: word},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "re", LEMMA: "be", NORM: "are"}]
_exc[orth + "'ve"] = [
{ORTH: orth},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
_exc[orth + "ve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'d"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d", NORM: "'d"}]
_exc[orth + "d"] = [
{ORTH: orth, LEMMA: word},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d"}]
_exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
# Verbs
for verb_data in [
{ORTH: "ca", LEMMA: "can", TAG: "MD"},
{ORTH: "could", TAG: "MD"},
{ORTH: "do", LEMMA: "do"},
{ORTH: "does", LEMMA: "do"},
{ORTH: "did", LEMMA: "do", TAG: "VBD"},
{ORTH: "had", LEMMA: "have", TAG: "VBD"},
{ORTH: "may", TAG: "MD"},
{ORTH: "might", TAG: "MD"},
{ORTH: "must", TAG: "MD"},
{ORTH: "need"},
{ORTH: "ought"},
{ORTH: "sha", LEMMA: "shall", TAG: "MD"},
{ORTH: "should", TAG: "MD"},
{ORTH: "wo", LEMMA: "will", TAG: "MD"},
{ORTH: "would", TAG: "MD"}]:
{ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "do", LEMMA: "do", NORM: "do"},
{ORTH: "does", LEMMA: "do", NORM: "does"},
{ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"},
{ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"},
{ORTH: "may", NORM: "may", TAG: "MD"},
{ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "need", NORM: "need"},
{ORTH: "ought", NORM: "ought", TAG: "MD"},
{ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
{ORTH: "should", NORM: "should", TAG: "MD"},
{ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "would", NORM: "would", TAG: "MD"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "n't've"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[data[ORTH] + "ntve"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for verb_data in [
{ORTH: "could", TAG: "MD"},
{ORTH: "might"},
{ORTH: "must"},
{ORTH: "should"}]:
{ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "should", NORM: "should", TAG: "MD"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
@ -228,21 +228,21 @@ for verb_data in [
for verb_data in [
{ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"},
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be"},
{ORTH: "were", LEMMA: "be"}]:
{ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2},
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be", NORM: "was"},
{ORTH: "were", LEMMA: "be", NORM: "were"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
# Other contractions with trailing apostrophe
@ -250,10 +250,10 @@ for verb_data in [
for exc_data in [
{ORTH: "doin", LEMMA: "do", NORM: "doing"},
{ORTH: "goin", LEMMA: "go", NORM: "going"},
{ORTH: "nothin", LEMMA: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing"},
{ORTH: "ol", LEMMA: "old"},
{ORTH: "somethin", LEMMA: "something"}]:
{ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "ol", LEMMA: "old", NORM: "old"},
{ORTH: "somethin", LEMMA: "something", NORM: "something"}]:
exc_data_tc = dict(exc_data)
exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
for data in [exc_data, exc_data_tc]:
@ -266,10 +266,10 @@ for exc_data in [
# Other contractions with leading apostrophe
for exc_data in [
{ORTH: "cause", LEMMA: "because"},
{ORTH: "cause", LEMMA: "because", NORM: "because"},
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
{ORTH: "ll", LEMMA: "will"},
{ORTH: "nuff", LEMMA: "enough"}]:
{ORTH: "ll", LEMMA: "will", NORM: "will"},
{ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]:
exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]:
@ -282,11 +282,11 @@ for h in range(1, 12 + 1):
for period in ["a.m.", "am"]:
_exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h},
{ORTH: period, LEMMA: "a.m."}]
{ORTH: period, LEMMA: "a.m.", NORM: "a.m."}]
for period in ["p.m.", "pm"]:
_exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h},
{ORTH: period, LEMMA: "p.m."}]
{ORTH: period, LEMMA: "p.m.", NORM: "p.m."}]
# Rest
@ -306,56 +306,56 @@ _other_exc = {
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"How'd'y": [
{ORTH: "How", LEMMA: "how"},
{ORTH: "How", LEMMA: "how", NORM: "how"},
{ORTH: "'d", LEMMA: "do"},
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"not've": [
{ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"notve": [
{ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Not've": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Notve": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"cannot": [
{ORTH: "can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
"Cannot": [
{ORTH: "Can", LEMMA: "can", TAG: "MD"},
{ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
"gonna": [
{ORTH: "gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}],
{ORTH: "na", LEMMA: "to", NORM: "to"}],
"Gonna": [
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}],
{ORTH: "na", LEMMA: "to", NORM: "to"}],
"gotta": [
{ORTH: "got"},
{ORTH: "ta", LEMMA: "to"}],
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
"Gotta": [
{ORTH: "Got"},
{ORTH: "ta", LEMMA: "to"}],
{ORTH: "Got", NORM: "got"},
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
"let's": [
{ORTH: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
"Let's": [
{ORTH: "Let", LEMMA: "let"},
{ORTH: "Let", LEMMA: "let", NORM: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
}
@ -363,72 +363,80 @@ _exc.update(_other_exc)
for exc_data in [
{ORTH: "'S", LEMMA: "'s"},
{ORTH: "'s", LEMMA: "'s"},
{ORTH: "\u2018S", LEMMA: "'s"},
{ORTH: "\u2018s", LEMMA: "'s"},
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
{ORTH: "'S", LEMMA: "'s", NORM: "'s"},
{ORTH: "'s", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"},
{ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"},
{ORTH: "w/o", LEMMA: "without", NORM: "without"},
{ORTH: "'re", LEMMA: "be", NORM: "are"},
{ORTH: "'Cause", LEMMA: "because"},
{ORTH: "'cause", LEMMA: "because"},
{ORTH: "ma'am", LEMMA: "madam"},
{ORTH: "Ma'am", LEMMA: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock"},
{ORTH: "'Cause", LEMMA: "because", NORM: "because"},
{ORTH: "'cause", LEMMA: "because", NORM: "because"},
{ORTH: "'cos", LEMMA: "because", NORM: "because"},
{ORTH: "'Cos", LEMMA: "because", NORM: "because"},
{ORTH: "'coz", LEMMA: "because", NORM: "because"},
{ORTH: "'Coz", LEMMA: "because", NORM: "because"},
{ORTH: "'cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'Cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'bout", LEMMA: "about", NORM: "about"},
{ORTH: "ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "Mt.", LEMMA: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama"},
{ORTH: "Apr.", LEMMA: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August"},
{ORTH: "Calif.", LEMMA: "California"},
{ORTH: "Colo.", LEMMA: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December"},
{ORTH: "Del.", LEMMA: "Delaware"},
{ORTH: "Feb.", LEMMA: "February"},
{ORTH: "Fla.", LEMMA: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana"},
{ORTH: "Jan.", LEMMA: "January"},
{ORTH: "Jul.", LEMMA: "July"},
{ORTH: "Jun.", LEMMA: "June"},
{ORTH: "Kan.", LEMMA: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts"},
{ORTH: "May.", LEMMA: "May"},
{ORTH: "Mich.", LEMMA: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada"},
{ORTH: "Nov.", LEMMA: "November"},
{ORTH: "Oct.", LEMMA: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September"},
{ORTH: "Sept.", LEMMA: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin"}]:
{ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
{ORTH: "Apr.", LEMMA: "April", NORM: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August", NORM: "August"},
{ORTH: "Calif.", LEMMA: "California", NORM: "California"},
{ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December", NORM: "December"},
{ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"},
{ORTH: "Feb.", LEMMA: "February", NORM: "February"},
{ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"},
{ORTH: "Jan.", LEMMA: "January", NORM: "January"},
{ORTH: "Jul.", LEMMA: "July", NORM: "July"},
{ORTH: "Jun.", LEMMA: "June", NORM: "June"},
{ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March", NORM: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"},
{ORTH: "May.", LEMMA: "May", NORM: "May"},
{ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"},
{ORTH: "Nov.", LEMMA: "November", NORM: "November"},
{ORTH: "Oct.", LEMMA: "October", NORM: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September", NORM: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]

View File

@ -0,0 +1,46 @@
# coding: utf8
from __future__ import unicode_literals
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.
# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.
BASE_NORMS = {
"'s": "'s",
"'S": "'s",
"s": "'s",
"S": "'s",
"": "'",
"": "'",
"´": "'",
"`": "'",
"": '"',
"": '"',
"''": '"',
"``": '"',
"´´": '"',
"": '"',
"»": '"',
"«": '"',
"": "...",
"": "-",
"": "-",
"--": "-",
"---": "-",
"": "$",
"£": "$",
"¥": "$",
"฿": "$",
"US$": "$",
"C$": "$",
"A$": "$"
}

View File

@ -0,0 +1,33 @@
# coding: utf-8
from __future__ import unicode_literals
from ...util import get_lang_class
from ..util import make_tempdir, assert_packed_msg_equal
import pytest
def load_tokenizer(b):
tok = get_lang_class('en').Defaults.create_tokenizer()
tok.from_bytes(b)
return tok
@pytest.mark.parametrize('text', ["I💜you", "theyre", "“hello”"])
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
tokenizer = en_tokenizer
new_tokenizer = load_tokenizer(tokenizer.to_bytes())
assert_packed_msg_equal(new_tokenizer.to_bytes(), tokenizer.to_bytes())
# assert new_tokenizer.to_bytes() == tokenizer.to_bytes()
doc1 = tokenizer(text)
doc2 = new_tokenizer(text)
assert [token.text for token in doc1] == [token.text for token in doc2]
def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
tokenizer = en_tokenizer
with make_tempdir() as d:
file_path = d / 'tokenizer'
tokenizer.to_disk(file_path)
tokenizer_d = en_tokenizer.from_disk(file_path)
assert tokenizer.to_bytes() == tokenizer_d.to_bytes()

View File

@ -10,6 +10,7 @@ import numpy
import tempfile
import shutil
import contextlib
import msgpack
from pathlib import Path
@ -105,3 +106,13 @@ def assert_docs_equal(doc1, doc2):
assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ]
assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ]
assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ]
def assert_packed_msg_equal(b1, b2):
"""Assert that two packed msgpack messages are equal."""
msg1 = msgpack.loads(b1, encoding='utf8')
msg2 = msgpack.loads(b2, encoding='utf8')
assert sorted(msg1.keys()) == sorted(msg2.keys())
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
assert k1 == k2
assert v1 == v2

View File

@ -2,6 +2,7 @@
# coding: utf8
from __future__ import unicode_literals
from collections import OrderedDict
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool
@ -355,14 +356,14 @@ cdef class Tokenizer:
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Tokenizer` object.
"""
serializers = {
'vocab': lambda: self.vocab.to_bytes(),
'prefix_search': lambda: self.prefix_search.__self__.pattern,
'suffix_search': lambda: self.suffix_search.__self__.pattern,
'infix_finditer': lambda: self.infix_finditer.__self__.pattern,
'token_match': lambda: self.token_match.__self__.pattern,
'exceptions': lambda: self._rules
}
serializers = OrderedDict((
('vocab', lambda: self.vocab.to_bytes()),
('prefix_search', lambda: self.prefix_search.__self__.pattern),
('suffix_search', lambda: self.suffix_search.__self__.pattern),
('infix_finditer', lambda: self.infix_finditer.__self__.pattern),
('token_match', lambda: self.token_match.__self__.pattern),
('exceptions', lambda: OrderedDict(sorted(self._rules.items())))
))
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude):
@ -372,15 +373,15 @@ cdef class Tokenizer:
**exclude: Named attributes to prevent from being loaded.
RETURNS (Tokenizer): The `Tokenizer` object.
"""
data = {}
deserializers = {
'vocab': lambda b: self.vocab.from_bytes(b),
'prefix_search': lambda b: data.setdefault('prefix', b),
'suffix_search': lambda b: data.setdefault('suffix_search', b),
'infix_finditer': lambda b: data.setdefault('infix_finditer', b),
'token_match': lambda b: data.setdefault('token_match', b),
'exceptions': lambda b: data.setdefault('rules', b)
}
data = OrderedDict()
deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('prefix_search', lambda b: data.setdefault('prefix', b)),
('suffix_search', lambda b: data.setdefault('suffix_search', b)),
('infix_finditer', lambda b: data.setdefault('infix_finditer', b)),
('token_match', lambda b: data.setdefault('token_match', b)),
('exceptions', lambda b: data.setdefault('rules', b))
))
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'prefix_search' in data:
self.prefix_search = re.compile(data['prefix_search']).search
@ -392,3 +393,4 @@ cdef class Tokenizer:
self.token_match = re.compile(data['token_match']).search
for string, substrings in data.get('rules', {}).items():
self.add_special_case(string, substrings)
return self

View File

@ -437,7 +437,8 @@ cdef class Doc:
"""
def __get__(self):
if 'sents' in self.user_hooks:
return self.user_hooks['sents'](self)
yield from self.user_hooks['sents'](self)
return
if not self.is_parsed:
raise ValueError(

View File

@ -299,6 +299,22 @@ def compile_infix_regex(entries):
return re.compile(expression)
def add_lookups(default_func, *lookups):
"""Extend an attribute function with special cases. If a word is in the
lookups, the value is returned. Otherwise the previous function is used.
default_func (callable): The default function to execute.
*lookups (dict): Lookup dictionary mapping string to attribute value.
RETURNS (callable): Lexical attribute getter.
"""
def get_attr(string):
for lookup in lookups:
if string in lookup:
return lookup[string]
return default_func(string)
return get_attr
def update_exc(base_exceptions, *addition_dicts):
"""Update and validate tokenizer exceptions. Will overwrite exceptions.

View File

@ -231,11 +231,13 @@ cdef class Vocab:
props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
token = &tokens[i]
# Set the special tokens up to have arbitrary attributes
token.lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
token.lex = lex
if attrs.TAG in props:
self.morphology.assign_tag(token, props[attrs.TAG])
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
Lexeme.set_struct_attr(lex, attr_id, value)
return tokens
@property

View File

@ -205,7 +205,7 @@ p
+cell #[code arrow_spacing]
+cell int
+cell Spacing between arrows in px to avoid overlaps.
+cell #[code 20]
+cell #[code 20] / #[code 12] (compact)
+row
+cell #[code word_spacing]

View File

@ -64,7 +64,7 @@ p
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == 'Give'
assert doc[-1].text == '.'
span = doc[1:1]
span = doc[1:3]
assert span.text == 'it back'
+table(["Name", "Type", "Description"])

View File

@ -141,7 +141,7 @@ p
else:
tokens.append(substring)
substring = ''
tokens.extend(suffixes)
tokens.extend(reversed(suffixes))
return tokens
p

View File

@ -59,9 +59,11 @@ p
| to customise the layout, for example:
+aside("Important note")
| There's currently a known issue with the #[code compact] mode for long
| sentences with arrow spacing. If the spacing is larger than the arc
| itself, it'll cause the arc and its label to flip.
| There's currently a known issue with the #[code compact] mode for
| sentences with short arrows and long dependency labels, that causes labels
| longer than the arrow to wrap. So if you come across this problem,
| especially when using custom labels, you'll have to increase the
| #[code distance] setting in the #[code options] to allow longer arcs.
+table(["Name", "Type", "Description", "Default"])
+row