Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-06-03 13:31:40 -05:00
commit 94e063ae2a
19 changed files with 2109 additions and 216 deletions

View File

@ -3,7 +3,7 @@ pathlib
numpy>=1.7 numpy>=1.7
cymem>=1.30,<1.32 cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0 preshed>=1.0.0,<2.0.0
thinc>=6.7.1,<6.8.0 thinc>=6.7.2,<6.8.0
murmurhash>=0.28,<0.29 murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6 plac<1.0.0,>=0.9.6
six six

View File

@ -191,7 +191,7 @@ def setup_package():
'murmurhash>=0.28,<0.29', 'murmurhash>=0.28,<0.29',
'cymem>=1.30,<1.32', 'cymem>=1.30,<1.32',
'preshed>=1.0.0,<2.0.0', 'preshed>=1.0.0,<2.0.0',
'thinc>=6.7.1,<6.8.0', 'thinc>=6.7.2,<6.8.0',
'plac<1.0.0,>=0.9.6', 'plac<1.0.0,>=0.9.6',
'pip>=9.0.0,<10.0.0', 'pip>=9.0.0,<10.0.0',
'six', 'six',

View File

@ -56,7 +56,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
render(docs, style=style, page=page, minify=minify, options=options, manual=manual) render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server('0.0.0.0', port, app) httpd = simple_server.make_server('0.0.0.0', port, app)
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port) prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
httpd.serve_forever() try:
httpd.serve_forever()
except KeyboardInterrupt:
prints("Shutting down server on port %d." % port)
finally:
httpd.server_close()
def app(environ, start_response): def app(environ, start_response):
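The change above wraps the blocking serve_forever() call so that a Ctrl-C shuts the displaCy server down cleanly and releases the port. A minimal standalone sketch of the same pattern, using only the standard library rather than spaCy's actual serve():

    from wsgiref import simple_server

    def app(environ, start_response):
        # Trivial WSGI app standing in for displaCy's rendered page.
        start_response('200 OK', [('Content-Type', 'text/plain')])
        return [b'hello']

    httpd = simple_server.make_server('0.0.0.0', 5000, app)
    try:
        httpd.serve_forever()            # blocks until interrupted
    except KeyboardInterrupt:
        print("Shutting down server on port 5000.")
    finally:
        httpd.server_close()             # release the socket so the port can be reused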
@ -65,12 +70,13 @@ def app(environ, start_response):
return [res] return [res]
def parse_deps(doc, options={}): def parse_deps(orig_doc, options={}):
"""Generate dependency parse in {'words': [], 'arcs': []} format. """Generate dependency parse in {'words': [], 'arcs': []} format.
doc (Doc): Document do parse. doc (Doc): Document do parse.
RETURNS (dict): Generated dependency parse keyed by words and arcs. RETURNS (dict): Generated dependency parse keyed by words and arcs.
""" """
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
if options.get('collapse_punct', True): if options.get('collapse_punct', True):
spans = [] spans = []
for word in doc[:-1]: for word in doc[:-1]:
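The new first line of parse_deps() copies the document before the punctuation-collapsing code mutates it, so the caller's Doc is left untouched. A hedged sketch of that copy pattern, assuming an English model is installed under the 'en' shortcut:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')
    orig_doc = nlp(u'Hello, world!')

    # Round-trip through bytes to get an independent copy that shares the vocab.
    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
    assert [t.text for t in doc] == [t.text for t in orig_doc]
    assert doc is not orig_doc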

View File

@ -18,12 +18,11 @@ class DependencyRenderer(object):
offset_x, color, bg, font) offset_x, color, bg, font)
""" """
self.compact = options.get('compact', False) self.compact = options.get('compact', False)
distance, arrow_width = (85, 8) if self.compact else (175, 10)
self.word_spacing = options.get('word_spacing', 45) self.word_spacing = options.get('word_spacing', 45)
self.arrow_spacing = options.get('arrow_spacing', 20) self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
self.arrow_width = options.get('arrow_width', arrow_width) self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
self.arrow_stroke = options.get('arrow_stroke', 2) self.arrow_stroke = options.get('arrow_stroke', 2)
self.distance = options.get('distance', distance) self.distance = options.get('distance', 150 if self.compact else 175)
self.offset_x = options.get('offset_x', 50) self.offset_x = options.get('offset_x', 50)
self.color = options.get('color', '#000000') self.color = options.get('color', '#000000')
self.bg = options.get('bg', '#ffffff') self.bg = options.get('bg', '#ffffff')
@ -99,6 +98,8 @@ class DependencyRenderer(object):
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
-self.arrow_spacing*(self.highest_level-level)/4) -self.arrow_spacing*(self.highest_level-level)/4)
y_curve = self.offset_y-level*self.distance/2 y_curve = self.offset_y-level*self.distance/2
if self.compact:
y_curve = self.offset_y-level*self.distance/6
if y_curve == 0 and len(self.levels) > 5: if y_curve == 0 and len(self.levels) > 5:
y_curve = -self.distance y_curve = -self.distance
arrowhead = self.get_arrowhead(direction, x_start, y, x_end) arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
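The compact-mode defaults introduced above (narrower arrows, distance 150, flatter arc curves) remain overridable per call. A hedged usage sketch, assuming an installed English model; the option names follow the diff:

    import spacy
    from spacy import displacy

    nlp = spacy.load('en')
    doc = nlp(u'This is a sentence about dependency arcs.')

    # compact=True switches to the tighter defaults; explicitly passed values
    # such as distance still take precedence over them.
    html = displacy.render(doc, style='dep',
                           options={'compact': True, 'distance': 120})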

View File

@ -21,7 +21,7 @@ TPL_DEP_WORDS = """
TPL_DEP_ARCS = """ TPL_DEP_ARCS = """
<g class="displacy-arrow"> <g class="displacy-arrow">
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/> <path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
<text dy="1.25em" style="font-size: 0.8em"> <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
<textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath> <textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
</text> </text>
<path class="displacy-arrowhead" d="{head}" fill="currentColor"/> <path class="displacy-arrowhead" d="{head}" fill="currentColor"/>

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
@ -10,14 +11,17 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class EnglishDefaults(Language.Defaults): class EnglishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'en' lex_attr_getters[LANG] = lambda text: 'en'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
BASE_NORMS, NORM_EXCEPTIONS)
lex_attr_getters.update(LEX_ATTRS) lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
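The add_lookups() helper used here is added to spacy/util.py further down in this commit: each lookup table is consulted in order, and the original getter is only the fallback. A heavily hedged sketch of the observable effect — it assumes token.norm_ exposes the NORM attribute and that the bare English class can be instantiated without a statistical model:

    from spacy.lang.en import English

    nlp = English()
    doc = nlp(u"it costs £10")
    print([(t.text, t.norm_) for t in doc])
    # Expected (roughly): the currency symbol is normalised to '$' via
    # BASE_NORMS, while ordinary words keep their default norm.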

File diff suppressed because it is too large

View File

@ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
for pron in ["i"]: for pron in ["i"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'m"] = [ _exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}] {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}]
_exc[orth + "m"] = [ _exc[orth + "m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }] {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
_exc[orth + "'ma"] = [ _exc[orth + "'ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", NORM: "am"}, {ORTH: "'m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}] {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
_exc[orth + "ma"] = [ _exc[orth + "ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", NORM: "am"}, {ORTH: "m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}] {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
@ -36,72 +36,72 @@ for pron in ["i"]:
for pron in ["i", "you", "he", "she", "it", "we", "they"]: for pron in ["i", "you", "he", "she", "it", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}] {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [ _exc[orth + "ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}] {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [ _exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}, {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [ _exc[orth + "llve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}, {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}] {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}] {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [ _exc[orth + "dve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["i", "you", "we", "they"]: for pron in ["i", "you", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["you", "we", "they"]: for pron in ["you", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}] {ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}] {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
for pron in ["he", "she", "it"]: for pron in ["he", "she", "it"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'s"}] {ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [ _exc[orth + "s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "s"}] {ORTH: "s"}]
@ -110,111 +110,111 @@ for pron in ["he", "she", "it"]:
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
for orth in [word, word.title()]: for orth in [word, word.title()]:
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'s"}] {ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [ _exc[orth + "s"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "s"}] {ORTH: "s"}]
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}] {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [ _exc[orth + "ll"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}] {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [ _exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}, {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [ _exc[orth + "llve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}, {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'re", LEMMA: "be", NORM: "are"}] {ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "re", LEMMA: "be", NORM: "are"}] {ORTH: "re", LEMMA: "be", NORM: "are"}]
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d"}] {ORTH: "'d", NORM: "'d"}]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d"}] {ORTH: "d"}]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [ _exc[orth + "dve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
# Verbs # Verbs
for verb_data in [ for verb_data in [
{ORTH: "ca", LEMMA: "can", TAG: "MD"}, {ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "could", TAG: "MD"}, {ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "do", LEMMA: "do"}, {ORTH: "do", LEMMA: "do", NORM: "do"},
{ORTH: "does", LEMMA: "do"}, {ORTH: "does", LEMMA: "do", NORM: "does"},
{ORTH: "did", LEMMA: "do", TAG: "VBD"}, {ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"},
{ORTH: "had", LEMMA: "have", TAG: "VBD"}, {ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"},
{ORTH: "may", TAG: "MD"}, {ORTH: "may", NORM: "may", TAG: "MD"},
{ORTH: "might", TAG: "MD"}, {ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must", TAG: "MD"}, {ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "need"}, {ORTH: "need", NORM: "need"},
{ORTH: "ought"}, {ORTH: "ought", NORM: "ought", TAG: "MD"},
{ORTH: "sha", LEMMA: "shall", TAG: "MD"}, {ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
{ORTH: "should", TAG: "MD"}, {ORTH: "should", NORM: "should", TAG: "MD"},
{ORTH: "wo", LEMMA: "will", TAG: "MD"}, {ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "would", TAG: "MD"}]: {ORTH: "would", NORM: "would", TAG: "MD"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [ _exc[data[ORTH] + "n't"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}] {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [ _exc[data[ORTH] + "nt"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}] {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "n't've"] = [ _exc[data[ORTH] + "n't've"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}, {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[data[ORTH] + "ntve"] = [ _exc[data[ORTH] + "ntve"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}, {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for verb_data in [ for verb_data in [
{ORTH: "could", TAG: "MD"}, {ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "might"}, {ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must"}, {ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "should"}]: {ORTH: "should", NORM: "should", TAG: "MD"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
@ -228,21 +228,21 @@ for verb_data in [
for verb_data in [ for verb_data in [
{ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, {ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2},
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", TAG: "VBZ"}, {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be"}, {ORTH: "was", LEMMA: "be", NORM: "was"},
{ORTH: "were", LEMMA: "be"}]: {ORTH: "were", LEMMA: "be", NORM: "were"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [ _exc[data[ORTH] + "n't"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}] {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [ _exc[data[ORTH] + "nt"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}] {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
# Other contractions with trailing apostrophe # Other contractions with trailing apostrophe
@ -250,10 +250,10 @@ for verb_data in [
for exc_data in [ for exc_data in [
{ORTH: "doin", LEMMA: "do", NORM: "doing"}, {ORTH: "doin", LEMMA: "do", NORM: "doing"},
{ORTH: "goin", LEMMA: "go", NORM: "going"}, {ORTH: "goin", LEMMA: "go", NORM: "going"},
{ORTH: "nothin", LEMMA: "nothing"}, {ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing"}, {ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "ol", LEMMA: "old"}, {ORTH: "ol", LEMMA: "old", NORM: "old"},
{ORTH: "somethin", LEMMA: "something"}]: {ORTH: "somethin", LEMMA: "something", NORM: "something"}]:
exc_data_tc = dict(exc_data) exc_data_tc = dict(exc_data)
exc_data_tc[ORTH] = exc_data_tc[ORTH].title() exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
for data in [exc_data, exc_data_tc]: for data in [exc_data, exc_data_tc]:
@ -266,10 +266,10 @@ for exc_data in [
# Other contractions with leading apostrophe # Other contractions with leading apostrophe
for exc_data in [ for exc_data in [
{ORTH: "cause", LEMMA: "because"}, {ORTH: "cause", LEMMA: "because", NORM: "because"},
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
{ORTH: "ll", LEMMA: "will"}, {ORTH: "ll", LEMMA: "will", NORM: "will"},
{ORTH: "nuff", LEMMA: "enough"}]: {ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]:
exc_data_apos = dict(exc_data) exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]: for data in [exc_data, exc_data_apos]:
@ -282,11 +282,11 @@ for h in range(1, 12 + 1):
for period in ["a.m.", "am"]: for period in ["a.m.", "am"]:
_exc["%d%s" % (h, period)] = [ _exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h}, {ORTH: "%d" % h},
{ORTH: period, LEMMA: "a.m."}] {ORTH: period, LEMMA: "a.m.", NORM: "a.m."}]
for period in ["p.m.", "pm"]: for period in ["p.m.", "pm"]:
_exc["%d%s" % (h, period)] = [ _exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h}, {ORTH: "%d" % h},
{ORTH: period, LEMMA: "p.m."}] {ORTH: period, LEMMA: "p.m.", NORM: "p.m."}]
# Rest # Rest
@ -306,56 +306,56 @@ _other_exc = {
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"How'd'y": [ "How'd'y": [
{ORTH: "How", LEMMA: "how"}, {ORTH: "How", LEMMA: "how", NORM: "how"},
{ORTH: "'d", LEMMA: "do"}, {ORTH: "'d", LEMMA: "do"},
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"not've": [ "not've": [
{ORTH: "not", LEMMA: "not", TAG: "RB"}, {ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}], {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"notve": [ "notve": [
{ORTH: "not", LEMMA: "not", TAG: "RB"}, {ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}], {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Not've": [ "Not've": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"}, {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}], {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Notve": [ "Notve": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"}, {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}], {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"cannot": [ "cannot": [
{ORTH: "can", LEMMA: "can", TAG: "MD"}, {ORTH: "can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}], {ORTH: "not", LEMMA: "not", TAG: "RB"}],
"Cannot": [ "Cannot": [
{ORTH: "Can", LEMMA: "can", TAG: "MD"}, {ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}], {ORTH: "not", LEMMA: "not", TAG: "RB"}],
"gonna": [ "gonna": [
{ORTH: "gon", LEMMA: "go", NORM: "going"}, {ORTH: "gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}], {ORTH: "na", LEMMA: "to", NORM: "to"}],
"Gonna": [ "Gonna": [
{ORTH: "Gon", LEMMA: "go", NORM: "going"}, {ORTH: "Gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}], {ORTH: "na", LEMMA: "to", NORM: "to"}],
"gotta": [ "gotta": [
{ORTH: "got"}, {ORTH: "got"},
{ORTH: "ta", LEMMA: "to"}], {ORTH: "ta", LEMMA: "to", NORM: "to"}],
"Gotta": [ "Gotta": [
{ORTH: "Got"}, {ORTH: "Got", NORM: "got"},
{ORTH: "ta", LEMMA: "to"}], {ORTH: "ta", LEMMA: "to", NORM: "to"}],
"let's": [ "let's": [
{ORTH: "let"}, {ORTH: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}], {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
"Let's": [ "Let's": [
{ORTH: "Let", LEMMA: "let"}, {ORTH: "Let", LEMMA: "let", NORM: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}] {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
} }
@ -363,72 +363,80 @@ _exc.update(_other_exc)
for exc_data in [ for exc_data in [
{ORTH: "'S", LEMMA: "'s"}, {ORTH: "'S", LEMMA: "'s", NORM: "'s"},
{ORTH: "'s", LEMMA: "'s"}, {ORTH: "'s", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018S", LEMMA: "'s"}, {ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018s", LEMMA: "'s"}, {ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"},
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}, {ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"},
{ORTH: "w/o", LEMMA: "without", NORM: "without"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}, {ORTH: "'re", LEMMA: "be", NORM: "are"},
{ORTH: "'Cause", LEMMA: "because"}, {ORTH: "'Cause", LEMMA: "because", NORM: "because"},
{ORTH: "'cause", LEMMA: "because"}, {ORTH: "'cause", LEMMA: "because", NORM: "because"},
{ORTH: "ma'am", LEMMA: "madam"}, {ORTH: "'cos", LEMMA: "because", NORM: "because"},
{ORTH: "Ma'am", LEMMA: "madam"}, {ORTH: "'Cos", LEMMA: "because", NORM: "because"},
{ORTH: "o'clock", LEMMA: "o'clock"}, {ORTH: "'coz", LEMMA: "because", NORM: "because"},
{ORTH: "O'clock", LEMMA: "o'clock"}, {ORTH: "'Coz", LEMMA: "because", NORM: "because"},
{ORTH: "'cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'Cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'bout", LEMMA: "about", NORM: "about"},
{ORTH: "ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "Mt.", LEMMA: "Mount"}, {ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska"}, {ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama"}, {ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
{ORTH: "Apr.", LEMMA: "April"}, {ORTH: "Apr.", LEMMA: "April", NORM: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona"}, {ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas"}, {ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August"}, {ORTH: "Aug.", LEMMA: "August", NORM: "August"},
{ORTH: "Calif.", LEMMA: "California"}, {ORTH: "Calif.", LEMMA: "California", NORM: "California"},
{ORTH: "Colo.", LEMMA: "Colorado"}, {ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut"}, {ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December"}, {ORTH: "Dec.", LEMMA: "December", NORM: "December"},
{ORTH: "Del.", LEMMA: "Delaware"}, {ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"},
{ORTH: "Feb.", LEMMA: "February"}, {ORTH: "Feb.", LEMMA: "February", NORM: "February"},
{ORTH: "Fla.", LEMMA: "Florida"}, {ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia"}, {ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa"}, {ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho"}, {ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois"}, {ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana"}, {ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"},
{ORTH: "Jan.", LEMMA: "January"}, {ORTH: "Jan.", LEMMA: "January", NORM: "January"},
{ORTH: "Jul.", LEMMA: "July"}, {ORTH: "Jul.", LEMMA: "July", NORM: "July"},
{ORTH: "Jun.", LEMMA: "June"}, {ORTH: "Jun.", LEMMA: "June", NORM: "June"},
{ORTH: "Kan.", LEMMA: "Kansas"}, {ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas"}, {ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky"}, {ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana"}, {ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March"}, {ORTH: "Mar.", LEMMA: "March", NORM: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts"}, {ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"},
{ORTH: "May.", LEMMA: "May"}, {ORTH: "May.", LEMMA: "May", NORM: "May"},
{ORTH: "Mich.", LEMMA: "Michigan"}, {ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota"}, {ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi"}, {ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina"}, {ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota"}, {ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire"}, {ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey"}, {ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico"}, {ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York"}, {ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska"}, {ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska"}, {ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada"}, {ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"},
{ORTH: "Nov.", LEMMA: "November"}, {ORTH: "Nov.", LEMMA: "November", NORM: "November"},
{ORTH: "Oct.", LEMMA: "October"}, {ORTH: "Oct.", LEMMA: "October", NORM: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma"}, {ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon"}, {ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania"}, {ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina"}, {ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September"}, {ORTH: "Sep.", LEMMA: "September", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September"}, {ORTH: "Sept.", LEMMA: "September", NORM: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee"}, {ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia"}, {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington"}, {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin"}]: {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [dict(exc_data)]
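Each entry above maps a surface form to the exact list of token dicts the tokenizer should emit for it, now including a NORM for most pieces. A hedged sketch of how such an entry is consumed at runtime, using the public add_special_case() API with a made-up contraction (the attribute constants come from spacy.attrs):

    from spacy.attrs import ORTH, LEMMA, NORM
    from spacy.lang.en import English

    nlp = English()
    nlp.tokenizer.add_special_case(u"whatcha", [
        {ORTH: u"what", LEMMA: u"what", NORM: u"what"},
        {ORTH: u"cha", LEMMA: u"you", NORM: u"you"}])

    print([t.text for t in nlp(u"whatcha doing?")])
    # ['what', 'cha', 'doing', '?']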

View File

@ -0,0 +1,46 @@
# coding: utf8
from __future__ import unicode_literals
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.
# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.
BASE_NORMS = {
"'s": "'s",
"'S": "'s",
"s": "'s",
"S": "'s",
"": "'",
"": "'",
"´": "'",
"`": "'",
"": '"',
"": '"',
"''": '"',
"``": '"',
"´´": '"',
"": '"',
"»": '"',
"«": '"',
"": "...",
"": "-",
"": "-",
"--": "-",
"---": "-",
"": "$",
"£": "$",
"¥": "$",
"฿": "$",
"US$": "$",
"C$": "$",
"A$": "$"
}
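A couple of spot checks on the table above, assuming the new module lands at spacy/lang/norm_exceptions.py as the relative import in the English defaults suggests:

    from spacy.lang.norm_exceptions import BASE_NORMS

    # Fancy quotes, dashes and currency symbols all collapse onto plain ASCII
    # stand-ins, so equivalent tokens end up with similar representations.
    assert BASE_NORMS[u"``"] == u'"'
    assert BASE_NORMS[u"---"] == u"-"
    assert BASE_NORMS[u"US$"] == u"$"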

View File

@ -0,0 +1,33 @@
# coding: utf-8
from __future__ import unicode_literals
from ...util import get_lang_class
from ..util import make_tempdir, assert_packed_msg_equal
import pytest
def load_tokenizer(b):
tok = get_lang_class('en').Defaults.create_tokenizer()
tok.from_bytes(b)
return tok
@pytest.mark.parametrize('text', ["I💜you", "theyre", "“hello”"])
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
tokenizer = en_tokenizer
new_tokenizer = load_tokenizer(tokenizer.to_bytes())
assert_packed_msg_equal(new_tokenizer.to_bytes(), tokenizer.to_bytes())
# assert new_tokenizer.to_bytes() == tokenizer.to_bytes()
doc1 = tokenizer(text)
doc2 = new_tokenizer(text)
assert [token.text for token in doc1] == [token.text for token in doc2]
def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
tokenizer = en_tokenizer
with make_tempdir() as d:
file_path = d / 'tokenizer'
tokenizer.to_disk(file_path)
tokenizer_d = en_tokenizer.from_disk(file_path)
assert tokenizer.to_bytes() == tokenizer_d.to_bytes()

View File

@ -10,6 +10,7 @@ import numpy
import tempfile import tempfile
import shutil import shutil
import contextlib import contextlib
import msgpack
from pathlib import Path from pathlib import Path
@ -105,3 +106,13 @@ def assert_docs_equal(doc1, doc2):
assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ] assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ]
assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ] assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ]
assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ] assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ]
def assert_packed_msg_equal(b1, b2):
"""Assert that two packed msgpack messages are equal."""
msg1 = msgpack.loads(b1, encoding='utf8')
msg2 = msgpack.loads(b2, encoding='utf8')
assert sorted(msg1.keys()) == sorted(msg2.keys())
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
assert k1 == k2
assert v1 == v2

View File

@ -2,6 +2,7 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from collections import OrderedDict
from cython.operator cimport dereference as deref from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
@ -355,14 +356,14 @@ cdef class Tokenizer:
**exclude: Named attributes to prevent from being serialized. **exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Tokenizer` object. RETURNS (bytes): The serialized form of the `Tokenizer` object.
""" """
serializers = { serializers = OrderedDict((
'vocab': lambda: self.vocab.to_bytes(), ('vocab', lambda: self.vocab.to_bytes()),
'prefix_search': lambda: self.prefix_search.__self__.pattern, ('prefix_search', lambda: self.prefix_search.__self__.pattern),
'suffix_search': lambda: self.suffix_search.__self__.pattern, ('suffix_search', lambda: self.suffix_search.__self__.pattern),
'infix_finditer': lambda: self.infix_finditer.__self__.pattern, ('infix_finditer', lambda: self.infix_finditer.__self__.pattern),
'token_match': lambda: self.token_match.__self__.pattern, ('token_match', lambda: self.token_match.__self__.pattern),
'exceptions': lambda: self._rules ('exceptions', lambda: OrderedDict(sorted(self._rules.items())))
} ))
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **exclude):
@ -372,15 +373,15 @@ cdef class Tokenizer:
**exclude: Named attributes to prevent from being loaded. **exclude: Named attributes to prevent from being loaded.
RETURNS (Tokenizer): The `Tokenizer` object. RETURNS (Tokenizer): The `Tokenizer` object.
""" """
data = {} data = OrderedDict()
deserializers = { deserializers = OrderedDict((
'vocab': lambda b: self.vocab.from_bytes(b), ('vocab', lambda b: self.vocab.from_bytes(b)),
'prefix_search': lambda b: data.setdefault('prefix', b), ('prefix_search', lambda b: data.setdefault('prefix', b)),
'suffix_search': lambda b: data.setdefault('suffix_search', b), ('suffix_search', lambda b: data.setdefault('suffix_search', b)),
'infix_finditer': lambda b: data.setdefault('infix_finditer', b), ('infix_finditer', lambda b: data.setdefault('infix_finditer', b)),
'token_match': lambda b: data.setdefault('token_match', b), ('token_match', lambda b: data.setdefault('token_match', b)),
'exceptions': lambda b: data.setdefault('rules', b) ('exceptions', lambda b: data.setdefault('rules', b))
} ))
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'prefix_search' in data: if 'prefix_search' in data:
self.prefix_search = re.compile(data['prefix_search']).search self.prefix_search = re.compile(data['prefix_search']).search
@ -392,3 +393,4 @@ cdef class Tokenizer:
self.token_match = re.compile(data['token_match']).search self.token_match = re.compile(data['token_match']).search
for string, substrings in data.get('rules', {}).items(): for string, substrings in data.get('rules', {}).items():
self.add_special_case(string, substrings) self.add_special_case(string, substrings)
return self
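Switching the (de)serializers to OrderedDict — and sorting the exception rules — makes to_bytes() deterministic, which is what the new assert_packed_msg_equal test relies on. A small standalone illustration of why the order matters for byte-level comparison (plain msgpack, not spaCy's wrappers):

    import msgpack
    from collections import OrderedDict

    a = OrderedDict([('vocab', b'v'), ('exceptions', b'e')])
    b = OrderedDict([('exceptions', b'e'), ('vocab', b'v')])

    assert dict(a) == dict(b)                    # equal as mappings...
    assert msgpack.dumps(a) != msgpack.dumps(b)  # ...but not byte-for-byte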

View File

@ -437,7 +437,8 @@ cdef class Doc:
""" """
def __get__(self): def __get__(self):
if 'sents' in self.user_hooks: if 'sents' in self.user_hooks:
return self.user_hooks['sents'](self) yield from self.user_hooks['sents'](self)
return
if not self.is_parsed: if not self.is_parsed:
raise ValueError( raise ValueError(
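The sents property is a generator, so the old return self.user_hooks['sents'](self) silently discarded the hook's result instead of forwarding it. A standalone Python 3 illustration of that pitfall (plain functions, not the Cython property itself):

    def broken(hook=None):
        if hook is not None:
            return hook()       # inside a generator this just stops iteration
        yield 'fallback'

    def fixed(hook=None):
        if hook is not None:
            yield from hook()   # forward the hook's sentences
            return
        yield 'fallback'

    assert list(broken(lambda: ['a', 'b'])) == []
    assert list(fixed(lambda: ['a', 'b'])) == ['a', 'b']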

View File

@ -299,6 +299,22 @@ def compile_infix_regex(entries):
return re.compile(expression) return re.compile(expression)
def add_lookups(default_func, *lookups):
"""Extend an attribute function with special cases. If a word is in the
lookups, the value is returned. Otherwise the previous function is used.
default_func (callable): The default function to execute.
*lookups (dict): Lookup dictionary mapping string to attribute value.
RETURNS (callable): Lexical attribute getter.
"""
def get_attr(string):
for lookup in lookups:
if string in lookup:
return lookup[string]
return default_func(string)
return get_attr
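A short usage sketch for the new helper, with toy lookup tables standing in for BASE_NORMS and a language-specific exception dict:

    from spacy.util import add_lookups

    base = {u"£": u"$"}
    exceptions = {u"gonna": u"going to"}

    get_norm = add_lookups(lambda string: string.lower(), base, exceptions)

    assert get_norm(u"gonna") == u"going to"   # hit in a lookup table
    assert get_norm(u"Hello") == u"hello"      # falls back to the default getter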
def update_exc(base_exceptions, *addition_dicts): def update_exc(base_exceptions, *addition_dicts):
"""Update and validate tokenizer exceptions. Will overwrite exceptions. """Update and validate tokenizer exceptions. Will overwrite exceptions.

View File

@ -231,11 +231,13 @@ cdef class Vocab:
props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True) props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
token = &tokens[i] token = &tokens[i]
# Set the special tokens up to have arbitrary attributes # Set the special tokens up to have arbitrary attributes
token.lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH]) lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
token.lex = lex
if attrs.TAG in props: if attrs.TAG in props:
self.morphology.assign_tag(token, props[attrs.TAG]) self.morphology.assign_tag(token, props[attrs.TAG])
for attr_id, value in props.items(): for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value) Token.set_struct_attr(token, attr_id, value)
Lexeme.set_struct_attr(lex, attr_id, value)
return tokens return tokens
@property @property

View File

@ -205,7 +205,7 @@ p
+cell #[code arrow_spacing] +cell #[code arrow_spacing]
+cell int +cell int
+cell Spacing between arrows in px to avoid overlaps. +cell Spacing between arrows in px to avoid overlaps.
+cell #[code 20] +cell #[code 20] / #[code 12] (compact)
+row +row
+cell #[code word_spacing] +cell #[code word_spacing]

View File

@ -64,7 +64,7 @@ p
doc = nlp(u'Give it back! He pleaded.') doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == 'Give' assert doc[0].text == 'Give'
assert doc[-1].text == '.' assert doc[-1].text == '.'
span = doc[1:1] span = doc[1:3]
assert span.text == 'it back' assert span.text == 'it back'
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])

View File

@ -141,7 +141,7 @@ p
else: else:
tokens.append(substring) tokens.append(substring)
substring = '' substring = ''
tokens.extend(suffixes) tokens.extend(reversed(suffixes))
return tokens return tokens
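The reversed() fix matters because the pseudo-code peels suffixes off the end of the substring one at a time, so they accumulate in right-to-left order. A tiny sketch with a made-up two-suffix splitter:

    def split_suffixes(substring, suffixes=(')', '.')):
        found = []
        while substring and substring.endswith(suffixes):
            found.append(substring[-1])       # peeled from the right
            substring = substring[:-1]
        return substring, found

    core, found = split_suffixes("(spaCy).")
    # found == ['.', ')'] -- reversing restores left-to-right reading order
    assert [core] + list(reversed(found)) == ["(spaCy", ")", "."]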
p p

View File

@ -59,9 +59,11 @@ p
| to customise the layout, for example: | to customise the layout, for example:
+aside("Important note") +aside("Important note")
| There's currently a known issue with the #[code compact] mode for long | There's currently a known issue with the #[code compact] mode for
| sentences with arrow spacing. If the spacing is larger than the arc | sentences with short arrows and long dependency labels, that causes labels
| itself, it'll cause the arc and its label to flip. | longer than the arrow to wrap. So if you come across this problem,
| especially when using custom labels, you'll have to increase the
| #[code distance] setting in the #[code options] to allow longer arcs.
+table(["Name", "Type", "Description", "Default"]) +table(["Name", "Type", "Description", "Default"])
+row +row