mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-25 00:34:20 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
94e063ae2a
|
@ -3,7 +3,7 @@ pathlib
|
|||
numpy>=1.7
|
||||
cymem>=1.30,<1.32
|
||||
preshed>=1.0.0,<2.0.0
|
||||
thinc>=6.7.1,<6.8.0
|
||||
thinc>=6.7.2,<6.8.0
|
||||
murmurhash>=0.28,<0.29
|
||||
plac<1.0.0,>=0.9.6
|
||||
six
|
||||
|
|
2
setup.py
2
setup.py
|
@ -191,7 +191,7 @@ def setup_package():
|
|||
'murmurhash>=0.28,<0.29',
|
||||
'cymem>=1.30,<1.32',
|
||||
'preshed>=1.0.0,<2.0.0',
|
||||
'thinc>=6.7.1,<6.8.0',
|
||||
'thinc>=6.7.2,<6.8.0',
|
||||
'plac<1.0.0,>=0.9.6',
|
||||
'pip>=9.0.0,<10.0.0',
|
||||
'six',
|
||||
|
|
|
@ -56,7 +56,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
|||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||
httpd = simple_server.make_server('0.0.0.0', port, app)
|
||||
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
|
||||
try:
|
||||
httpd.serve_forever()
|
||||
except KeyboardInterrupt:
|
||||
prints("Shutting down server on port %d." % port)
|
||||
finally:
|
||||
httpd.server_close()
|
||||
|
||||
|
||||
def app(environ, start_response):
|
||||
|
@ -65,12 +70,13 @@ def app(environ, start_response):
|
|||
return [res]
|
||||
|
||||
|
||||
def parse_deps(doc, options={}):
|
||||
def parse_deps(orig_doc, options={}):
|
||||
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
||||
|
||||
doc (Doc): Document do parse.
|
||||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||
"""
|
||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
||||
if options.get('collapse_punct', True):
|
||||
spans = []
|
||||
for word in doc[:-1]:
|
||||
|
|
|
@ -18,12 +18,11 @@ class DependencyRenderer(object):
|
|||
offset_x, color, bg, font)
|
||||
"""
|
||||
self.compact = options.get('compact', False)
|
||||
distance, arrow_width = (85, 8) if self.compact else (175, 10)
|
||||
self.word_spacing = options.get('word_spacing', 45)
|
||||
self.arrow_spacing = options.get('arrow_spacing', 20)
|
||||
self.arrow_width = options.get('arrow_width', arrow_width)
|
||||
self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
|
||||
self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
|
||||
self.arrow_stroke = options.get('arrow_stroke', 2)
|
||||
self.distance = options.get('distance', distance)
|
||||
self.distance = options.get('distance', 150 if self.compact else 175)
|
||||
self.offset_x = options.get('offset_x', 50)
|
||||
self.color = options.get('color', '#000000')
|
||||
self.bg = options.get('bg', '#ffffff')
|
||||
|
@ -99,6 +98,8 @@ class DependencyRenderer(object):
|
|||
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
|
||||
-self.arrow_spacing*(self.highest_level-level)/4)
|
||||
y_curve = self.offset_y-level*self.distance/2
|
||||
if self.compact:
|
||||
y_curve = self.offset_y-level*self.distance/6
|
||||
if y_curve == 0 and len(self.levels) > 5:
|
||||
y_curve = -self.distance
|
||||
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
|
||||
|
|
|
@ -21,7 +21,7 @@ TPL_DEP_WORDS = """
|
|||
TPL_DEP_ARCS = """
|
||||
<g class="displacy-arrow">
|
||||
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
|
||||
<text dy="1.25em" style="font-size: 0.8em">
|
||||
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
|
||||
<textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
|
||||
</text>
|
||||
<path class="displacy-arrowhead" d="{head}" fill="currentColor"/>
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
@ -10,14 +11,17 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
|
|||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class EnglishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'en'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
||||
BASE_NORMS, NORM_EXCEPTIONS)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
|
|
1761
spacy/lang/en/norm_exceptions.py
Normal file
1761
spacy/lang/en/norm_exceptions.py
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
|
|||
for pron in ["i"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'m"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}]
|
||||
|
||||
_exc[orth + "m"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
|
||||
|
||||
_exc[orth + "'ma"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'m", LEMMA: "be", NORM: "am"},
|
||||
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
|
||||
|
||||
_exc[orth + "ma"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "m", LEMMA: "be", NORM: "am"},
|
||||
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
|
||||
|
||||
|
@ -36,72 +36,72 @@ for pron in ["i"]:
|
|||
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'ll"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "ll"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "'ll've"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "llve"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "'d"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'d", LEMMA: "would", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "d"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "d", LEMMA: "would", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "'d've"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "dve"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "d", LEMMA: "would", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
|
||||
for pron in ["i", "you", "we", "they"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'ve"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "ve"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
|
||||
for pron in ["you", "we", "they"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'re"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
|
||||
|
||||
_exc[orth + "re"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
|
||||
|
||||
|
||||
for pron in ["he", "she", "it"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'s"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'s"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'s", NORM: "'s"}]
|
||||
|
||||
_exc[orth + "s"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "s"}]
|
||||
|
||||
|
||||
|
@ -110,111 +110,111 @@ for pron in ["he", "she", "it"]:
|
|||
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
|
||||
for orth in [word, word.title()]:
|
||||
_exc[orth + "'s"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'s"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'s", NORM: "'s"}]
|
||||
|
||||
_exc[orth + "s"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "s"}]
|
||||
|
||||
_exc[orth + "'ll"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "ll"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "'ll've"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "llve"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "'re"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
|
||||
|
||||
_exc[orth + "re"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "re", LEMMA: "be", NORM: "are"}]
|
||||
|
||||
_exc[orth + "'ve"] = [
|
||||
{ORTH: orth},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "ve"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "'d"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'d"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'d", NORM: "'d"}]
|
||||
|
||||
_exc[orth + "d"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "d"}]
|
||||
|
||||
_exc[orth + "'d've"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "dve"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "d", LEMMA: "would", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
|
||||
# Verbs
|
||||
|
||||
for verb_data in [
|
||||
{ORTH: "ca", LEMMA: "can", TAG: "MD"},
|
||||
{ORTH: "could", TAG: "MD"},
|
||||
{ORTH: "do", LEMMA: "do"},
|
||||
{ORTH: "does", LEMMA: "do"},
|
||||
{ORTH: "did", LEMMA: "do", TAG: "VBD"},
|
||||
{ORTH: "had", LEMMA: "have", TAG: "VBD"},
|
||||
{ORTH: "may", TAG: "MD"},
|
||||
{ORTH: "might", TAG: "MD"},
|
||||
{ORTH: "must", TAG: "MD"},
|
||||
{ORTH: "need"},
|
||||
{ORTH: "ought"},
|
||||
{ORTH: "sha", LEMMA: "shall", TAG: "MD"},
|
||||
{ORTH: "should", TAG: "MD"},
|
||||
{ORTH: "wo", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "would", TAG: "MD"}]:
|
||||
{ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"},
|
||||
{ORTH: "could", NORM: "could", TAG: "MD"},
|
||||
{ORTH: "do", LEMMA: "do", NORM: "do"},
|
||||
{ORTH: "does", LEMMA: "do", NORM: "does"},
|
||||
{ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"},
|
||||
{ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"},
|
||||
{ORTH: "may", NORM: "may", TAG: "MD"},
|
||||
{ORTH: "might", NORM: "might", TAG: "MD"},
|
||||
{ORTH: "must", NORM: "must", TAG: "MD"},
|
||||
{ORTH: "need", NORM: "need"},
|
||||
{ORTH: "ought", NORM: "ought", TAG: "MD"},
|
||||
{ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
|
||||
{ORTH: "should", NORM: "should", TAG: "MD"},
|
||||
{ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "would", NORM: "would", TAG: "MD"}]:
|
||||
verb_data_tc = dict(verb_data)
|
||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||
for data in [verb_data, verb_data_tc]:
|
||||
_exc[data[ORTH] + "n't"] = [
|
||||
dict(data),
|
||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
|
||||
_exc[data[ORTH] + "nt"] = [
|
||||
dict(data),
|
||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
|
||||
_exc[data[ORTH] + "n't've"] = [
|
||||
dict(data),
|
||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[data[ORTH] + "ntve"] = [
|
||||
dict(data),
|
||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
|
||||
for verb_data in [
|
||||
{ORTH: "could", TAG: "MD"},
|
||||
{ORTH: "might"},
|
||||
{ORTH: "must"},
|
||||
{ORTH: "should"}]:
|
||||
{ORTH: "could", NORM: "could", TAG: "MD"},
|
||||
{ORTH: "might", NORM: "might", TAG: "MD"},
|
||||
{ORTH: "must", NORM: "must", TAG: "MD"},
|
||||
{ORTH: "should", NORM: "should", TAG: "MD"}]:
|
||||
verb_data_tc = dict(verb_data)
|
||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||
for data in [verb_data, verb_data_tc]:
|
||||
|
@ -228,21 +228,21 @@ for verb_data in [
|
|||
|
||||
|
||||
for verb_data in [
|
||||
{ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"},
|
||||
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
|
||||
{ORTH: "is", LEMMA: "be", TAG: "VBZ"},
|
||||
{ORTH: "was", LEMMA: "be"},
|
||||
{ORTH: "were", LEMMA: "be"}]:
|
||||
{ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2},
|
||||
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
|
||||
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
|
||||
{ORTH: "was", LEMMA: "be", NORM: "was"},
|
||||
{ORTH: "were", LEMMA: "be", NORM: "were"}]:
|
||||
verb_data_tc = dict(verb_data)
|
||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||
for data in [verb_data, verb_data_tc]:
|
||||
_exc[data[ORTH] + "n't"] = [
|
||||
dict(data),
|
||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
|
||||
_exc[data[ORTH] + "nt"] = [
|
||||
dict(data),
|
||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
|
||||
|
||||
# Other contractions with trailing apostrophe
|
||||
|
@ -250,10 +250,10 @@ for verb_data in [
|
|||
for exc_data in [
|
||||
{ORTH: "doin", LEMMA: "do", NORM: "doing"},
|
||||
{ORTH: "goin", LEMMA: "go", NORM: "going"},
|
||||
{ORTH: "nothin", LEMMA: "nothing"},
|
||||
{ORTH: "nuthin", LEMMA: "nothing"},
|
||||
{ORTH: "ol", LEMMA: "old"},
|
||||
{ORTH: "somethin", LEMMA: "something"}]:
|
||||
{ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
|
||||
{ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
|
||||
{ORTH: "ol", LEMMA: "old", NORM: "old"},
|
||||
{ORTH: "somethin", LEMMA: "something", NORM: "something"}]:
|
||||
exc_data_tc = dict(exc_data)
|
||||
exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
|
||||
for data in [exc_data, exc_data_tc]:
|
||||
|
@ -266,10 +266,10 @@ for exc_data in [
|
|||
# Other contractions with leading apostrophe
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: "cause", LEMMA: "because"},
|
||||
{ORTH: "cause", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
|
||||
{ORTH: "ll", LEMMA: "will"},
|
||||
{ORTH: "nuff", LEMMA: "enough"}]:
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will"},
|
||||
{ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]:
|
||||
exc_data_apos = dict(exc_data)
|
||||
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
|
||||
for data in [exc_data, exc_data_apos]:
|
||||
|
@ -282,11 +282,11 @@ for h in range(1, 12 + 1):
|
|||
for period in ["a.m.", "am"]:
|
||||
_exc["%d%s" % (h, period)] = [
|
||||
{ORTH: "%d" % h},
|
||||
{ORTH: period, LEMMA: "a.m."}]
|
||||
{ORTH: period, LEMMA: "a.m.", NORM: "a.m."}]
|
||||
for period in ["p.m.", "pm"]:
|
||||
_exc["%d%s" % (h, period)] = [
|
||||
{ORTH: "%d" % h},
|
||||
{ORTH: period, LEMMA: "p.m."}]
|
||||
{ORTH: period, LEMMA: "p.m.", NORM: "p.m."}]
|
||||
|
||||
|
||||
# Rest
|
||||
|
@ -306,56 +306,56 @@ _other_exc = {
|
|||
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
|
||||
|
||||
"How'd'y": [
|
||||
{ORTH: "How", LEMMA: "how"},
|
||||
{ORTH: "How", LEMMA: "how", NORM: "how"},
|
||||
{ORTH: "'d", LEMMA: "do"},
|
||||
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
|
||||
|
||||
"not've": [
|
||||
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
||||
|
||||
"notve": [
|
||||
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
||||
|
||||
"Not've": [
|
||||
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
|
||||
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
||||
|
||||
"Notve": [
|
||||
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
|
||||
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
||||
|
||||
"cannot": [
|
||||
{ORTH: "can", LEMMA: "can", TAG: "MD"},
|
||||
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
|
||||
|
||||
"Cannot": [
|
||||
{ORTH: "Can", LEMMA: "can", TAG: "MD"},
|
||||
{ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
|
||||
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
|
||||
|
||||
"gonna": [
|
||||
{ORTH: "gon", LEMMA: "go", NORM: "going"},
|
||||
{ORTH: "na", LEMMA: "to"}],
|
||||
{ORTH: "na", LEMMA: "to", NORM: "to"}],
|
||||
|
||||
"Gonna": [
|
||||
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
|
||||
{ORTH: "na", LEMMA: "to"}],
|
||||
{ORTH: "na", LEMMA: "to", NORM: "to"}],
|
||||
|
||||
"gotta": [
|
||||
{ORTH: "got"},
|
||||
{ORTH: "ta", LEMMA: "to"}],
|
||||
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
|
||||
|
||||
"Gotta": [
|
||||
{ORTH: "Got"},
|
||||
{ORTH: "ta", LEMMA: "to"}],
|
||||
{ORTH: "Got", NORM: "got"},
|
||||
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
|
||||
|
||||
"let's": [
|
||||
{ORTH: "let"},
|
||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
|
||||
|
||||
"Let's": [
|
||||
{ORTH: "Let", LEMMA: "let"},
|
||||
{ORTH: "Let", LEMMA: "let", NORM: "let"},
|
||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
|
||||
}
|
||||
|
||||
|
@ -363,72 +363,80 @@ _exc.update(_other_exc)
|
|||
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: "'S", LEMMA: "'s"},
|
||||
{ORTH: "'s", LEMMA: "'s"},
|
||||
{ORTH: "\u2018S", LEMMA: "'s"},
|
||||
{ORTH: "\u2018s", LEMMA: "'s"},
|
||||
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
|
||||
{ORTH: "'S", LEMMA: "'s", NORM: "'s"},
|
||||
{ORTH: "'s", LEMMA: "'s", NORM: "'s"},
|
||||
{ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"},
|
||||
{ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"},
|
||||
{ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"},
|
||||
{ORTH: "w/o", LEMMA: "without", NORM: "without"},
|
||||
{ORTH: "'re", LEMMA: "be", NORM: "are"},
|
||||
{ORTH: "'Cause", LEMMA: "because"},
|
||||
{ORTH: "'cause", LEMMA: "because"},
|
||||
{ORTH: "ma'am", LEMMA: "madam"},
|
||||
{ORTH: "Ma'am", LEMMA: "madam"},
|
||||
{ORTH: "o'clock", LEMMA: "o'clock"},
|
||||
{ORTH: "O'clock", LEMMA: "o'clock"},
|
||||
{ORTH: "'Cause", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'cause", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'cos", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'Cos", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'coz", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'Coz", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'cuz", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'Cuz", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'bout", LEMMA: "about", NORM: "about"},
|
||||
{ORTH: "ma'am", LEMMA: "madam", NORM: "madam"},
|
||||
{ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"},
|
||||
{ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"},
|
||||
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
|
||||
|
||||
{ORTH: "Mt.", LEMMA: "Mount"},
|
||||
{ORTH: "Ak.", LEMMA: "Alaska"},
|
||||
{ORTH: "Ala.", LEMMA: "Alabama"},
|
||||
{ORTH: "Apr.", LEMMA: "April"},
|
||||
{ORTH: "Ariz.", LEMMA: "Arizona"},
|
||||
{ORTH: "Ark.", LEMMA: "Arkansas"},
|
||||
{ORTH: "Aug.", LEMMA: "August"},
|
||||
{ORTH: "Calif.", LEMMA: "California"},
|
||||
{ORTH: "Colo.", LEMMA: "Colorado"},
|
||||
{ORTH: "Conn.", LEMMA: "Connecticut"},
|
||||
{ORTH: "Dec.", LEMMA: "December"},
|
||||
{ORTH: "Del.", LEMMA: "Delaware"},
|
||||
{ORTH: "Feb.", LEMMA: "February"},
|
||||
{ORTH: "Fla.", LEMMA: "Florida"},
|
||||
{ORTH: "Ga.", LEMMA: "Georgia"},
|
||||
{ORTH: "Ia.", LEMMA: "Iowa"},
|
||||
{ORTH: "Id.", LEMMA: "Idaho"},
|
||||
{ORTH: "Ill.", LEMMA: "Illinois"},
|
||||
{ORTH: "Ind.", LEMMA: "Indiana"},
|
||||
{ORTH: "Jan.", LEMMA: "January"},
|
||||
{ORTH: "Jul.", LEMMA: "July"},
|
||||
{ORTH: "Jun.", LEMMA: "June"},
|
||||
{ORTH: "Kan.", LEMMA: "Kansas"},
|
||||
{ORTH: "Kans.", LEMMA: "Kansas"},
|
||||
{ORTH: "Ky.", LEMMA: "Kentucky"},
|
||||
{ORTH: "La.", LEMMA: "Louisiana"},
|
||||
{ORTH: "Mar.", LEMMA: "March"},
|
||||
{ORTH: "Mass.", LEMMA: "Massachusetts"},
|
||||
{ORTH: "May.", LEMMA: "May"},
|
||||
{ORTH: "Mich.", LEMMA: "Michigan"},
|
||||
{ORTH: "Minn.", LEMMA: "Minnesota"},
|
||||
{ORTH: "Miss.", LEMMA: "Mississippi"},
|
||||
{ORTH: "N.C.", LEMMA: "North Carolina"},
|
||||
{ORTH: "N.D.", LEMMA: "North Dakota"},
|
||||
{ORTH: "N.H.", LEMMA: "New Hampshire"},
|
||||
{ORTH: "N.J.", LEMMA: "New Jersey"},
|
||||
{ORTH: "N.M.", LEMMA: "New Mexico"},
|
||||
{ORTH: "N.Y.", LEMMA: "New York"},
|
||||
{ORTH: "Neb.", LEMMA: "Nebraska"},
|
||||
{ORTH: "Nebr.", LEMMA: "Nebraska"},
|
||||
{ORTH: "Nev.", LEMMA: "Nevada"},
|
||||
{ORTH: "Nov.", LEMMA: "November"},
|
||||
{ORTH: "Oct.", LEMMA: "October"},
|
||||
{ORTH: "Okla.", LEMMA: "Oklahoma"},
|
||||
{ORTH: "Ore.", LEMMA: "Oregon"},
|
||||
{ORTH: "Pa.", LEMMA: "Pennsylvania"},
|
||||
{ORTH: "S.C.", LEMMA: "South Carolina"},
|
||||
{ORTH: "Sep.", LEMMA: "September"},
|
||||
{ORTH: "Sept.", LEMMA: "September"},
|
||||
{ORTH: "Tenn.", LEMMA: "Tennessee"},
|
||||
{ORTH: "Va.", LEMMA: "Virginia"},
|
||||
{ORTH: "Wash.", LEMMA: "Washington"},
|
||||
{ORTH: "Wis.", LEMMA: "Wisconsin"}]:
|
||||
{ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
|
||||
{ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
|
||||
{ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
|
||||
{ORTH: "Apr.", LEMMA: "April", NORM: "April"},
|
||||
{ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"},
|
||||
{ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"},
|
||||
{ORTH: "Aug.", LEMMA: "August", NORM: "August"},
|
||||
{ORTH: "Calif.", LEMMA: "California", NORM: "California"},
|
||||
{ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"},
|
||||
{ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"},
|
||||
{ORTH: "Dec.", LEMMA: "December", NORM: "December"},
|
||||
{ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"},
|
||||
{ORTH: "Feb.", LEMMA: "February", NORM: "February"},
|
||||
{ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"},
|
||||
{ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"},
|
||||
{ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"},
|
||||
{ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"},
|
||||
{ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"},
|
||||
{ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"},
|
||||
{ORTH: "Jan.", LEMMA: "January", NORM: "January"},
|
||||
{ORTH: "Jul.", LEMMA: "July", NORM: "July"},
|
||||
{ORTH: "Jun.", LEMMA: "June", NORM: "June"},
|
||||
{ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"},
|
||||
{ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"},
|
||||
{ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"},
|
||||
{ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"},
|
||||
{ORTH: "Mar.", LEMMA: "March", NORM: "March"},
|
||||
{ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"},
|
||||
{ORTH: "May.", LEMMA: "May", NORM: "May"},
|
||||
{ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"},
|
||||
{ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"},
|
||||
{ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"},
|
||||
{ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"},
|
||||
{ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"},
|
||||
{ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"},
|
||||
{ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"},
|
||||
{ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"},
|
||||
{ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"},
|
||||
{ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"},
|
||||
{ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"},
|
||||
{ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"},
|
||||
{ORTH: "Nov.", LEMMA: "November", NORM: "November"},
|
||||
{ORTH: "Oct.", LEMMA: "October", NORM: "October"},
|
||||
{ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"},
|
||||
{ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"},
|
||||
{ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"},
|
||||
{ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"},
|
||||
{ORTH: "Sep.", LEMMA: "September", NORM: "September"},
|
||||
{ORTH: "Sept.", LEMMA: "September", NORM: "September"},
|
||||
{ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
|
||||
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
|
||||
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
|
||||
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
|
||||
_exc[exc_data[ORTH]] = [dict(exc_data)]
|
||||
|
||||
|
||||
|
|
46
spacy/lang/norm_exceptions.py
Normal file
46
spacy/lang/norm_exceptions.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# These exceptions are used to add NORM values based on a token's ORTH value.
|
||||
# Individual languages can also add their own exceptions and overwrite them -
|
||||
# for example, British vs. American spelling in English.
|
||||
|
||||
# Norms are only set if no alternative is provided in the tokenizer exceptions.
|
||||
# Note that this does not change any other token attributes. Its main purpose
|
||||
# is to normalise the word representations so that equivalent tokens receive
|
||||
# similar representations. For example: $ and € are very different, but they're
|
||||
# both currency symbols. By normalising currency symbols to $, all symbols are
|
||||
# seen as similar, no matter how common they are in the training data.
|
||||
|
||||
|
||||
BASE_NORMS = {
|
||||
"'s": "'s",
|
||||
"'S": "'s",
|
||||
"’s": "'s",
|
||||
"’S": "'s",
|
||||
"’": "'",
|
||||
"‘": "'",
|
||||
"´": "'",
|
||||
"`": "'",
|
||||
"”": '"',
|
||||
"“": '"',
|
||||
"''": '"',
|
||||
"``": '"',
|
||||
"´´": '"',
|
||||
"„": '"',
|
||||
"»": '"',
|
||||
"«": '"',
|
||||
"…": "...",
|
||||
"—": "-",
|
||||
"–": "-",
|
||||
"--": "-",
|
||||
"---": "-",
|
||||
"€": "$",
|
||||
"£": "$",
|
||||
"¥": "$",
|
||||
"฿": "$",
|
||||
"US$": "$",
|
||||
"C$": "$",
|
||||
"A$": "$"
|
||||
}
|
33
spacy/tests/serialize/test_serialize_tokenizer.py
Normal file
33
spacy/tests/serialize/test_serialize_tokenizer.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...util import get_lang_class
|
||||
from ..util import make_tempdir, assert_packed_msg_equal
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def load_tokenizer(b):
|
||||
tok = get_lang_class('en').Defaults.create_tokenizer()
|
||||
tok.from_bytes(b)
|
||||
return tok
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["I💜you", "they’re", "“hello”"])
|
||||
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
|
||||
tokenizer = en_tokenizer
|
||||
new_tokenizer = load_tokenizer(tokenizer.to_bytes())
|
||||
assert_packed_msg_equal(new_tokenizer.to_bytes(), tokenizer.to_bytes())
|
||||
# assert new_tokenizer.to_bytes() == tokenizer.to_bytes()
|
||||
doc1 = tokenizer(text)
|
||||
doc2 = new_tokenizer(text)
|
||||
assert [token.text for token in doc1] == [token.text for token in doc2]
|
||||
|
||||
|
||||
def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
|
||||
tokenizer = en_tokenizer
|
||||
with make_tempdir() as d:
|
||||
file_path = d / 'tokenizer'
|
||||
tokenizer.to_disk(file_path)
|
||||
tokenizer_d = en_tokenizer.from_disk(file_path)
|
||||
assert tokenizer.to_bytes() == tokenizer_d.to_bytes()
|
|
@ -10,6 +10,7 @@ import numpy
|
|||
import tempfile
|
||||
import shutil
|
||||
import contextlib
|
||||
import msgpack
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
|
@ -105,3 +106,13 @@ def assert_docs_equal(doc1, doc2):
|
|||
assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ]
|
||||
assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ]
|
||||
assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ]
|
||||
|
||||
|
||||
def assert_packed_msg_equal(b1, b2):
|
||||
"""Assert that two packed msgpack messages are equal."""
|
||||
msg1 = msgpack.loads(b1, encoding='utf8')
|
||||
msg2 = msgpack.loads(b2, encoding='utf8')
|
||||
assert sorted(msg1.keys()) == sorted(msg2.keys())
|
||||
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
|
||||
assert k1 == k2
|
||||
assert v1 == v2
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from collections import OrderedDict
|
||||
from cython.operator cimport dereference as deref
|
||||
from cython.operator cimport preincrement as preinc
|
||||
from cymem.cymem cimport Pool
|
||||
|
@ -355,14 +356,14 @@ cdef class Tokenizer:
|
|||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `Tokenizer` object.
|
||||
"""
|
||||
serializers = {
|
||||
'vocab': lambda: self.vocab.to_bytes(),
|
||||
'prefix_search': lambda: self.prefix_search.__self__.pattern,
|
||||
'suffix_search': lambda: self.suffix_search.__self__.pattern,
|
||||
'infix_finditer': lambda: self.infix_finditer.__self__.pattern,
|
||||
'token_match': lambda: self.token_match.__self__.pattern,
|
||||
'exceptions': lambda: self._rules
|
||||
}
|
||||
serializers = OrderedDict((
|
||||
('vocab', lambda: self.vocab.to_bytes()),
|
||||
('prefix_search', lambda: self.prefix_search.__self__.pattern),
|
||||
('suffix_search', lambda: self.suffix_search.__self__.pattern),
|
||||
('infix_finditer', lambda: self.infix_finditer.__self__.pattern),
|
||||
('token_match', lambda: self.token_match.__self__.pattern),
|
||||
('exceptions', lambda: OrderedDict(sorted(self._rules.items())))
|
||||
))
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
|
@ -372,15 +373,15 @@ cdef class Tokenizer:
|
|||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (Tokenizer): The `Tokenizer` object.
|
||||
"""
|
||||
data = {}
|
||||
deserializers = {
|
||||
'vocab': lambda b: self.vocab.from_bytes(b),
|
||||
'prefix_search': lambda b: data.setdefault('prefix', b),
|
||||
'suffix_search': lambda b: data.setdefault('suffix_search', b),
|
||||
'infix_finditer': lambda b: data.setdefault('infix_finditer', b),
|
||||
'token_match': lambda b: data.setdefault('token_match', b),
|
||||
'exceptions': lambda b: data.setdefault('rules', b)
|
||||
}
|
||||
data = OrderedDict()
|
||||
deserializers = OrderedDict((
|
||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||
('prefix_search', lambda b: data.setdefault('prefix', b)),
|
||||
('suffix_search', lambda b: data.setdefault('suffix_search', b)),
|
||||
('infix_finditer', lambda b: data.setdefault('infix_finditer', b)),
|
||||
('token_match', lambda b: data.setdefault('token_match', b)),
|
||||
('exceptions', lambda b: data.setdefault('rules', b))
|
||||
))
|
||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
if 'prefix_search' in data:
|
||||
self.prefix_search = re.compile(data['prefix_search']).search
|
||||
|
@ -392,3 +393,4 @@ cdef class Tokenizer:
|
|||
self.token_match = re.compile(data['token_match']).search
|
||||
for string, substrings in data.get('rules', {}).items():
|
||||
self.add_special_case(string, substrings)
|
||||
return self
|
||||
|
|
|
@ -437,7 +437,8 @@ cdef class Doc:
|
|||
"""
|
||||
def __get__(self):
|
||||
if 'sents' in self.user_hooks:
|
||||
return self.user_hooks['sents'](self)
|
||||
yield from self.user_hooks['sents'](self)
|
||||
return
|
||||
|
||||
if not self.is_parsed:
|
||||
raise ValueError(
|
||||
|
|
|
@ -299,6 +299,22 @@ def compile_infix_regex(entries):
|
|||
return re.compile(expression)
|
||||
|
||||
|
||||
def add_lookups(default_func, *lookups):
|
||||
"""Extend an attribute function with special cases. If a word is in the
|
||||
lookups, the value is returned. Otherwise the previous function is used.
|
||||
|
||||
default_func (callable): The default function to execute.
|
||||
*lookups (dict): Lookup dictionary mapping string to attribute value.
|
||||
RETURNS (callable): Lexical attribute getter.
|
||||
"""
|
||||
def get_attr(string):
|
||||
for lookup in lookups:
|
||||
if string in lookup:
|
||||
return lookup[string]
|
||||
return default_func(string)
|
||||
return get_attr
|
||||
|
||||
|
||||
def update_exc(base_exceptions, *addition_dicts):
|
||||
"""Update and validate tokenizer exceptions. Will overwrite exceptions.
|
||||
|
||||
|
|
|
@ -231,11 +231,13 @@ cdef class Vocab:
|
|||
props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
|
||||
token = &tokens[i]
|
||||
# Set the special tokens up to have arbitrary attributes
|
||||
token.lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
|
||||
lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
|
||||
token.lex = lex
|
||||
if attrs.TAG in props:
|
||||
self.morphology.assign_tag(token, props[attrs.TAG])
|
||||
for attr_id, value in props.items():
|
||||
Token.set_struct_attr(token, attr_id, value)
|
||||
Lexeme.set_struct_attr(lex, attr_id, value)
|
||||
return tokens
|
||||
|
||||
@property
|
||||
|
|
|
@ -205,7 +205,7 @@ p
|
|||
+cell #[code arrow_spacing]
|
||||
+cell int
|
||||
+cell Spacing between arrows in px to avoid overlaps.
|
||||
+cell #[code 20]
|
||||
+cell #[code 20] / #[code 12] (compact)
|
||||
|
||||
+row
|
||||
+cell #[code word_spacing]
|
||||
|
|
|
@ -64,7 +64,7 @@ p
|
|||
doc = nlp(u'Give it back! He pleaded.')
|
||||
assert doc[0].text == 'Give'
|
||||
assert doc[-1].text == '.'
|
||||
span = doc[1:1]
|
||||
span = doc[1:3]
|
||||
assert span.text == 'it back'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
|
|
@ -141,7 +141,7 @@ p
|
|||
else:
|
||||
tokens.append(substring)
|
||||
substring = ''
|
||||
tokens.extend(suffixes)
|
||||
tokens.extend(reversed(suffixes))
|
||||
return tokens
|
||||
|
||||
p
|
||||
|
|
|
@ -59,9 +59,11 @@ p
|
|||
| to customise the layout, for example:
|
||||
|
||||
+aside("Important note")
|
||||
| There's currently a known issue with the #[code compact] mode for long
|
||||
| sentences with arrow spacing. If the spacing is larger than the arc
|
||||
| itself, it'll cause the arc and its label to flip.
|
||||
| There's currently a known issue with the #[code compact] mode for
|
||||
| sentences with short arrows and long dependency labels, that causes labels
|
||||
| longer than the arrow to wrap. So if you come across this problem,
|
||||
| especially when using custom labels, you'll have to increase the
|
||||
| #[code distance] setting in the #[code options] to allow longer arcs.
|
||||
|
||||
+table(["Name", "Type", "Description", "Default"])
|
||||
+row
|
||||
|
|
Loading…
Reference in New Issue
Block a user