Merge branch 'master' into pr/5060

Ines Montani 2020-02-26 12:51:29 +01:00
commit ed9358420e
14 changed files with 109 additions and 10 deletions

View File

@@ -1,3 +1,11 @@
 [build-system]
-requires = ["setuptools"]
+requires = [
+    "setuptools",
+    "wheel",
+    "cython>=0.25",
+    "cymem>=2.0.2,<2.1.0",
+    "preshed>=3.0.2,<3.1.0",
+    "murmurhash>=0.28.0,<1.1.0",
+    "thinc==7.4.0.dev0",
+]
 build-backend = "setuptools.build_meta"

View File

@@ -59,7 +59,7 @@ install_requires =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=0.0.5<0.2.0
+    spacy_lookups_data>=0.0.5,<0.2.0
 cuda =
     cupy>=5.0.0b4
 cuda80 =

View File

@@ -26,6 +26,7 @@ BLANK_MODEL_THRESHOLD = 2000
     lang=("model language", "positional", None, str),
     train_path=("location of JSON-formatted training data", "positional", None, Path),
     dev_path=("location of JSON-formatted development data", "positional", None, Path),
+    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
     base_model=("name of model to update (optional)", "option", "b", str),
     pipeline=(
         "Comma-separated names of pipeline components to train",
@@ -41,6 +42,7 @@ def debug_data(
     lang,
     train_path,
     dev_path,
+    tag_map_path=None,
     base_model=None,
     pipeline="tagger,parser,ner",
     ignore_warnings=False,
@@ -60,6 +62,10 @@ def debug_data(
     if not dev_path.exists():
         msg.fail("Development data not found", dev_path, exits=1)
 
+    tag_map = {}
+    if tag_map_path is not None:
+        tag_map = srsly.read_json(tag_map_path)
+
     # Initialize the model and pipeline
     pipeline = [p.strip() for p in pipeline.split(",")]
     if base_model:
@@ -67,6 +73,8 @@ def debug_data(
     else:
         lang_cls = get_lang_class(lang)
         nlp = lang_cls()
+    # Update tag map with provided mapping
+    nlp.vocab.morphology.tag_map.update(tag_map)
 
     msg.divider("Data format validation")
 
@@ -344,7 +352,7 @@ def debug_data(
     if "tagger" in pipeline:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
-        tag_map = nlp.Defaults.tag_map
+        tag_map = nlp.vocab.morphology.tag_map
         msg.info(
             "{} {} in data ({} {} in tag map)".format(
                 len(labels),
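For context, the tag map that srsly.read_json loads here is a plain JSON object keyed by fine-grained tag names, each mapping to a coarse POS plus optional morphological features. A minimal illustrative example (the file contents below are hypothetical):

{
    "NN": {"pos": "NOUN"},
    "VBZ": {"pos": "VERB", "VerbForm": "fin", "Tense": "pres"}
}

Both debug-data and train merge this mapping into nlp.vocab.morphology.tag_map, so the tags found in the training data are checked against the user-supplied map rather than only the language defaults.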

View File

@@ -57,6 +57,7 @@ from .. import about
     textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool),
     textcat_arch=("Textcat model architecture", "option", "ta", str),
     textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
+    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
     verbose=("Display more information for debug", "flag", "VV", bool),
     debug=("Run data diagnostics before training", "flag", "D", bool),
     # fmt: on
@@ -95,6 +96,7 @@ def train(
     textcat_multilabel=False,
     textcat_arch="bow",
     textcat_positive_label=None,
+    tag_map_path=None,
     verbose=False,
     debug=False,
 ):
@@ -132,6 +134,9 @@ def train(
         output_path.mkdir()
         msg.good("Created output directory: {}".format(output_path))
 
+    tag_map = {}
+    if tag_map_path is not None:
+        tag_map = srsly.read_json(tag_map_path)
     # Take dropout and batch size as generators of values -- dropout
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
@@ -238,6 +243,9 @@ def train(
             pipe_cfg = {}
             nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
 
+    # Update tag map with provided mapping
+    nlp.vocab.morphology.tag_map.update(tag_map)
+
     if vectors:
         msg.text("Loading vector from model '{}'".format(vectors))
         _load_vectors(nlp, vectors)
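A minimal sketch of what the new option does, written out programmatically rather than via the CLI (the file name is hypothetical):

import srsly
from spacy.lang.en import English

nlp = English()
tag_map = srsly.read_json("tag_map.json")      # e.g. {"NN": {"pos": "NOUN"}}
nlp.vocab.morphology.tag_map.update(tag_map)   # same update the train command now performs

On the command line, the option is declared with the short form "tm", so it should be reachable as -tm (and presumably --tag-map-path in its long form, following plac's usual underscore-to-dash conversion).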

View File

@@ -5,11 +5,13 @@ from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_
 
 ELISION = " ' ".strip().replace(" ", "")
 
+abbrev = ("d", "D")
+
 _infixes = (
     LIST_ELLIPSES
     + LIST_ICONS
     + [
-        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+        r"(?<=^[{ab}][{el}])(?=[{a}])".format(ab=abbrev, a=ALPHA, el=ELISION),
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),

View File

@@ -10,6 +10,8 @@ _exc = {}
 
 # translate / delete what is not necessary
 for exc_data in [
+    {ORTH: "t", LEMMA: "et", NORM: "et"},
+    {ORTH: "T", LEMMA: "et", NORM: "et"},
     {ORTH: "'t", LEMMA: "et", NORM: "et"},
     {ORTH: "'T", LEMMA: "et", NORM: "et"},
     {ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"},

View File

@@ -15,6 +15,7 @@ import multiprocessing as mp
 from itertools import chain, cycle
 
 from .tokenizer import Tokenizer
+from .tokens.underscore import Underscore
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
 from .lookups import Lookups
@@ -853,7 +854,10 @@ class Language(object):
             sender.send()
 
         procs = [
-            mp.Process(target=_apply_pipes, args=(self.make_doc, pipes, rch, sch))
+            mp.Process(
+                target=_apply_pipes,
+                args=(self.make_doc, pipes, rch, sch, Underscore.get_state()),
+            )
             for rch, sch in zip(texts_q, bytedocs_send_ch)
         ]
         for proc in procs:
@@ -1108,16 +1112,18 @@ def _pipe(docs, proc, kwargs):
         yield doc
 
 
-def _apply_pipes(make_doc, pipes, reciever, sender):
+def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
     """Worker for Language.pipe
 
     receiver (multiprocessing.Connection): Pipe to receive text. Usually
         created by `multiprocessing.Pipe()`
     sender (multiprocessing.Connection): Pipe to send doc. Usually created by
        `multiprocessing.Pipe()`
+    underscore_state (tuple): The data in the Underscore class of the parent
     """
+    Underscore.load_state(underscore_state)
     while True:
-        texts = reciever.get()
+        texts = receiver.get()
         docs = (make_doc(text) for text in texts)
         for pipe in pipes:
             docs = pipe(docs)
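The extra underscore_state argument exists because worker processes are started fresh (spawned on Windows and macOS), so custom extensions registered on the parent's Underscore class would otherwise be missing in the workers; the parent therefore snapshots its extension registries with Underscore.get_state() and each worker restores them via Underscore.load_state() before producing docs. A minimal sketch of the user-visible effect, assuming a spaCy build that includes this change (names are illustrative):

from spacy.lang.en import English
from spacy.tokens import Doc


def get_n_words(doc):
    return len(doc)


def main():
    # registered in the parent only; forwarded to the workers via Underscore.get_state()
    Doc.set_extension("n_words", getter=get_n_words, force=True)
    nlp = English()
    docs = list(nlp.pipe(["one two three", "four five"], n_process=2))
    assert [doc._.n_words for doc in docs] == [3, 2]


if __name__ == "__main__":
    main()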

View File

@@ -7,6 +7,15 @@ from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
 
+@pytest.fixture(scope="function", autouse=True)
+def clean_underscore():
+    # reset the Underscore object after the test, to avoid having state copied across tests
+    yield
+    Underscore.doc_extensions = {}
+    Underscore.span_extensions = {}
+    Underscore.token_extensions = {}
+
+
 def test_create_doc_underscore():
     doc = Mock()
     doc.doc = doc

View File

@@ -6,6 +6,7 @@ import re
 from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
 from spacy.tokens import Doc, Token
+from ..doc.test_underscore import clean_underscore
 
 
 @pytest.fixture
@@ -200,6 +201,7 @@ def test_matcher_any_token_operator(en_vocab):
     assert matches[2] == "test hello world"
 
 
+@pytest.mark.usefixtures("clean_underscore")
 def test_matcher_extension_attribute(en_vocab):
     matcher = Matcher(en_vocab)
     get_is_fruit = lambda token: token.text in ("apple", "banana")

View File

@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from spacy.lang.en import English
 from spacy.pipeline import EntityRuler
+from spacy.tokens.underscore import Underscore
 
 
 def test_issue4849():

View File

@@ -0,0 +1,45 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import spacy
+from spacy.lang.en import English
+from spacy.tokens import Span, Doc
+from spacy.tokens.underscore import Underscore
+
+
+class CustomPipe:
+    name = "my_pipe"
+
+    def __init__(self):
+        Span.set_extension("my_ext", getter=self._get_my_ext)
+        Doc.set_extension("my_ext", default=None)
+
+    def __call__(self, doc):
+        gathered_ext = []
+        for sent in doc.sents:
+            sent_ext = self._get_my_ext(sent)
+            sent._.set("my_ext", sent_ext)
+            gathered_ext.append(sent_ext)
+
+        doc._.set("my_ext", "\n".join(gathered_ext))
+
+        return doc
+
+    @staticmethod
+    def _get_my_ext(span):
+        return str(span.end)
+
+
+def test_issue4903():
+    # ensures that this runs correctly and doesn't hang or crash on Windows / macOS
+    nlp = English()
+    custom_component = CustomPipe()
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
+    nlp.add_pipe(custom_component, after="sentencizer")
+
+    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
+    docs = list(nlp.pipe(text, n_process=2))
+    assert docs[0].text == "I like bananas."
+    assert docs[1].text == "Do you like them?"
+    assert docs[2].text == "No, I prefer wasabi."

View File

@@ -11,6 +11,6 @@ def nlp():
     return spacy.blank("en")
 
 
-def test_evaluate(nlp):
+def test_issue4924(nlp):
     docs_golds = [("", {})]
     nlp.evaluate(docs_golds)

View File

@@ -79,6 +79,14 @@ class Underscore(object):
     def _get_key(self, name):
         return ("._.", name, self._start, self._end)
 
+    @classmethod
+    def get_state(cls):
+        return cls.token_extensions, cls.span_extensions, cls.doc_extensions
+
+    @classmethod
+    def load_state(cls, state):
+        cls.token_extensions, cls.span_extensions, cls.doc_extensions = state
+
 
 def get_ext_args(**kwargs):
     """Validate and convert arguments. Reused in Doc, Token and Span."""

View File

@@ -437,8 +437,8 @@ The L2 norm of the token's vector representation.
 | `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
 | `lower` | int | Lowercase form of the token. |
 | `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. |
-| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
-| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
+| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
+| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
 | `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. |
 | `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. |
 | `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. |