Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 18:26:30 +03:00)

Commit ed9358420e: Merge branch 'master' into pr/5060
@@ -1,3 +1,11 @@
 [build-system]
-requires = ["setuptools"]
+requires = [
+    "setuptools",
+    "wheel",
+    "cython>=0.25",
+    "cymem>=2.0.2,<2.1.0",
+    "preshed>=3.0.2,<3.1.0",
+    "murmurhash>=0.28.0,<1.1.0",
+    "thinc==7.4.0.dev0",
+]
 build-backend = "setuptools.build_meta"
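These pinned build requirements live in the [build-system] table (PEP 518), presumably in pyproject.toml. A minimal sketch of reading them back out, not part of the diff, assuming the third-party toml package is installed:

    import toml  # assumption: the "toml" package is available

    cfg = toml.load("pyproject.toml")
    for req in cfg["build-system"]["requires"]:
        print(req)  # e.g. "cymem>=2.0.2,<2.1.0"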
@@ -59,7 +59,7 @@ install_requires =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=0.0.5<0.2.0
+    spacy_lookups_data>=0.0.5,<0.2.0
 cuda =
     cupy>=5.0.0b4
 cuda80 =
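The only change here is the comma that was missing from the version range. A quick check of why it matters, using the packaging library (an assumption for illustration; the diff itself does not touch it):

    from packaging.specifiers import SpecifierSet

    spec = SpecifierSet(">=0.0.5,<0.2.0")  # the corrected, valid PEP 440 range
    print("0.1.0" in spec)  # True
    print("0.2.0" in spec)  # False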
@@ -26,6 +26,7 @@ BLANK_MODEL_THRESHOLD = 2000
     lang=("model language", "positional", None, str),
     train_path=("location of JSON-formatted training data", "positional", None, Path),
     dev_path=("location of JSON-formatted development data", "positional", None, Path),
+    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
     base_model=("name of model to update (optional)", "option", "b", str),
     pipeline=(
         "Comma-separated names of pipeline components to train",
@@ -41,6 +42,7 @@ def debug_data(
     lang,
     train_path,
     dev_path,
+    tag_map_path=None,
     base_model=None,
     pipeline="tagger,parser,ner",
     ignore_warnings=False,
@@ -60,6 +62,10 @@ def debug_data(
     if not dev_path.exists():
         msg.fail("Development data not found", dev_path, exits=1)
 
+    tag_map = {}
+    if tag_map_path is not None:
+        tag_map = srsly.read_json(tag_map_path)
+
     # Initialize the model and pipeline
     pipeline = [p.strip() for p in pipeline.split(",")]
     if base_model:
@@ -67,6 +73,8 @@ def debug_data(
     else:
         lang_cls = get_lang_class(lang)
         nlp = lang_cls()
+    # Update tag map with provided mapping
+    nlp.vocab.morphology.tag_map.update(tag_map)
 
     msg.divider("Data format validation")
 
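A small sketch of what this tag-map override does at runtime (not part of the diff; the custom tag name is hypothetical):

    from spacy.lang.en import English

    nlp = English()
    tag_map = {"NNML": {"pos": "NOUN"}}  # hypothetical fine-grained tag
    nlp.vocab.morphology.tag_map.update(tag_map)
    assert "NNML" in nlp.vocab.morphology.tag_map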
@@ -344,7 +352,7 @@ def debug_data(
     if "tagger" in pipeline:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
-        tag_map = nlp.Defaults.tag_map
+        tag_map = nlp.vocab.morphology.tag_map
         msg.info(
             "{} {} in data ({} {} in tag map)".format(
                 len(labels),
@@ -57,6 +57,7 @@ from .. import about
     textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool),
     textcat_arch=("Textcat model architecture", "option", "ta", str),
     textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
+    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
     verbose=("Display more information for debug", "flag", "VV", bool),
     debug=("Run data diagnostics before training", "flag", "D", bool),
     # fmt: on
@@ -95,6 +96,7 @@ def train(
     textcat_multilabel=False,
     textcat_arch="bow",
     textcat_positive_label=None,
+    tag_map_path=None,
     verbose=False,
     debug=False,
 ):
@@ -132,6 +134,9 @@ def train(
         output_path.mkdir()
         msg.good("Created output directory: {}".format(output_path))
 
+    tag_map = {}
+    if tag_map_path is not None:
+        tag_map = srsly.read_json(tag_map_path)
     # Take dropout and batch size as generators of values -- dropout
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
@@ -238,6 +243,9 @@ def train(
             pipe_cfg = {}
         nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
 
+    # Update tag map with provided mapping
+    nlp.vocab.morphology.tag_map.update(tag_map)
+
     if vectors:
         msg.text("Loading vector from model '{}'".format(vectors))
         _load_vectors(nlp, vectors)
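As in debug-data above, train now reads an optional JSON tag map and applies it before training. A sketch of producing such a file with srsly; the file name, tag name, and exact CLI spelling are assumptions derived from the "tm" option declared above, not something the diff spells out:

    import srsly

    srsly.write_json("tag_map.json", {"NNML": {"pos": "NOUN"}})  # hypothetical tag
    # Assumed invocation:
    #   python -m spacy train en ./output train.json dev.json --tag-map-path tag_map.json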
@@ -5,11 +5,13 @@ from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_
 
 ELISION = " ' ’ ".strip().replace(" ", "")
 
+abbrev = ("d", "D")
+
 _infixes = (
     LIST_ELLIPSES
     + LIST_ICONS
     + [
-        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+        r"(?<=^[{ab}][{el}])(?=[{a}])".format(ab=abbrev, a=ALPHA, el=ELISION),
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
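The narrowed rule only splits after a string-initial elided article (d' / D') rather than after any letter followed by an elision character. A quick illustration for Luxembourgish, not part of the diff; the exact token sequence depends on the language's other rules:

    from spacy.lang.lb import Luxembourgish

    nlp = Luxembourgish()
    print([t.text for t in nlp("D'Kanner spillen.")])
    # expected to begin with the split article, e.g. ["D'", "Kanner", ...]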
@@ -10,6 +10,8 @@ _exc = {}
 
 # translate / delete what is not necessary
 for exc_data in [
+    {ORTH: "’t", LEMMA: "et", NORM: "et"},
+    {ORTH: "’T", LEMMA: "et", NORM: "et"},
    {ORTH: "'t", LEMMA: "et", NORM: "et"},
    {ORTH: "'T", LEMMA: "et", NORM: "et"},
    {ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"},
@@ -15,6 +15,7 @@ import multiprocessing as mp
 from itertools import chain, cycle
 
 from .tokenizer import Tokenizer
+from .tokens.underscore import Underscore
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
 from .lookups import Lookups
@@ -853,7 +854,10 @@ class Language(object):
         sender.send()
 
         procs = [
-            mp.Process(target=_apply_pipes, args=(self.make_doc, pipes, rch, sch))
+            mp.Process(
+                target=_apply_pipes,
+                args=(self.make_doc, pipes, rch, sch, Underscore.get_state()),
+            )
             for rch, sch in zip(texts_q, bytedocs_send_ch)
         ]
         for proc in procs:
@@ -1108,16 +1112,18 @@ def _pipe(docs, proc, kwargs):
         yield doc
 
 
-def _apply_pipes(make_doc, pipes, reciever, sender):
+def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
     """Worker for Language.pipe
 
     receiver (multiprocessing.Connection): Pipe to receive text. Usually
         created by `multiprocessing.Pipe()`
     sender (multiprocessing.Connection): Pipe to send doc. Usually created by
         `multiprocessing.Pipe()`
+    underscore_state (tuple): The data in the Underscore class of the parent
     """
+    Underscore.load_state(underscore_state)
     while True:
-        texts = reciever.get()
+        texts = receiver.get()
         docs = (make_doc(text) for text in texts)
         for pipe in pipes:
             docs = pipe(docs)
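The pattern introduced here: snapshot the class-level extension registries in the parent process and restore them in each worker, so custom extensions registered before calling Language.pipe with n_process > 1 are also visible in the subprocesses. A simplified standalone sketch of that pattern, not spaCy's actual worker loop, assuming the get_state/load_state classmethods added to Underscore later in this diff:

    import multiprocessing as mp

    from spacy.tokens import Doc
    from spacy.tokens.underscore import Underscore

    def worker(underscore_state):
        Underscore.load_state(underscore_state)  # restore the parent's extensions
        print("doc extensions in worker:", list(Underscore.doc_extensions))

    if __name__ == "__main__":
        Doc.set_extension("my_ext", default=None)
        proc = mp.Process(target=worker, args=(Underscore.get_state(),))
        proc.start()
        proc.join()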
@@ -7,6 +7,15 @@ from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
 
+@pytest.fixture(scope="function", autouse=True)
+def clean_underscore():
+    # reset the Underscore object after the test, to avoid having state copied across tests
+    yield
+    Underscore.doc_extensions = {}
+    Underscore.span_extensions = {}
+    Underscore.token_extensions = {}
+
+
 def test_create_doc_underscore():
     doc = Mock()
     doc.doc = doc
@@ -6,6 +6,7 @@ import re
 from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
 from spacy.tokens import Doc, Token
+from ..doc.test_underscore import clean_underscore
 
 
 @pytest.fixture
@@ -200,6 +201,7 @@ def test_matcher_any_token_operator(en_vocab):
     assert matches[2] == "test hello world"
 
 
+@pytest.mark.usefixtures("clean_underscore")
 def test_matcher_extension_attribute(en_vocab):
     matcher = Matcher(en_vocab)
     get_is_fruit = lambda token: token.text in ("apple", "banana")
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from spacy.lang.en import English
 from spacy.pipeline import EntityRuler
+from spacy.tokens.underscore import Underscore
 
 
 def test_issue4849():
spacy/tests/regression/test_issue4903.py (new file, 45 lines)
@@ -0,0 +1,45 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import spacy
+from spacy.lang.en import English
+from spacy.tokens import Span, Doc
+from spacy.tokens.underscore import Underscore
+
+
+class CustomPipe:
+    name = "my_pipe"
+
+    def __init__(self):
+        Span.set_extension("my_ext", getter=self._get_my_ext)
+        Doc.set_extension("my_ext", default=None)
+
+    def __call__(self, doc):
+        gathered_ext = []
+        for sent in doc.sents:
+            sent_ext = self._get_my_ext(sent)
+            sent._.set("my_ext", sent_ext)
+            gathered_ext.append(sent_ext)
+
+        doc._.set("my_ext", "\n".join(gathered_ext))
+
+        return doc
+
+    @staticmethod
+    def _get_my_ext(span):
+        return str(span.end)
+
+
+def test_issue4903():
+    # ensures that this runs correctly and doesn't hang or crash on Windows / macOS
+
+    nlp = English()
+    custom_component = CustomPipe()
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
+    nlp.add_pipe(custom_component, after="sentencizer")
+
+    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
+    docs = list(nlp.pipe(text, n_process=2))
+    assert docs[0].text == "I like bananas."
+    assert docs[1].text == "Do you like them?"
+    assert docs[2].text == "No, I prefer wasabi."
@@ -11,6 +11,6 @@ def nlp():
     return spacy.blank("en")
 
 
-def test_evaluate(nlp):
+def test_issue4924(nlp):
     docs_golds = [("", {})]
     nlp.evaluate(docs_golds)
@@ -79,6 +79,14 @@ class Underscore(object):
     def _get_key(self, name):
         return ("._.", name, self._start, self._end)
 
+    @classmethod
+    def get_state(cls):
+        return cls.token_extensions, cls.span_extensions, cls.doc_extensions
+
+    @classmethod
+    def load_state(cls, state):
+        cls.token_extensions, cls.span_extensions, cls.doc_extensions = state
+
 
 def get_ext_args(**kwargs):
     """Validate and convert arguments. Reused in Doc, Token and Span."""
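A tiny round-trip of the two new classmethods (not part of the diff): the state is simply the three class-level extension dicts, which is what the multiprocessing change above passes to each worker.

    from spacy.tokens import Token
    from spacy.tokens.underscore import Underscore

    Token.set_extension("is_fruit", default=False)
    state = Underscore.get_state()   # (token, span, doc) extension dicts
    Underscore.load_state(state)     # restores them on the class
    assert "is_fruit" in Underscore.token_extensions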
@@ -437,8 +437,8 @@ The L2 norm of the token's vector representation.
 | `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
 | `lower` | int | Lowercase form of the token. |
 | `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. |
-| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
-| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
+| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
+| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
 | `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. |
 | `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. |
 | `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. |
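For reference, a quick illustration of the shape feature the corrected rows describe (not part of the diff):

    from spacy.lang.en import English

    nlp = English()
    doc = nlp("Apple 1234")
    print([t.shape_ for t in doc])  # e.g. ['Xxxxx', 'dddd']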