Merge branch 'master' into spacy.io

Commit 30d6c2ccc2
Author: Ines Montani
Date: 2019-06-26 14:47:46 +02:00
6 changed files with 50 additions and 14 deletions

View File

@@ -1,6 +1,6 @@
-@ARTICLE{spacy2,
-  AUTHOR = {Honnibal, Matthew AND Montani, Ines},
-  TITLE = {spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing},
+@unpublished{spacy2,
+  AUTHOR = {Honnibal, Matthew and Montani, Ines},
+  TITLE = {{spaCy 2}: Natural language understanding with {B}loom embeddings, convolutional neural networks and incremental parsing},
   YEAR = {2017},
-  JOURNAL = {To appear}
+  Note = {To appear}
 }

View File

@@ -23,19 +23,39 @@ from .train import _load_pretrained_tok2vec
 @plac.annotations(
-    texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
-               "key 'tokens'", "positional", None, str),
+    texts_loc=(
+        "Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
+        "key 'tokens'",
+        "positional",
+        None,
+        str,
+    ),
     vectors_model=("Name or path to spaCy model with vectors to learn from"),
     output_dir=("Directory to write models to on each epoch", "positional", None, str),
     width=("Width of CNN layers", "option", "cw", int),
     depth=("Depth of CNN layers", "option", "cd", int),
     embed_rows=("Number of embedding rows", "option", "er", int),
-    loss_func=("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str),
+    loss_func=(
+        "Loss function to use for the objective. Either 'L2' or 'cosine'",
+        "option",
+        "L",
+        str,
+    ),
     use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
     dropout=("Dropout rate", "option", "d", float),
     batch_size=("Number of words per training batch", "option", "bs", int),
-    max_length=("Max words per example. Longer examples are discarded", "option", "xw", int),
-    min_length=("Min words per example. Shorter examples are discarded", "option", "nw", int),
+    max_length=(
+        "Max words per example. Longer examples are discarded",
+        "option",
+        "xw",
+        int,
+    ),
+    min_length=(
+        "Min words per example. Shorter examples are discarded",
+        "option",
+        "nw",
+        int,
+    ),
     seed=("Seed for random number generators", "option", "s", int),
     n_iter=("Number of iterations to pretrain", "option", "i", int),
     n_save_every=("Save model every X batches.", "option", "se", int),
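The hunk above is a pure Black-style reformatting: each keyword passed to `@plac.annotations` is still the same `(help text, kind, abbreviation, type)` tuple, just wrapped across lines. For context, a minimal sketch of how plac turns such tuples into a command line (illustrative script and defaults, not the actual `pretrain` CLI):

```python
# Minimal plac sketch: each annotation tuple is (help, kind, abbrev, type).
# "option" takes a value, "flag" is a boolean switch, "positional" is a plain
# argument. The function name and defaults below are illustrative only.
import plac


@plac.annotations(
    width=("Width of CNN layers", "option", "cw", int),
    use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
)
def demo(width=96, use_vectors=False):
    print("width:", width, "use_vectors:", use_vectors)


if __name__ == "__main__":
    plac.call(demo)  # plac builds the argument parser from the tuples above
```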

View File

@@ -116,7 +116,7 @@ def parse_deps(orig_doc, options={}):
     doc (Doc): Document do parse.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
-    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
+    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
     if not doc.is_parsed:
         user_warning(Warnings.W005)
     if options.get("collapse_phrases", False):
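This one-line change is the fix exercised by the new regression test further down (issue 3882): `parse_deps` copies the Doc before optionally collapsing phrases, and that copy now skips `user_data`, which may hold arbitrary Python objects that don't survive `to_bytes()`. A rough sketch of the resulting behaviour, using a blank pipeline for illustration:

```python
# Sketch: round-trip a Doc through to_bytes()/from_bytes() while excluding
# user_data, as parse_deps now does. The original keeps its user_data; the
# copy simply doesn't carry it, so e.g. a set() stored there can no longer
# break serialization of the copy.
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
doc = nlp("Hello world")
doc.user_data["test"] = set()  # arbitrary, non-serializable payload

copy = Doc(doc.vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
assert doc.user_data and not copy.user_data
```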

View File

@@ -537,6 +537,7 @@ for orth in [
     "Sen.",
     "St.",
     "vs.",
+    "v.s."
 ]:
     _exc[orth] = [{ORTH: orth}]
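A quick way to check the new exception (a sketch; the example sentence is made up): once "v.s." is registered, the English tokenizer should keep it as a single token instead of splitting around the periods.

```python
# Sketch: with "v.s." in the English tokenizer exceptions, it should come out
# as one token rather than being split on the trailing period.
from spacy.lang.en import English

nlp = English()
doc = nlp("Apple v.s. Microsoft")
print([t.text for t in doc])  # expect "v.s." to be a single token
```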

View File

@@ -0,0 +1,15 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.displacy import parse_deps
+from spacy.tokens import Doc
+
+
+def test_issue3882(en_vocab):
+    """Test that displaCy doesn't serialize the doc.user_data when making a
+    copy of the Doc.
+    """
+    doc = Doc(en_vocab, words=["Hello", "world"])
+    doc.is_parsed = True
+    doc.user_data["test"] = set()
+    parse_deps(doc)

View File

@@ -305,11 +305,11 @@ match on the uppercase versions, in case someone has written it as "Google i/o".
 ```python
 ### {executable="true"}
-import spacy
+from spacy.lang.en import English
 from spacy.matcher import Matcher
 from spacy.tokens import Span

-nlp = spacy.load("en_core_web_sm")
+nlp = English()
 matcher = Matcher(nlp.vocab)

 def add_event_ent(matcher, doc, i, matches):
@@ -322,7 +322,7 @@ def add_event_ent(matcher, doc, i, matches):
 pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
 matcher.add("GoogleIO", add_event_ent, pattern)

-doc = nlp(u"This is a text about Google I/O.")
+doc = nlp(u"This is a text about Google I/O")
 matches = matcher(doc)
 ```
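The docs snippet now runs on a blank `English()` pipeline, since the `Matcher` only needs the shared vocab rather than a trained model. As a small follow-up (not part of the original page), the returned matches can be inspected like this:

```python
# Each match is a (match_id, start, end) triple; the match_id resolves to the
# string name passed to matcher.add() via the vocab's StringStore.
for match_id, start, end in matches:
    print(nlp.vocab.strings[match_id], doc[start:end].text)
```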