Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2019-06-26 14:47:46 +02:00
commit 30d6c2ccc2
6 changed files with 50 additions and 14 deletions

View File

@ -1,6 +1,6 @@
@ARTICLE{spacy2,
AUTHOR = {Honnibal, Matthew AND Montani, Ines},
TITLE = {spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing},
@unpublished{spacy2,
AUTHOR = {Honnibal, Matthew and Montani, Ines},
TITLE = {{spaCy 2}: Natural language understanding with {B}loom embeddings, convolutional neural networks and incremental parsing},
YEAR = {2017},
JOURNAL = {To appear}
Note = {To appear}
}

View File

@ -23,19 +23,39 @@ from .train import _load_pretrained_tok2vec
@plac.annotations(
texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
"key 'tokens'", "positional", None, str),
texts_loc=(
"Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
"key 'tokens'",
"positional",
None,
str,
),
vectors_model=("Name or path to spaCy model with vectors to learn from"),
output_dir=("Directory to write models to on each epoch", "positional", None, str),
width=("Width of CNN layers", "option", "cw", int),
depth=("Depth of CNN layers", "option", "cd", int),
embed_rows=("Number of embedding rows", "option", "er", int),
loss_func=("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str),
loss_func=(
"Loss function to use for the objective. Either 'L2' or 'cosine'",
"option",
"L",
str,
),
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
dropout=("Dropout rate", "option", "d", float),
batch_size=("Number of words per training batch", "option", "bs", int),
max_length=("Max words per example. Longer examples are discarded", "option", "xw", int),
min_length=("Min words per example. Shorter examples are discarded", "option", "nw", int),
max_length=(
"Max words per example. Longer examples are discarded",
"option",
"xw",
int,
),
min_length=(
"Min words per example. Shorter examples are discarded",
"option",
"nw",
int,
),
seed=("Seed for random number generators", "option", "s", int),
n_iter=("Number of iterations to pretrain", "option", "i", int),
n_save_every=("Save model every X batches.", "option", "se", int),

View File

@ -116,7 +116,7 @@ def parse_deps(orig_doc, options={}):
doc (Doc): Document to parse.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
if not doc.is_parsed:
user_warning(Warnings.W005)
if options.get("collapse_phrases", False):

View File

@ -537,6 +537,7 @@ for orth in [
"Sen.",
"St.",
"vs.",
"v.s."
]:
_exc[orth] = [{ORTH: orth}]

View File

@ -0,0 +1,15 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.displacy import parse_deps
from spacy.tokens import Doc
def test_issue3882(en_vocab):
    """Regression test for issue #3882.

    displaCy copies the Doc before rendering; that copy must not try to
    serialize ``doc.user_data``. The test passes if ``parse_deps`` returns
    without raising.
    """
    words = ["Hello", "world"]
    parsed_doc = Doc(en_vocab, words=words)
    # Mark the doc as parsed up front, before handing it to parse_deps.
    parsed_doc.is_parsed = True
    # NOTE(review): a set is presumably not serializable by Doc.to_bytes, so
    # this entry would make the copy fail if user_data were included -- confirm.
    parsed_doc.user_data["test"] = set()
    parse_deps(parsed_doc)

View File

@ -305,11 +305,11 @@ match on the uppercase versions, in case someone has written it as "Google i/o".
```python
### {executable="true"}
import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span
nlp = spacy.load("en_core_web_sm")
nlp = English()
matcher = Matcher(nlp.vocab)
def add_event_ent(matcher, doc, i, matches):
@ -322,7 +322,7 @@ def add_event_ent(matcher, doc, i, matches):
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", add_event_ent, pattern)
doc = nlp(u"This is a text about Google I/O.")
doc = nlp(u"This is a text about Google I/O")
matches = matcher(doc)
```