diff --git a/CITATION b/CITATION
index 301224955..e820c197d 100644
--- a/CITATION
+++ b/CITATION
@@ -1,6 +1,6 @@
-@ARTICLE{spacy2,
-  AUTHOR = {Honnibal, Matthew AND Montani, Ines},
-  TITLE = {spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing},
-  YEAR = {2017},
-  JOURNAL = {To appear}
+@unpublished{spacy2,
+  AUTHOR = {Honnibal, Matthew and Montani, Ines},
+  TITLE = {{spaCy 2}: Natural language understanding with {B}loom embeddings, convolutional neural networks and incremental parsing},
+  YEAR = {2017},
+  Note = {To appear}
 }
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 7afd10520..2fe5b247a 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -23,19 +23,39 @@ from .train import _load_pretrained_tok2vec
 
 
 @plac.annotations(
-    texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
-               "key 'tokens'", "positional", None, str),
+    texts_loc=(
+        "Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
+        "key 'tokens'",
+        "positional",
+        None,
+        str,
+    ),
     vectors_model=("Name or path to spaCy model with vectors to learn from"),
     output_dir=("Directory to write models to on each epoch", "positional", None, str),
     width=("Width of CNN layers", "option", "cw", int),
     depth=("Depth of CNN layers", "option", "cd", int),
     embed_rows=("Number of embedding rows", "option", "er", int),
-    loss_func=("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str),
+    loss_func=(
+        "Loss function to use for the objective. Either 'L2' or 'cosine'",
+        "option",
+        "L",
+        str,
+    ),
     use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
     dropout=("Dropout rate", "option", "d", float),
     batch_size=("Number of words per training batch", "option", "bs", int),
-    max_length=("Max words per example. Longer examples are discarded", "option", "xw", int),
-    min_length=("Min words per example. Shorter examples are discarded", "option", "nw", int),
+    max_length=(
+        "Max words per example. Longer examples are discarded",
+        "option",
+        "xw",
+        int,
+    ),
+    min_length=(
+        "Min words per example. Shorter examples are discarded",
+        "option",
+        "nw",
+        int,
+    ),
     seed=("Seed for random number generators", "option", "s", int),
     n_iter=("Number of iterations to pretrain", "option", "i", int),
     n_save_every=("Save model every X batches.", "option", "se", int),
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index b651c0996..d2ef21dbd 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -116,7 +116,7 @@ def parse_deps(orig_doc, options={}):
     doc (Doc): Document do parse.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
-    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
+    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
     if not doc.is_parsed:
         user_warning(Warnings.W005)
     if options.get("collapse_phrases", False):
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index 5063319a6..9731dc752 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -537,6 +537,7 @@ for orth in [
     "Sen.",
     "St.",
     "vs.",
+    "v.s.",
 ]:
     _exc[orth] = [{ORTH: orth}]
 
diff --git a/spacy/tests/regression/test_issue3882.py b/spacy/tests/regression/test_issue3882.py
new file mode 100644
index 000000000..1b2dcea25
--- /dev/null
+++ b/spacy/tests/regression/test_issue3882.py
@@ -0,0 +1,15 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.displacy import parse_deps
+from spacy.tokens import Doc
+
+
+def test_issue3882(en_vocab):
+    """Test that displaCy doesn't serialize the doc.user_data when making a
+    copy of the Doc.
+    """
+    doc = Doc(en_vocab, words=["Hello", "world"])
+    doc.is_parsed = True
+    doc.user_data["test"] = set()
+    parse_deps(doc)
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index ee901e3fd..2354092f0 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -305,11 +305,11 @@ match on the uppercase versions, in case someone has written it as "Google i/o".
 
 ```python
 ### {executable="true"}
-import spacy
+from spacy.lang.en import English
 from spacy.matcher import Matcher
 from spacy.tokens import Span
 
-nlp = spacy.load("en_core_web_sm")
+nlp = English()
 matcher = Matcher(nlp.vocab)
 
 def add_event_ent(matcher, doc, i, matches):
@@ -322,7 +322,7 @@ def add_event_ent(matcher, doc, i, matches):
 pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
 matcher.add("GoogleIO", add_event_ent, pattern)
 
-doc = nlp(u"This is a text about Google I/O.")
+doc = nlp(u"This is a text about Google I/O")
 matches = matcher(doc)
 ```
 