Mirror of https://github.com/explosion/spaCy.git

Commit 30d6c2ccc2: Merge branch 'master' into spacy.io
CITATION (10 changes)

The BibTeX entry is corrected: @unpublished with a Note field is the appropriate entry type for a paper that has not appeared in a journal, and the added braces in the TITLE protect "spaCy 2" and the "B" in "Bloom" from BibTeX's automatic lowercasing.

@@ -1,6 +1,6 @@
-@ARTICLE{spacy2,
-  AUTHOR = {Honnibal, Matthew AND Montani, Ines},
-  TITLE = {spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing},
-  YEAR = {2017},
-  JOURNAL = {To appear}
+@unpublished{spacy2,
+  AUTHOR = {Honnibal, Matthew and Montani, Ines},
+  TITLE = {{spaCy 2}: Natural language understanding with {B}loom embeddings, convolutional neural networks and incremental parsing},
+  YEAR = {2017},
+  Note = {To appear}
 }

@@ -23,19 +23,39 @@ from .train import _load_pretrained_tok2vec


 @plac.annotations(
-    texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
-               "key 'tokens'", "positional", None, str),
+    texts_loc=(
+        "Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
+        "key 'tokens'",
+        "positional",
+        None,
+        str,
+    ),
     vectors_model=("Name or path to spaCy model with vectors to learn from"),
     output_dir=("Directory to write models to on each epoch", "positional", None, str),
     width=("Width of CNN layers", "option", "cw", int),
     depth=("Depth of CNN layers", "option", "cd", int),
     embed_rows=("Number of embedding rows", "option", "er", int),
-    loss_func=("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str),
+    loss_func=(
+        "Loss function to use for the objective. Either 'L2' or 'cosine'",
+        "option",
+        "L",
+        str,
+    ),
     use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
     dropout=("Dropout rate", "option", "d", float),
     batch_size=("Number of words per training batch", "option", "bs", int),
-    max_length=("Max words per example. Longer examples are discarded", "option", "xw", int),
-    min_length=("Min words per example. Shorter examples are discarded", "option", "nw", int),
+    max_length=(
+        "Max words per example. Longer examples are discarded",
+        "option",
+        "xw",
+        int,
+    ),
+    min_length=(
+        "Min words per example. Shorter examples are discarded",
+        "option",
+        "nw",
+        int,
+    ),
     seed=("Seed for random number generators", "option", "s", int),
     n_iter=("Number of iterations to pretrain", "option", "i", int),
     n_save_every=("Save model every X batches.", "option", "se", int),
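This hunk is purely a formatting pass (black-style line wrapping) over the plac annotations; the CLI itself does not change. As a refresher on what those `(help, kind, abbreviation, type)` tuples mean, here is a minimal, self-contained sketch (a hypothetical script, not part of the diff) of how plac maps annotations onto command-line arguments:

```python
# Minimal plac sketch: "positional" annotations become required arguments,
# "option" annotations become flags taking a value (with the abbreviation
# as the short name, e.g. -cw), and "flag" becomes a boolean switch.
import plac


@plac.annotations(
    output_dir=("Directory to write models to on each epoch", "positional", None, str),
    width=("Width of CNN layers", "option", "cw", int),
    use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
)
def pretrain(output_dir, width=96, use_vectors=False):
    print(output_dir, width, use_vectors)


if __name__ == "__main__":
    # e.g.: python pretrain_sketch.py ./models -cw 128 -uv
    plac.call(pretrain)
```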
@@ -116,7 +116,7 @@ def parse_deps(orig_doc, options={}):
     doc (Doc): Document to parse.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
-    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
+    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
     if not doc.is_parsed:
         user_warning(Warnings.W005)
     if options.get("collapse_phrases", False):
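The change makes parse_deps copy the Doc without its user_data, which displaCy never reads and which may hold values that to_bytes() cannot serialize. A minimal sketch of the failure mode this avoids (assuming a user_data value such as a set, which the msgpack-based serialization rejects):

```python
# Sketch: user_data can hold arbitrary Python objects, so serializing it is
# not safe in general. Excluding it makes the copy robust.
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
doc = nlp("Hello world")
doc.user_data["seen"] = set()  # fine on the Doc, but not serializable

safe_bytes = doc.to_bytes(exclude=["user_data"])  # what parse_deps now does
copy = Doc(doc.vocab).from_bytes(safe_bytes)
print([t.text for t in copy])
```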
@@ -537,6 +537,7 @@ for orth in [
     "Sen.",
     "St.",
     "vs.",
+    "v.s."
 ]:
     _exc[orth] = [{ORTH: orth}]

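Each string in this list becomes a tokenizer exception mapping the text to itself as a single token, so "v.s." is no longer split at its periods. A quick illustration (a sketch; the tokenization of the surrounding words is incidental):

```python
# With the new exception, "v.s." survives as one token instead of being
# broken up around the periods.
from spacy.lang.en import English

nlp = English()
doc = nlp("It was Apple v.s. Microsoft")
print([t.text for t in doc])  # expected to include "v.s." as a single token
```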
spacy/tests/regression/test_issue3882.py (new file, 15 additions)

@@ -0,0 +1,15 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.displacy import parse_deps
+from spacy.tokens import Doc
+
+
+def test_issue3882(en_vocab):
+    """Test that displaCy doesn't serialize the doc.user_data when making a
+    copy of the Doc.
+    """
+    doc = Doc(en_vocab, words=["Hello", "world"])
+    doc.is_parsed = True
+    doc.user_data["test"] = set()
+    parse_deps(doc)
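The en_vocab argument is a pytest fixture supplied by spaCy's shared test conftest. A rough stand-in, if you wanted to run this test outside the spaCy source tree, might look like the following (a hypothetical sketch, not the real fixture):

```python
import pytest
from spacy.lang.en import English


@pytest.fixture
def en_vocab():
    # Rough approximation of spaCy's shared test fixture: an English Vocab.
    return English().vocab
```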
@@ -305,11 +305,11 @@ match on the uppercase versions, in case someone has written it as "Google i/o".

 ```python
 ### {executable="true"}
-import spacy
+from spacy.lang.en import English
 from spacy.matcher import Matcher
 from spacy.tokens import Span

-nlp = spacy.load("en_core_web_sm")
+nlp = English()
 matcher = Matcher(nlp.vocab)

 def add_event_ent(matcher, doc, i, matches):
@@ -322,7 +322,7 @@ def add_event_ent(matcher, doc, i, matches):

 pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
 matcher.add("GoogleIO", add_event_ent, pattern)
-doc = nlp(u"This is a text about Google I/O.")
+doc = nlp(u"This is a text about Google I/O")
 matches = matcher(doc)
 ```

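The two docs hunks swap the pretrained en_core_web_sm pipeline for a blank English() (the Matcher only needs the vocab, not a statistical model) and drop the trailing period from the example sentence, presumably because the tokenizer can attach it to "O", which would break the ORTH-based pattern. The diff elides the body of add_event_ent between the hunks; a runnable approximation of the full updated example, with the callback reconstructed to label each match as an EVENT entity as the surrounding guide describes, is:

```python
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = English()
matcher = Matcher(nlp.vocab)


def add_event_ent(matcher, doc, i, matches):
    # On each match, label the matched span as an EVENT entity and append
    # it to doc.ents without overwriting the existing entities.
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EVENT")
    doc.ents += (entity,)
    print(entity.text)


pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", add_event_ent, pattern)
doc = nlp(u"This is a text about Google I/O")
matches = matcher(doc)
```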