Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2019-11-05 18:30:46 +01:00
commit e5c319a051
7 changed files with 84 additions and 28 deletions

View File

@ -104,6 +104,13 @@ For detailed installation instructions, see the
[pip]: https://pypi.org/project/spacy/
[conda]: https://anaconda.org/conda-forge/spacy
> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary
> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI
> providers and other tooling to support it. This means that in order to run
> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile
> the library and its Cython dependencies locally. If this is causing problems
> for you, the easiest solution is to **use Python 3.7** in the meantime.
### pip
Using pip, spaCy releases are available as source packages and binary wheels (as
@ -180,9 +187,6 @@ pointing pip to a path or URL.
# download best-matching version of specific model for your spaCy installation
python -m spacy download en_core_web_sm
# out-of-the-box: download best-matching default model
python -m spacy download en
# pip install .tar.gz archive from path or URL
pip install /Users/you/en_core_web_sm-2.2.0.tar.gz
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

View File

@ -7,7 +7,6 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
import re
import sys
import json
import spacy
@ -19,12 +18,9 @@ from spacy.util import compounding, minibatch, minibatch_by_words
from spacy.syntax.nonproj import projectivize
from spacy.matcher import Matcher
from spacy import displacy
from collections import defaultdict, Counter
from timeit import default_timer as timer
from collections import defaultdict
import itertools
import random
import numpy.random
from spacy import lang
from spacy.lang import zh
@ -323,10 +319,6 @@ def get_token_conllu(token, i):
return "\n".join(lines)
Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
Token.set_extension("begins_fused", default=False, force=True)
Token.set_extension("inside_fused", default=False, force=True)
##################
# Initialization #
@ -459,13 +451,13 @@ class TreebankPaths(object):
@plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
parses_dir=("Directory to write the development parses", "positional", None, Path),
corpus=(
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
"positional",
None,
str,
),
parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "option", "C", Path),
limit=("Size limit", "option", "n", int),
gpu_device=("Use GPU", "option", "g", int),
@ -490,6 +482,10 @@ def main(
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
spacy.util.fix_random_seed()
lang.zh.Chinese.Defaults.use_jieba = False
lang.ja.Japanese.Defaults.use_janome = False
@ -506,8 +502,8 @@ def main(
docs, golds = read_data(
nlp,
paths.train.conllu.open(),
paths.train.text.open(),
paths.train.conllu.open(encoding="utf8"),
paths.train.text.open(encoding="utf8"),
max_doc_length=config.max_doc_length,
limit=limit,
)

View File

@ -0,0 +1,45 @@
# coding: utf-8
"""
Example of loading previously parsed text using spaCy's DocBin class. The example
performs an entity count to show that the annotations are available.
For more details, see https://spacy.io/usage/saving-loading#docs
Installation:
python -m spacy download en_core_web_lg
Usage:
python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
"""
from __future__ import unicode_literals
import spacy
from spacy.tokens import DocBin
from timeit import default_timer as timer
from collections import Counter
EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"
def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
    """Load a serialized DocBin, rebuild the Docs with *model*'s vocab, and
    report throughput plus the 30 most frequent (label, text) entity pairs."""
    nlp = spacy.load(model)
    print("Reading data from {}".format(docbin_path))
    with open(docbin_path, "rb") as input_file:
        serialized = input_file.read()
    token_count = 0
    started = timer()
    entity_counts = Counter()
    # Deserializing restores the saved annotations (including ents) without
    # re-running the pipeline; only the vocab is needed to rebuild the Docs.
    doc_bin = DocBin().from_bytes(serialized)
    for parsed_doc in doc_bin.get_docs(nlp.vocab):
        token_count += len(parsed_doc)
        entity_counts.update((ent.label_, ent.text) for ent in parsed_doc.ents)
    finished = timer()
    elapsed = finished - started
    msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
    print(msg.format(nr_word=token_count, seconds=elapsed, wps=token_count / elapsed))
    print("Most common entities:")
    for (entity_label, entity_text), frequency in entity_counts.most_common(30):
        print(frequency, entity_text, entity_label)
if __name__ == "__main__":
    # CLI entry point: plac turns main()'s keyword arguments into
    # command-line options (model name, path to the .spacy file).
    import plac
    plac.call(main)

View File

@ -0,0 +1 @@
{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0}

View File

@ -13,8 +13,7 @@ import spacy.util
from spacy.tokens import Token, Doc
from spacy.gold import GoldParse
from spacy.syntax.nonproj import projectivize
from collections import defaultdict, Counter
from timeit import default_timer as timer
from collections import defaultdict
from spacy.matcher import Matcher
import itertools
@ -290,11 +289,6 @@ def get_token_conllu(token, i):
return "\n".join(lines)
Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
##################
# Initialization #
##################
@ -381,20 +375,24 @@ class TreebankPaths(object):
@plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "positional", None, Config.load),
corpus=(
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
"positional",
None,
str,
),
parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "positional", None, Config.load),
limit=("Size limit", "option", "n", int),
)
def main(ud_dir, parses_dir, config, corpus, limit=0):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
paths = TreebankPaths(ud_dir, corpus)
if not (parses_dir / corpus).exists():
(parses_dir / corpus).mkdir()
@ -403,8 +401,8 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
docs, golds = read_data(
nlp,
paths.train.conllu.open(),
paths.train.text.open(),
paths.train.conllu.open(encoding="utf8"),
paths.train.text.open(encoding="utf8"),
max_doc_length=config.max_doc_length,
limit=limit,
)

View File

@ -769,6 +769,7 @@ class Language(object):
texts,
batch_size=batch_size,
disable=disable,
n_process=n_process,
component_cfg=component_cfg,
)
for doc, context in izip(docs, contexts):

View File

@ -20,6 +20,17 @@ available over [pip](https://pypi.python.org/pypi/spacy) and
> possible, the new docs also include notes on features that have changed in
> v2.0, and features that were introduced in the new version.
<Infobox variant="warning" title="Important note for Python 3.8">
We can't yet ship pre-compiled binary wheels for spaCy that work on Python 3.8,
as we're still waiting for our CI providers and other tooling to support it.
This means that in order to run spaCy on Python 3.8, you'll need
[a compiler installed](#source) and compile the library and its Cython
dependencies locally. If this is causing problems for you, the easiest solution
is to **use Python 3.7** in the meantime.
</Infobox>
## Quickstart {hidden="true"}
import QuickstartInstall from 'widgets/quickstart-install.js'