Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-03 22:06:37 +03:00

commit e5c319a051
Merge branch 'master' into spacy.io

 README.md | 10 +++++++---

@@ -104,6 +104,13 @@ For detailed installation instructions, see the
 [pip]: https://pypi.org/project/spacy/
 [conda]: https://anaconda.org/conda-forge/spacy
 
+> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary
+> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI
+> providers and other tooling to support it. This means that in order to run
+> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile
+> the library and its Cython dependencies locally. If this is causing problems
+> for you, the easiest solution is to **use Python 3.7** in the meantime.
+
 ### pip
 
 Using pip, spaCy releases are available as source packages and binary wheels (as
@@ -180,9 +187,6 @@ pointing pip to a path or URL.
 # download best-matching version of specific model for your spaCy installation
 python -m spacy download en_core_web_sm
 
-# out-of-the-box: download best-matching default model
-python -m spacy download en
-
 # pip install .tar.gz archive from path or URL
 pip install /Users/you/en_core_web_sm-2.2.0.tar.gz
 pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
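Once a model package is installed via pip as in the hunk above, it is loaded through the standard spacy.load entry point. A minimal sketch — the sample sentence is illustrative:

```python
import spacy

# spacy.load resolves the installed package "en_core_web_sm" and returns
# a Language object with the model's pipeline and weights.
nlp = spacy.load("en_core_web_sm")

doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")
print([(ent.text, ent.label_) for ent in doc.ents])
```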
@@ -7,7 +7,6 @@ from __future__ import unicode_literals
 import plac
 from pathlib import Path
 import re
-import sys
 import json
 
 import spacy
@@ -19,12 +18,9 @@ from spacy.util import compounding, minibatch, minibatch_by_words
 from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher
 from spacy import displacy
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
 
-import itertools
 import random
-import numpy.random
 
 from spacy import lang
 from spacy.lang import zh
@@ -323,10 +319,6 @@ def get_token_conllu(token, i):
     return "\n".join(lines)
 
 
-Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
-Token.set_extension("begins_fused", default=False, force=True)
-Token.set_extension("inside_fused", default=False, force=True)
-
 
 ##################
 # Initialization #
@@ -459,13 +451,13 @@ class TreebankPaths(object):
 
 @plac.annotations(
     ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
+    parses_dir=("Directory to write the development parses", "positional", None, Path),
     corpus=(
-        "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+        "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
         "positional",
         None,
         str,
     ),
-    parses_dir=("Directory to write the development parses", "positional", None, Path),
     config=("Path to json formatted config file", "option", "C", Path),
     limit=("Size limit", "option", "n", int),
     gpu_device=("Use GPU", "option", "g", int),
@@ -490,6 +482,10 @@ def main(
     # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
     import tqdm
 
+    Token.set_extension("get_conllu_lines", method=get_token_conllu)
+    Token.set_extension("begins_fused", default=False)
+    Token.set_extension("inside_fused", default=False)
+
    spacy.util.fix_random_seed()
     lang.zh.Chinese.Defaults.use_jieba = False
     lang.ja.Japanese.Defaults.use_janome = False
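The hunk above moves the Token.set_extension calls off module level and into main(), which is why the earlier module-level registrations (with their force=True flags) disappear. For reference, a minimal sketch of how these extension registrations behave; the names match the script, the blank pipeline and sample text are illustrative:

```python
import spacy
from spacy.tokens import Token

# force=True overwrites an existing registration; without it, registering
# the same extension name a second time raises a ValueError.
Token.set_extension("begins_fused", default=False, force=True)
Token.set_extension("inside_fused", default=False, force=True)

nlp = spacy.blank("en")
doc = nlp("a fused token example")
# Custom attributes are exposed under the underscore namespace.
print(doc[0]._.begins_fused)  # -> False until the UD reader sets it
```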
@@ -506,8 +502,8 @@ def main(
 
     docs, golds = read_data(
         nlp,
-        paths.train.conllu.open(),
-        paths.train.text.open(),
+        paths.train.conllu.open(encoding="utf8"),
+        paths.train.text.open(encoding="utf8"),
         max_doc_length=config.max_doc_length,
         limit=limit,
     )
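The encoding="utf8" arguments matter because pathlib.Path.open falls back to the platform's preferred locale encoding, which is not UTF-8 on many Windows setups, so UTF-8 CoNLL-U files can fail to read. A self-contained illustration using only the standard library; the file name is hypothetical:

```python
import locale
from pathlib import Path

# The fallback used when no encoding is passed -- e.g. cp1252 on Windows.
print(locale.getpreferredencoding(False))

path = Path("demo.conllu")  # hypothetical scratch file
path.write_text("# text = Über Äpfel\n", encoding="utf8")

# An explicit encoding makes the read portable across platforms.
with path.open(encoding="utf8") as f:
    print(f.read())
```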
examples/load_from_docbin.py | 45 (new file)

@@ -0,0 +1,45 @@
+# coding: utf-8
+"""
+Example of loading previously parsed text using spaCy's DocBin class. The example
+performs an entity count to show that the annotations are available.
+For more details, see https://spacy.io/usage/saving-loading#docs
+Installation:
+python -m spacy download en_core_web_lg
+Usage:
+python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
+"""
+from __future__ import unicode_literals
+
+import spacy
+from spacy.tokens import DocBin
+from timeit import default_timer as timer
+from collections import Counter
+
+EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"
+
+
+def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
+    nlp = spacy.load(model)
+    print("Reading data from {}".format(docbin_path))
+    with open(docbin_path, "rb") as file_:
+        bytes_data = file_.read()
+    nr_word = 0
+    start_time = timer()
+    entities = Counter()
+    docbin = DocBin().from_bytes(bytes_data)
+    for doc in docbin.get_docs(nlp.vocab):
+        nr_word += len(doc)
+        entities.update((e.label_, e.text) for e in doc.ents)
+    end_time = timer()
+    msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
+    wps = nr_word / (end_time - start_time)
+    print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps))
+    print("Most common entities:")
+    for (label, entity), freq in entities.most_common(30):
+        print(freq, entity, label)
+
+
+if __name__ == "__main__":
+    import plac
+
+    plac.call(main)
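The new example consumes a pre-built .spacy file. As a counterpart, a minimal sketch of producing such a file with the same DocBin API; the attribute list, sample text, and output path are assumptions:

```python
# coding: utf-8
import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_lg")

# Store only the attributes the loader needs; smaller files, faster I/O.
doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
texts = ["Apple is looking at buying U.K. startup for $1 billion."]
for doc in nlp.pipe(texts):
    doc_bin.add(doc)

with open("parses.spacy", "wb") as file_:  # hypothetical output path
    file_.write(doc_bin.to_bytes())
```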
examples/training/conllu-config.json | 1 (new file)

@@ -0,0 +1 @@
+{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0}
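The training-script changes below pass this file through a Config.load helper in the plac annotations. A minimal sketch of what a JSON-backed loader of that shape could look like; the class body is an assumption, only the key names come from the file above:

```python
import json


class Config(object):
    """Hypothetical stand-in for the training script's Config helper."""

    def __init__(self, **kwargs):
        # Expose JSON keys as attributes: config.nr_epoch, config.dropout, ...
        self.__dict__.update(kwargs)

    @classmethod
    def load(cls, path):
        with open(path, encoding="utf8") as f:
            return cls(**json.load(f))


config = Config.load("examples/training/conllu-config.json")
print(config.nr_epoch, config.batch_size, config.dropout)
```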
@@ -13,8 +13,7 @@ import spacy.util
 from spacy.tokens import Token, Doc
 from spacy.gold import GoldParse
 from spacy.syntax.nonproj import projectivize
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
 from spacy.matcher import Matcher
 
 import itertools
@@ -290,11 +289,6 @@ def get_token_conllu(token, i):
     return "\n".join(lines)
 
 
-Token.set_extension("get_conllu_lines", method=get_token_conllu)
-Token.set_extension("begins_fused", default=False)
-Token.set_extension("inside_fused", default=False)
-
-
 ##################
 # Initialization #
 ##################
@@ -381,20 +375,24 @@ class TreebankPaths(object):
 
 @plac.annotations(
     ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
+    parses_dir=("Directory to write the development parses", "positional", None, Path),
+    config=("Path to json formatted config file", "positional", None, Config.load),
     corpus=(
-        "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+        "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
         "positional",
         None,
         str,
     ),
-    parses_dir=("Directory to write the development parses", "positional", None, Path),
-    config=("Path to json formatted config file", "positional", None, Config.load),
     limit=("Size limit", "option", "n", int),
 )
 def main(ud_dir, parses_dir, config, corpus, limit=0):
     # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
     import tqdm
 
+    Token.set_extension("get_conllu_lines", method=get_token_conllu)
+    Token.set_extension("begins_fused", default=False)
+    Token.set_extension("inside_fused", default=False)
+
     paths = TreebankPaths(ud_dir, corpus)
     if not (parses_dir / corpus).exists():
         (parses_dir / corpus).mkdir()
@@ -403,8 +401,8 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
 
     docs, golds = read_data(
         nlp,
-        paths.train.conllu.open(),
-        paths.train.text.open(),
+        paths.train.conllu.open(encoding="utf8"),
+        paths.train.text.open(encoding="utf8"),
         max_doc_length=config.max_doc_length,
         limit=limit,
     )
@@ -769,6 +769,7 @@ class Language(object):
                 texts,
                 batch_size=batch_size,
                 disable=disable,
+                n_process=n_process,
                 component_cfg=component_cfg,
             )
             for doc, context in izip(docs, contexts):
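This hunk forwards the new n_process argument through the as_tuples branch of Language.pipe, so multiprocessing also applies when each text carries a context object. A minimal usage sketch, assuming a spaCy build that includes this change; the texts and contexts are illustrative:

```python
import spacy

nlp = spacy.load("en_core_web_sm")

data = [
    ("A sentence about Berlin.", {"id": 1}),
    ("Another one about London.", {"id": 2}),
]

# as_tuples=True yields (doc, context) pairs; n_process fans the pipeline
# out over multiple OS processes (the default is 1).
for doc, context in nlp.pipe(data, as_tuples=True, n_process=2):
    print(context["id"], [ent.text for ent in doc.ents])
```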
@@ -20,6 +20,17 @@ available over [pip](https://pypi.python.org/pypi/spacy) and
 > possible, the new docs also include notes on features that have changed in
 > v2.0, and features that were introduced in the new version.
 
+<Infobox variant="warning" title="Important note for Python 3.8">
+
+We can't yet ship pre-compiled binary wheels for spaCy that work on Python 3.8,
+as we're still waiting for our CI providers and other tooling to support it.
+This means that in order to run spaCy on Python 3.8, you'll need
+[a compiler installed](#source) and compile the library and its Cython
+dependencies locally. If this is causing problems for you, the easiest solution
+is to **use Python 3.7** in the meantime.
+
+</Infobox>
+
 ## Quickstart {hidden="true"}
 
 import QuickstartInstall from 'widgets/quickstart-install.js'