From 4e43c0ba93969a7629eaad428d93dba54d830bca Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Nov 2019 20:29:03 +0100 Subject: [PATCH 1/5] Fix multiprocessing for as_tuples=True (#4582) --- spacy/language.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/language.py b/spacy/language.py index d53710f58..97d6515c5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -769,6 +769,7 @@ class Language(object): texts, batch_size=batch_size, disable=disable, + n_process=n_process, component_cfg=component_cfg, ) for doc, context in izip(docs, contexts): From 4ec76232880066e36ad9b613f934dd7dc66404ea Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 4 Nov 2019 20:31:26 +0100 Subject: [PATCH 2/5] Fix conllu script (#4579) * force extensions to avoid clash between example scripts * fix arg order and default file encoding * add example config for conllu script * newline * move extension definitions to main function * few more encodings fixes --- bin/ud/ud_train.py | 22 +++++++++------------- examples/training/conllu-config.json | 1 + examples/training/conllu.py | 22 ++++++++++------------ 3 files changed, 20 insertions(+), 25 deletions(-) create mode 100644 examples/training/conllu-config.json diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index 945bf57eb..2784d7c3c 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -7,7 +7,6 @@ from __future__ import unicode_literals import plac from pathlib import Path import re -import sys import json import spacy @@ -19,12 +18,9 @@ from spacy.util import compounding, minibatch, minibatch_by_words from spacy.syntax.nonproj import projectivize from spacy.matcher import Matcher from spacy import displacy -from collections import defaultdict, Counter -from timeit import default_timer as timer +from collections import defaultdict -import itertools import random -import numpy.random from spacy import lang from spacy.lang import zh @@ -323,10 +319,6 @@ def get_token_conllu(token, i): return 
"\n".join(lines) -Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True) -Token.set_extension("begins_fused", default=False, force=True) -Token.set_extension("inside_fused", default=False, force=True) - ################## # Initialization # @@ -459,13 +451,13 @@ class TreebankPaths(object): @plac.annotations( ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), + parses_dir=("Directory to write the development parses", "positional", None, Path), corpus=( - "UD corpus to train and evaluate on, e.g. en, es_ancora, etc", + "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora", "positional", None, str, ), - parses_dir=("Directory to write the development parses", "positional", None, Path), config=("Path to json formatted config file", "option", "C", Path), limit=("Size limit", "option", "n", int), gpu_device=("Use GPU", "option", "g", int), @@ -490,6 +482,10 @@ def main( # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 import tqdm + Token.set_extension("get_conllu_lines", method=get_token_conllu) + Token.set_extension("begins_fused", default=False) + Token.set_extension("inside_fused", default=False) + spacy.util.fix_random_seed() lang.zh.Chinese.Defaults.use_jieba = False lang.ja.Japanese.Defaults.use_janome = False @@ -506,8 +502,8 @@ def main( docs, golds = read_data( nlp, - paths.train.conllu.open(), - paths.train.text.open(), + paths.train.conllu.open(encoding="utf8"), + paths.train.text.open(encoding="utf8"), max_doc_length=config.max_doc_length, limit=limit, ) diff --git a/examples/training/conllu-config.json b/examples/training/conllu-config.json new file mode 100644 index 000000000..9a11dd96b --- /dev/null +++ b/examples/training/conllu-config.json @@ -0,0 +1 @@ +{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0} diff --git a/examples/training/conllu.py b/examples/training/conllu.py index dfc790456..d9ee721ec 100644 --- 
a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -13,8 +13,7 @@ import spacy.util from spacy.tokens import Token, Doc from spacy.gold import GoldParse from spacy.syntax.nonproj import projectivize -from collections import defaultdict, Counter -from timeit import default_timer as timer +from collections import defaultdict from spacy.matcher import Matcher import itertools @@ -290,11 +289,6 @@ def get_token_conllu(token, i): return "\n".join(lines) -Token.set_extension("get_conllu_lines", method=get_token_conllu) -Token.set_extension("begins_fused", default=False) -Token.set_extension("inside_fused", default=False) - - ################## # Initialization # ################## @@ -381,20 +375,24 @@ class TreebankPaths(object): @plac.annotations( ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), + parses_dir=("Directory to write the development parses", "positional", None, Path), + config=("Path to json formatted config file", "positional", None, Config.load), corpus=( - "UD corpus to train and evaluate on, e.g. en, es_ancora, etc", + "UD corpus to train and evaluate on, e.g. 
UD_Spanish-AnCora", "positional", None, str, ), - parses_dir=("Directory to write the development parses", "positional", None, Path), - config=("Path to json formatted config file", "positional", None, Config.load), limit=("Size limit", "option", "n", int), ) def main(ud_dir, parses_dir, config, corpus, limit=0): # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 import tqdm + Token.set_extension("get_conllu_lines", method=get_token_conllu) + Token.set_extension("begins_fused", default=False) + Token.set_extension("inside_fused", default=False) + paths = TreebankPaths(ud_dir, corpus) if not (parses_dir / corpus).exists(): (parses_dir / corpus).mkdir() @@ -403,8 +401,8 @@ def main(ud_dir, parses_dir, config, corpus, limit=0): docs, golds = read_data( nlp, - paths.train.conllu.open(), - paths.train.text.open(), + paths.train.conllu.open(encoding="utf8"), + paths.train.text.open(encoding="utf8"), max_doc_length=config.max_doc_length, limit=limit, ) From 83381018d3b165008cf9678117a77ef40c66ce18 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 5 Nov 2019 11:52:43 +0100 Subject: [PATCH 3/5] Add load_from_docbin example [ci skip] TODO: upload the file somewhere --- examples/load_from_docbin.py | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 examples/load_from_docbin.py diff --git a/examples/load_from_docbin.py b/examples/load_from_docbin.py new file mode 100644 index 000000000..f26e7fc49 --- /dev/null +++ b/examples/load_from_docbin.py @@ -0,0 +1,45 @@ +# coding: utf-8 +""" +Example of loading previously parsed text using spaCy's DocBin class. The example +performs an entity count to show that the annotations are available. 
+For more details, see https://spacy.io/usage/saving-loading#docs +Installation: +python -m spacy download en_core_web_lg +Usage: +python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy +""" +from __future__ import unicode_literals + +import spacy +from spacy.tokens import DocBin +from timeit import default_timer as timer +from collections import Counter + +EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy" + + +def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH): + nlp = spacy.load(model) + print("Reading data from {}".format(docbin_path)) + with open(docbin_path, "rb") as file_: + bytes_data = file_.read() + nr_word = 0 + start_time = timer() + entities = Counter() + docbin = DocBin().from_bytes(bytes_data) + for doc in docbin.get_docs(nlp.vocab): + nr_word += len(doc) + entities.update((e.label_, e.text) for e in doc.ents) + end_time = timer() + msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)" + wps = nr_word / (end_time - start_time) + print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps)) + print("Most common entities:") + for (label, entity), freq in entities.most_common(30): + print(freq, entity, label) + + +if __name__ == "__main__": + import plac + + plac.call(main) From fed53b1552b935c5beb8500d89a4d411b210d5bc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 5 Nov 2019 18:26:47 +0100 Subject: [PATCH 4/5] Update README.md --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 99d66bb31..529fa419a 100644 --- a/README.md +++ b/README.md @@ -180,9 +180,6 @@ pointing pip to a path or URL. 
# download best-matching version of specific model for your spaCy installation python -m spacy download en_core_web_sm -# out-of-the-box: download best-matching default model -python -m spacy download en - # pip install .tar.gz archive from path or URL pip install /Users/you/en_core_web_sm-2.2.0.tar.gz pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz From 828ef27a3206a969a56a6dbe47fa38d6e9a1a621 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 5 Nov 2019 18:30:11 +0100 Subject: [PATCH 5/5] Add warnings about 3.8 (resolves #4593) [ci skip] --- README.md | 7 +++++++ website/docs/usage/index.md | 11 +++++++++++ 2 files changed, 18 insertions(+) diff --git a/README.md b/README.md index 529fa419a..980fc5b0b 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,13 @@ For detailed installation instructions, see the [pip]: https://pypi.org/project/spacy/ [conda]: https://anaconda.org/conda-forge/spacy +> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary +> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI +> providers and other tooling to support it. This means that in order to run +> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile +> the library and its Cython dependencies locally. If this is causing problems +> for you, the easiest solution is to **use Python 3.7** in the meantime. + ### pip Using pip, spaCy releases are available as source packages and binary wheels (as diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 441297813..2b0045bc3 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -20,6 +20,17 @@ available over [pip](https://pypi.python.org/pypi/spacy) and > possible, the new docs also include notes on features that have changed in > v2.0, and features that were introduced in the new version. 
+<Infobox title="Important note for Python 3.8" variant="warning"> + +We can't yet ship pre-compiled binary wheels for spaCy that work on Python 3.8, +as we're still waiting for our CI providers and other tooling to support it. +This means that in order to run spaCy on Python 3.8, you'll need +[a compiler installed](#source) and compile the library and its Cython +dependencies locally. If this is causing problems for you, the easiest solution +is to **use Python 3.7** in the meantime. + +</Infobox> + ## Quickstart {hidden="true"} import QuickstartInstall from 'widgets/quickstart-install.js'