From 53a3b967ac704ff0a67a7102ede6d916e2a4545a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 10 Mar 2021 11:10:53 +0100 Subject: [PATCH 01/18] Update thinc pin and set version to v3.0.5 (#7389) --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- spacy/about.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3113cf6c5..f00fdc9f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0,<8.1.0", + "thinc>=8.0.2,<8.1.0", "blis>=0.4.0,<0.8.0", "pathy", "numpy>=1.15.0", diff --git a/requirements.txt b/requirements.txt index 01a3be120..e09a5b221 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ spacy-legacy>=3.0.0,<3.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0,<8.1.0 +thinc>=8.0.2,<8.1.0 blis>=0.4.0,<0.8.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 482c1fbdd..09f989c54 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,14 +34,14 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0,<8.1.0 + thinc>=8.0.2,<8.1.0 install_requires = # Our libraries spacy-legacy>=3.0.0,<3.1.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0,<8.1.0 + thinc>=8.0.2,<8.1.0 blis>=0.4.0,<0.8.0 wasabi>=0.8.1,<1.1.0 srsly>=2.4.0,<3.0.0 diff --git a/spacy/about.py b/spacy/about.py index 4cbfdbad3..2987f3c53 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.4" +__version__ = "3.0.5" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 28726c25a19248b06b59c5ca759410b84b70668c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 10 Mar 2021 11:42:02 +0100 Subject: [PATCH 02/18] Update docs for convert CLI and NER examples --- extra/example_data/ner_example_data/README.md | 20 ++++++++++++++++- website/docs/api/cli.md | 22 +++++++++---------- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/extra/example_data/ner_example_data/README.md b/extra/example_data/ner_example_data/README.md index af70694f5..3c6a4a86b 100644 --- a/extra/example_data/ner_example_data/README.md +++ b/extra/example_data/ner_example_data/README.md @@ -1,7 +1,25 @@ ## Examples of NER/IOB data that can be converted with `spacy convert` -spacy JSON training files were generated with: +To convert an IOB file to `.spacy` ([`DocBin`](https://spacy.io/api/docbin)) +for spaCy v3: +```bash +python -m spacy convert -c iob -s -n 10 -b en_core_web_sm file.iob . ``` + +See all the `spacy convert` options: https://spacy.io/api/cli#convert + +--- + +The spaCy v2 JSON training files were generated using **spaCy v2** with: + +```bash python -m spacy convert -c iob -s -n 10 -b en file.iob ``` + +To convert an existing JSON training file to `.spacy` for spaCy v3, convert +with **spaCy v3**: + +```bash +python -m spacy convert file.json . +``` diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index e8be0f79c..fd149b285 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -261,24 +261,24 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] | `output_dir` | Output directory for converted file. 
Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(positional)~~ | | `--converter`, `-c` 2 | Name of converter to use (see below). ~~str (option)~~ | | `--file-type`, `-t` 2.1 | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ | -| `--n-sents`, `-n` | Number of sentences per document. ~~int (option)~~ | -| `--seg-sents`, `-s` 2.2 | Segment sentences (for `--converter ner`). ~~bool (flag)~~ | +| `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ | +| `--seg-sents`, `-s` 2.2 | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~ | | `--base`, `-b` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ | -| `--morphology`, `-m` | Enable appending morphology to tags. ~~bool (flag)~~ | -| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). ~~Optional[Path](option)~~ | +| `--morphology`, `-m` | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~ | +| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~ | | `--lang`, `-l` 2.1 | Language code (if tokenizer required). ~~Optional[str] \(option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). | ### Converters {#converters} -| ID | Description | -| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `auto` | Automatically pick converter based on file extension and file content (default). | -| `json` | JSON-formatted training data used in spaCy v2.x. | -| `conll` | Universal Dependencies `.conllu` or `.conll` format. | -| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | -| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `\|`, either `word\|B-ENT`or`word\|POS\|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | +| ID | Description | +| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `auto` | Automatically pick converter based on file extension and file content (default). | +| `json` | JSON-formatted training data used in spaCy v2.x. | +| `conllu` | Universal Dependencies `.conllu` format. | +| `ner` / `conll` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. 
The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | +| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `\|`, either `word\|B-ENT`or`word\|POS\|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | ## debug {#debug new="3"} From fbf3a755d7af0afc32fb7f7d83d4b9933ed724e4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 11 Mar 2021 09:36:58 +0100 Subject: [PATCH 03/18] Make spacy.load kwargs keyword-only --- spacy/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/__init__.py b/spacy/__init__.py index 36074c440..cd5a40406 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -28,6 +28,7 @@ if sys.maxunicode == 65535: def load( name: Union[str, Path], + *, disable: Iterable[str] = util.SimpleFrozenList(), exclude: Iterable[str] = util.SimpleFrozenList(), config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), From 4294bcf4ab6ab1a45ff05adf05ca369fa02bdc81 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 11 Mar 2021 09:52:40 +0100 Subject: [PATCH 04/18] Align keyword-only in docs for init/util --- website/docs/api/top-level.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index e1d81a5b5..cf9a58941 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -140,9 +140,9 @@ pipelines. -In a Jupyter notebook, run `prefer_gpu()` in the same cell as `spacy.load()` -to ensure that the model is loaded on the correct device. See [more -details](/usage/v3#jupyter-notebook-gpu). +In a Jupyter notebook, run `prefer_gpu()` in the same cell as `spacy.load()` to +ensure that the model is loaded on the correct device. See +[more details](/usage/v3#jupyter-notebook-gpu). @@ -168,9 +168,9 @@ and _before_ loading any pipelines. -In a Jupyter notebook, run `require_gpu()` in the same cell as `spacy.load()` -to ensure that the model is loaded on the correct device. See [more -details](/usage/v3#jupyter-notebook-gpu). +In a Jupyter notebook, run `require_gpu()` in the same cell as `spacy.load()` to +ensure that the model is loaded on the correct device. See +[more details](/usage/v3#jupyter-notebook-gpu). @@ -195,9 +195,9 @@ after importing spaCy and _before_ loading any pipelines. -In a Jupyter notebook, run `require_cpu()` in the same cell as `spacy.load()` -to ensure that the model is loaded on the correct device. See [more -details](/usage/v3#jupyter-notebook-gpu). +In a Jupyter notebook, run `require_cpu()` in the same cell as `spacy.load()` to +ensure that the model is loaded on the correct device. See +[more details](/usage/v3#jupyter-notebook-gpu). @@ -945,7 +945,8 @@ and create a `Language` object. The model data will then be loaded in via | Name | Description | | ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `name` | Package name or path. ~~str~~ | -| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. 
~~Union[Vocab, bool]~~. | +| _keyword-only_ | | +| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ | | `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | | `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | @@ -968,6 +969,7 @@ A helper function to use in the `load()` method of a pipeline package's | Name | Description | | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `init_file` | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ | +| _keyword-only_ | | | `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ | | `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | From 84470d9b9e65bd1843dd250e5d94bb44fd87469e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 11 Mar 2021 10:10:58 +0100 Subject: [PATCH 05/18] Incorporate BILUO note from #7407 --- website/docs/api/cli.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index fd149b285..44a8e2fc2 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -272,13 +272,13 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] ### Converters {#converters} -| ID | Description | -| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `auto` | Automatically pick converter based on file extension and file content (default). | -| `json` | JSON-formatted training data used in spaCy v2.x. | -| `conllu` | Universal Dependencies `.conllu` format. | -| `ner` / `conll` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). 
| -| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `\|`, either `word\|B-ENT`or`word\|POS\|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | +| ID | Description | +| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `auto` | Automatically pick converter based on file extension and file content (default). | +| `json` | JSON-formatted training data used in spaCy v2.x. | +| `conllu` | Universal Dependencies `.conllu` format. | +| `ner` / `conll` | NER with IOB/IOB2/BILUO tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the NER tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | +| `iob` | NER with IOB/IOB2/BILUO tags, one sentence per line with tokens separated by whitespace and annotation separated by `\|`, either `word\|B-ENT`or`word\|POS\|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | ## debug {#debug new="3"} From 124304b14672cb3d82c495b0fd45f60ecca90ea8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 11 Mar 2021 10:58:59 +0100 Subject: [PATCH 06/18] Add vocab kwarg back to spacy.load * Additional minor formatting and docs cleanup --- spacy/__init__.py | 8 ++++++-- website/docs/api/top-level.md | 19 ++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index cd5a40406..1eef7e621 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -29,6 +29,7 @@ if sys.maxunicode == 65535: def load( name: Union[str, Path], *, + vocab: Union[Vocab, bool] = True, disable: Iterable[str] = util.SimpleFrozenList(), exclude: Iterable[str] = util.SimpleFrozenList(), config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), @@ -36,6 +37,7 @@ def load( """Load a spaCy model from an installed package or a local path. name (str): Package name or model path. + vocab (Vocab): A Vocab object. If True, a vocab is created. disable (Iterable[str]): Names of pipeline components to disable. Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling nlp.enable_pipe. @@ -45,7 +47,9 @@ def load( keyed by section values in dot notation. RETURNS (Language): The loaded nlp object. """ - return util.load_model(name, disable=disable, exclude=exclude, config=config) + return util.load_model( + name, vocab=vocab, disable=disable, exclude=exclude, config=config + ) def blank( @@ -53,7 +57,7 @@ def blank( *, vocab: Union[Vocab, bool] = True, config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), - meta: Dict[str, Any] = util.SimpleFrozenDict() + meta: Dict[str, Any] = util.SimpleFrozenDict(), ) -> Language: """Create a blank nlp object for a given language code. diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index cf9a58941..eef8958cf 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -48,6 +48,7 @@ specified separately using the new `exclude` keyword argument. 
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ | | _keyword-only_ | | +| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ | | `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | | `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | @@ -83,9 +84,9 @@ Create a blank pipeline of a given language class. This function is the twin of | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ | | _keyword-only_ | | -| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | +| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | -| `meta` 3 | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ | +| `meta` | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ | | **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ | ### spacy.info {#spacy.info tag="function"} @@ -946,7 +947,7 @@ and create a `Language` object. The model data will then be loaded in via | ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `name` | Package name or path. ~~str~~ | | _keyword-only_ | | -| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | +| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ | | `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. 
~~List[str]~~ | | `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | @@ -970,7 +971,7 @@ A helper function to use in the `load()` method of a pipeline package's | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `init_file` | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ | | _keyword-only_ | | -| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | +| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ | | `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | | `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | @@ -1149,11 +1150,11 @@ vary on each step. > nlp.update(batch) > ``` -| Name | Description | -| ---------- | ---------------------------------------- | -| `items` | The items to batch up. ~~Iterable[Any]~~ | -| `size` | int / iterable | The batch size(s). ~~Union[int, Sequence[int]]~~ | -| **YIELDS** | The batches. | +| Name | Description | +| ---------- | ------------------------------------------------ | +| `items` | The items to batch up. ~~Iterable[Any]~~ | +| `size` | The batch size(s). ~~Union[int, Sequence[int]]~~ | +| **YIELDS** | The batches. | ### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"} From deffc3a5321fcb21f3ff4b0bed23deea81f81f12 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 11 Mar 2021 16:24:31 +0100 Subject: [PATCH 07/18] Update package requirements tests (#7409) * Add hypothesis to packages skipped in version check * Add numpy back to tests following 2df1ab8a --- spacy/tests/package/test_requirements.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index a0e43ccfa..82c39b72c 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -6,15 +6,14 @@ def test_build_dependencies(): # Check that library requirements are pinned exactly the same across different setup files. 
# TODO: correct checks for numpy rather than ignoring libs_ignore_requirements = [ - "numpy", "pytest", "pytest-timeout", "mock", "flake8", + "hypothesis", ] # ignore language-specific packages that shouldn't be installed by all libs_ignore_setup = [ - "numpy", "fugashi", "natto-py", "pythainlp", From 81efde0ce401f3004e23fc1ed794d445b8cafa51 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 11 Mar 2021 19:49:46 +0100 Subject: [PATCH 08/18] Add examples README --- examples/README.md | 130 ++++++++++++++++++++++++++++++++++++ examples/training/README.md | 5 ++ 2 files changed, 135 insertions(+) create mode 100644 examples/README.md create mode 100644 examples/training/README.md diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 000000000..23ff59acd --- /dev/null +++ b/examples/README.md @@ -0,0 +1,130 @@ + + +# spaCy examples + +For spaCy v3 we've converted many of the [v2 example +scripts](https://github.com/explosion/spaCy/tree/v2.3.x/examples/) into +end-to-end [spacy projects](https://spacy.io/usage/projects) workflows. The +workflows include all the steps to go from data to packaged spaCy models. + +## 🪐 Pipeline component demos + +The simplest demos for training a single pipeline component are in the +[`pipelines`](https://github.com/explosion/projects/blob/v3/pipelines) category +including: + +- [`pipelines/ner_demo`](https://github.com/explosion/projects/blob/v3/pipelines/ner_demo): + Train a named entity recognizer +- [`pipelines/textcat_demo`](https://github.com/explosion/projects/blob/v3/pipelines/textcat_demo): + Train a text classifier +- [`pipelines/parser_intent_demo`](https://github.com/explosion/projects/blob/v3/pipelines/parser_intent_demo): + Train a dependency parser for custom semantics + +## 🪐 Tutorials + +The [`tutorials`](https://github.com/explosion/projects/blob/v3/tutorials) +category includes examples that work through specific NLP use cases end-to-end: + +- [`tutorials/textcat_goemotions`](https://github.com/explosion/projects/blob/v3/tutorials/textcat_goemotions): + Train a text classifier to categorize emotions in Reddit posts +- [`tutorials/nel_emerson`](https://github.com/explosion/projects/blob/v3/tutorials/nel_emerson): + Use an entity linker to disambiguate mentions of the same name + +Check out the [projects documentation](https://spacy.io/usage/projects) and +browse through the [available +projects](https://github.com/explosion/projects/)! + +## 🚀 Get started with a demo project + +The +[`pipelines/ner_demo`](https://github.com/explosion/projects/blob/v3/pipelines/ner_demo) +project converts the spaCy v2 +[`train_ner.py`](https://github.com/explosion/spaCy/blob/v2.3.x/examples/training/train_ner.py) +demo script into a spaCy v3 project. + +1. Clone the project: + + ```bash + python -m spacy project clone pipelines/ner_demo + ``` + +2. Install requirements and download any data assets: + + ```bash + cd ner_demo + python -m pip install -r requirements.txt + python -m spacy project assets + ``` + +3. 
Run the default workflow to convert, train and evaluate: + + ```bash + python -m spacy project run all + ``` + + Sample output: + + ```none + ℹ Running workflow 'all' + + ================================== convert ================================== + Running command: /home/user/venv/bin/python scripts/convert.py en assets/train.json corpus/train.spacy + Running command: /home/user/venv/bin/python scripts/convert.py en assets/dev.json corpus/dev.spacy + + =============================== create-config =============================== + Running command: /home/user/venv/bin/python -m spacy init config --lang en --pipeline ner configs/config.cfg --force + ℹ Generated config template specific for your use case + - Language: en + - Pipeline: ner + - Optimize for: efficiency + - Hardware: CPU + - Transformer: None + ✔ Auto-filled config with all values + ✔ Saved config + configs/config.cfg + You can now add your data and train your pipeline: + python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy + + =================================== train =================================== + Running command: /home/user/venv/bin/python -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy --training.eval_frequency 10 --training.max_steps 100 --gpu-id -1 + ℹ Using CPU + + =========================== Initializing pipeline =========================== + [2021-03-11 19:34:59,101] [INFO] Set up nlp object from config + [2021-03-11 19:34:59,109] [INFO] Pipeline: ['tok2vec', 'ner'] + [2021-03-11 19:34:59,113] [INFO] Created vocabulary + [2021-03-11 19:34:59,113] [INFO] Finished initializing nlp object + [2021-03-11 19:34:59,265] [INFO] Initialized pipeline components: ['tok2vec', 'ner'] + ✔ Initialized pipeline + + ============================= Training pipeline ============================= + ℹ Pipeline: ['tok2vec', 'ner'] + ℹ Initial learn rate: 0.001 + E # LOSS TOK2VEC LOSS NER ENTS_F ENTS_P ENTS_R SCORE + --- ------ ------------ -------- ------ ------ ------ ------ + 0 0 0.00 7.90 0.00 0.00 0.00 0.00 + 10 10 0.11 71.07 0.00 0.00 0.00 0.00 + 20 20 0.65 22.44 50.00 50.00 50.00 0.50 + 30 30 0.22 6.38 80.00 66.67 100.00 0.80 + 40 40 0.00 0.00 80.00 66.67 100.00 0.80 + 50 50 0.00 0.00 80.00 66.67 100.00 0.80 + 60 60 0.00 0.00 100.00 100.00 100.00 1.00 + 70 70 0.00 0.00 100.00 100.00 100.00 1.00 + 80 80 0.00 0.00 100.00 100.00 100.00 1.00 + 90 90 0.00 0.00 100.00 100.00 100.00 1.00 + 100 100 0.00 0.00 100.00 100.00 100.00 1.00 + ✔ Saved pipeline to output directory + training/model-last + ``` + +4. Package the model: + + ```bash + python -m spacy project run package + ``` + +5. 
Visualize the model's output with [Streamlit](https://streamlit.io): + + ```bash + python -m spacy project run visualize-model + ``` diff --git a/examples/training/README.md b/examples/training/README.md new file mode 100644 index 000000000..34689ceb6 --- /dev/null +++ b/examples/training/README.md @@ -0,0 +1,5 @@ + + +# spaCy examples + +See [examples/README.md](../README.md) From 508cb3bef75079cb132b4a9754a197de421d07f8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 12 Mar 2021 09:41:59 +0100 Subject: [PATCH 09/18] Also exclude user hooks in displacy conversion (#7419) --- spacy/displacy/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 2049809a7..aa61fb9f7 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -120,7 +120,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: doc (Doc): Document do parse. RETURNS (dict): Generated dependency parse keyed by words and arcs. """ - doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"])) + doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data", "user_hooks"])) if not doc.has_annotation("DEP"): warnings.warn(Warnings.W005) if options.get("collapse_phrases", False): From ce6317231f047fcfc946fa9139c7f56ded9cb84f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 12 Mar 2021 09:51:26 +0100 Subject: [PATCH 10/18] Add --code to spacy debug CLI --- spacy/cli/debug_config.py | 2 +- spacy/cli/debug_data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index 549072a1e..56ee12336 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -20,7 +20,7 @@ def debug_config_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"), show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. 
This will also reflect variables overwritten on the CLI.") # fmt: on diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 316b615c5..be11f8d1c 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -39,7 +39,7 @@ def debug_data_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"), verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"), no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"), From 03e9e7b567dc05b428fd689dab2f997c260875ab Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 12 Mar 2021 10:00:31 +0100 Subject: [PATCH 11/18] Add --code option to init fill-config --- spacy/cli/init_config.py | 7 +++++-- website/docs/api/cli.md | 17 +++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 9880c389c..55622452b 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -10,7 +10,8 @@ from jinja2 import Template from .. import util from ..language import DEFAULT_CONFIG_PRETRAIN_PATH from ..schemas import RecommendationSchema -from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND, string_to_list +from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND +from ._util import string_to_list, import_code ROOT = Path(__file__).parent / "templates" @@ -70,7 +71,8 @@ def init_fill_config_cli( base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False), output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True), pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), - diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes") + diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"), + code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), # fmt: on ): """ @@ -82,6 +84,7 @@ def init_fill_config_cli( DOCS: https://spacy.io/api/cli#init-fill-config """ + import_code(code_path) fill_config(output_file, base_path, pretraining=pretraining, diff=diff) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index e8be0f79c..8564eff43 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -170,14 +170,15 @@ validation error with more details. $ python -m spacy init fill-config [base_path] [output_file] [--diff] ``` -| Name | Description | -| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | -| `output_file` | Path to output `.cfg` file. 
If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | -| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | -| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Complete and auto-filled config file for training. | +| Name | Description | +| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | +| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | +| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Complete and auto-filled config file for training. | ### init vectors {#init-vectors new="3" tag="command"} From 316810360558b3581f0132baa37072b3cb597dd7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 12 Mar 2021 10:04:57 +0100 Subject: [PATCH 12/18] Fix type of spacy train --output in docs --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 8564eff43..56d69ad6d 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -806,7 +806,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] | Name | Description | | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | -| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ | +| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ | | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. 
~~int (option)~~ | From 61472e7cb385c6ca578dce2f4301fb27666e058b Mon Sep 17 00:00:00 2001 From: bsweileh <42196212+bsweileh@users.noreply.github.com> Date: Mon, 15 Mar 2021 02:21:35 -0600 Subject: [PATCH 13/18] Update _training.md - Fix broken link on backpropagation (#7431) * Update _training.md Fix broken link on backpropagation * Add agreement add spacy contributor agreement --- .github/contributors/bsweileh.md | 106 ++++++++++++++++++++++++++++ website/docs/usage/101/_training.md | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/bsweileh.md diff --git a/.github/contributors/bsweileh.md b/.github/contributors/bsweileh.md new file mode 100644 index 000000000..13f78a4b7 --- /dev/null +++ b/.github/contributors/bsweileh.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Belal | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | March 13, 2021 | +| GitHub username | bsweileh | +| Website (optional) | | diff --git a/website/docs/usage/101/_training.md b/website/docs/usage/101/_training.md index b73a83d6a..4218c1b5a 100644 --- a/website/docs/usage/101/_training.md +++ b/website/docs/usage/101/_training.md @@ -10,7 +10,7 @@ any other information. Training is an iterative process in which the model's predictions are compared against the reference annotations in order to estimate the **gradient of the loss**. The gradient of the loss is then used to calculate the gradient of the -weights through [backpropagation](https://thinc.ai/backprop101). The gradients +weights through [backpropagation](https://thinc.ai/docs/backprop101). The gradients indicate how the weight values should be changed so that the model's predictions become more similar to the reference labels over time. 
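The training passage patched above describes the weight update in words. As a minimal sketch of that rule, this is plain stochastic gradient descent with an illustrative learning rate (a sketch only: spaCy's real optimizers come from Thinc and add momentum, decay and learning-rate schedules on top of this):

```python
import numpy as np

def sgd_step(weights, d_loss_d_weights, learn_rate=0.001):
    # Backpropagation supplies d_loss_d_weights, the gradient of the loss
    # with respect to the weights. Stepping against it nudges the model's
    # predictions toward the reference annotations.
    return weights - learn_rate * np.asarray(d_loss_d_weights)

weights = np.zeros(3)
gradient = np.array([0.5, -0.2, 0.1])
weights = sgd_step(weights, gradient)  # array([-0.0005,  0.0002, -0.0001])
```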
From 3bcf74aca7b35680a81c4239a6823aa5f46c429a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 15 Mar 2021 11:11:06 +0100 Subject: [PATCH 14/18] Rename and update ru pymorphy2 lookup lemmatize * To allow default lookup lemmatization with a blank Russian model, rename pymorphy2 lookup mode to `pymorphy2_lookup` * Bug fix: update pymorphy2 lookup lemmatize to return list rather than string --- spacy/lang/ru/lemmatizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index e4689815e..c337b9bc3 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -90,12 +90,12 @@ class RussianLemmatizer(Lemmatizer): return [string.lower()] return list(set([analysis.normal_form for analysis in filtered_analyses])) - def lookup_lemmatize(self, token: Token) -> List[str]: + def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]: string = token.text analyses = self._morph.parse(string) if len(analyses) == 1: - return analyses[0].normal_form - return string + return [analyses[0].normal_form] + return [string] def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]: From 02b5c8a1a2e49add3eaa5434678d513861dd00ab Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 16 Mar 2021 09:48:31 +0100 Subject: [PATCH 15/18] Add py.typed --- MANIFEST.in | 1 + spacy/py.typed | 0 2 files changed, 1 insertion(+) create mode 100644 spacy/py.typed diff --git a/MANIFEST.in b/MANIFEST.in index b4887cdb8..8008b4507 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,6 +3,7 @@ recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja include LICENSE include README.md include pyproject.toml +include spacy/py.typed recursive-exclude spacy/lang *.json recursive-include spacy/lang *.json.gz recursive-include spacy/cli *.json *.yml diff --git a/spacy/py.typed b/spacy/py.typed new file mode 100644 index 000000000..e69de29bb From 00e59be966f1710f4245af68b103033786e3f884 Mon Sep 17 00:00:00 2001 From: Paolo Arduin Date: Tue, 16 Mar 2021 18:22:03 +0100 Subject: [PATCH 16/18] Add SpikeX to spaCy universe --- website/meta/universe.json | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index f67b7c219..db7657591 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,36 @@ { "resources": [ + { + "id": "spikex", + "title": "SpikeX - SpaCy Pipes for Knowledge Extraction", + "slogan": "Use SpikeX to build knowledge extraction tools with almost-zero effort", + "description": "SpikeX is a collection of pipes ready to be plugged in a spaCy pipeline. 
It aims to help in building knowledge extraction tools with almost-zero effort.", + "github": "erre-quadro/spikex", + "pip": "spikex", + "code_example": [ + "from spacy import load as spacy_load", + "from spikex.wikigraph import load as wg_load", + "from spikex.pipes import WikiPageX", + "", + "# load a spacy model and get a doc", + "nlp = spacy_load('en_core_web_sm')", + "doc = nlp('An apple a day keeps the doctor away')", + "# load a WikiGraph", + "wg = wg_load('simplewiki_core')", + "# get a WikiPageX and extract all pages", + "wikipagex = WikiPageX(wg)", + "doc = wikipagex(doc)", + "# see all pages extracted from the doc", + "for span in doc._.wiki_spans:", + " print(span._.wiki_pages)" + ], + "category": ["pipeline", "standalone"], + "author": "Erre Quadro", + "author_links": { + "github": "erre-quadro", + "website": "https://www.errequadrosrl.com" + } + }, { "id": "spacy-dbpedia-spotlight", "title": "DBpedia Spotlight for SpaCy", From 3c362ac5209d6e5a2f220a0181738c0c3b992d41 Mon Sep 17 00:00:00 2001 From: Lukas Winkler Date: Thu, 18 Mar 2021 21:09:11 +0100 Subject: [PATCH 17/18] replace "is not" with != --- spacy/training/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index c791732db..6d7850212 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -142,7 +142,7 @@ def create_pretraining_model(nlp, pretrain_config): # If the config referred to a Tok2VecListener, grab the original model instead if type(tok2vec).__name__ == "Tok2VecListener": original_tok2vec = ( - tok2vec.upstream_name if tok2vec.upstream_name is not "*" else "tok2vec" + tok2vec.upstream_name if tok2vec.upstream_name != "*" else "tok2vec" ) tok2vec = nlp.get_pipe(original_tok2vec).model try: From 0ad9e16ec3524826e1da93043bd9bfdacaebd634 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 18 Mar 2021 21:18:25 +0100 Subject: [PATCH 18/18] Check for callbacks entry points --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 4b82eea8d..389e3504f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -88,7 +88,7 @@ class registry(thinc.registry): displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True) misc = catalogue.create("spacy", "misc", entry_points=True) # Callback functions used to manipulate nlp object etc. - callbacks = catalogue.create("spacy", "callbacks") + callbacks = catalogue.create("spacy", "callbacks", entry_points=True) batchers = catalogue.create("spacy", "batchers", entry_points=True) readers = catalogue.create("spacy", "readers", entry_points=True) augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
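The final patch above enables entry-point lookup for the `callbacks` registry, so a callback exposed by an installed package can be resolved by name without an explicit import. A rough sketch of what this allows, assuming the `spacy_callbacks` entry-point group that `catalogue` derives from the registry namespace (the package, module and callback names below are hypothetical):

```python
# Hypothetical third-party module: my_package/callbacks.py
#
# The package would advertise the factory in its setup.cfg, e.g.:
#
#   [options.entry_points]
#   spacy_callbacks =
#       my_package.customize = my_package.callbacks:create_customize_callback
#
# A training config could then reference it without importing the package:
#
#   [nlp.before_creation]
#   @callbacks = "my_package.customize"

def create_customize_callback():
    """Factory returning a callback that is applied to the Language class
    before the nlp object is created."""

    def customize(lang_cls):
        # Adjust the language class here, e.g. extend lang_cls.Defaults,
        # then hand it back to spaCy.
        return lang_cls

    return customize
```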