diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 2f77706b8..d0db75f9a 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -107,7 +107,7 @@ steps:
displayName: "Run CPU tests"
- script: |
- python -m pip install --pre thinc-apple-ops
+ python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml
index c9833cdba..794adee85 100644
--- a/.github/workflows/lock.yml
+++ b/.github/workflows/lock.yml
@@ -15,11 +15,11 @@ jobs:
action:
runs-on: ubuntu-latest
steps:
- - uses: dessant/lock-threads@v3
+ - uses: dessant/lock-threads@v4
with:
process-only: 'issues'
issue-inactive-days: '30'
- issue-comment: >
- This thread has been automatically locked since there
- has not been any recent activity after it was closed.
+ issue-comment: >
+ This thread has been automatically locked since there
+ has not been any recent activity after it was closed.
Please open a new issue for related bugs.
diff --git a/README.md b/README.md
index abfc3da67..195424551 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a
production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
-open-source software, released under the MIT license.
+open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
💫 **Version 3.4 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
@@ -46,6 +46,7 @@ open-source software, released under the MIT license.
| 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
+| | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
[spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3
@@ -59,6 +60,7 @@ open-source software, released under the MIT license.
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
+
## 💬 Where to ask questions
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
diff --git a/build-constraints.txt b/build-constraints.txt
index 956973abf..c1e82f1b0 100644
--- a/build-constraints.txt
+++ b/build-constraints.txt
@@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
-numpy; python_version>='3.11'
+numpy==1.23.2; python_version=='3.11'
+numpy; python_version>='3.12'
diff --git a/requirements.txt b/requirements.txt
index 7b9bbd1f4..02479f946 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ preshed>=3.0.2,<3.1.0
thinc>=9.0.0.dev1,<9.1.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
-wasabi>=0.9.1,<1.1.0
+wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0
diff --git a/setup.cfg b/setup.cfg
index 146852526..4a8c350cd 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -39,7 +39,7 @@ install_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=9.0.0.dev1,<9.1.0
- wasabi>=0.9.1,<1.1.0
+ wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
# Third-party dependencies
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index aab2c8d12..aabd1cfef 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -16,6 +16,7 @@ from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .evaluate import evaluate # noqa: F401
+from .apply import apply # noqa: F401
from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 7ce006108..c46abffe5 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -158,15 +158,15 @@ def load_project_config(
sys.exit(1)
validate_project_version(config)
validate_project_commands(config)
+ if interpolate:
+ err = f"{PROJECT_FILE} validation error"
+ with show_validation_error(title=err, hint_fill=False):
+ config = substitute_project_variables(config, overrides)
# Make sure directories defined in config exist
for subdir in config.get("directories", []):
dir_path = path / subdir
if not dir_path.exists():
dir_path.mkdir(parents=True)
- if interpolate:
- err = f"{PROJECT_FILE} validation error"
- with show_validation_error(title=err, hint_fill=False):
- config = substitute_project_variables(config, overrides)
return config
@@ -582,6 +582,29 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
+def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
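+ """Given a path, recursively collect all files beneath it, skipping
+ dot-files and dot-directories. If a suffix is given, only files whose
+ names end in that suffix are kept. A path pointing to a single file
+ is returned as-is, regardless of suffix.
+ """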
+ if not path.is_dir():
+ return [path]
+ paths = [path]
+ locs = []
+ seen = set()
+ for path in paths:
+ if str(path) in seen:
+ continue
+ seen.add(str(path))
+ if path.parts[-1].startswith("."):
+ continue
+ elif path.is_dir():
+ paths.extend(path.iterdir())
+ elif suffix is not None and not path.parts[-1].endswith(suffix):
+ continue
+ else:
+ locs.append(path)
+ # Sort the paths so the ordering is deterministic, e.g. for caching.
+ locs.sort()
+ return locs
+
+
def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
"""Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
as happens with `round(number, ndigits)`"""
diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py
new file mode 100644
index 000000000..9d170bc95
--- /dev/null
+++ b/spacy/cli/apply.py
@@ -0,0 +1,143 @@
+import tqdm
+import srsly
+
+from itertools import chain
+from pathlib import Path
+from typing import Optional, List, Iterable, cast, Union
+
+from wasabi import msg
+
+from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
+
+from ..tokens import Doc, DocBin
+from ..vocab import Vocab
+from ..util import ensure_path, load_model
+
+
+path_help = """Location of the documents to predict on.
+Can be a single file in .spacy format or a .jsonl file.
+Files with other extensions are treated as single plain text documents.
+If a directory is provided, it is traversed recursively to collect
+all files to be processed.
+The files can be a mixture of .spacy, .jsonl and plain text files.
+If a .jsonl file is provided, texts are read from the
+specified field ("text" by default)."""
+
+out_help = "Path to save the resulting .spacy file"
+code_help = (
+ "Path to Python file with additional " "code (registered functions) to be imported"
+)
+gold_help = "Use gold preprocessing provided in the .spacy files"
+force_msg = (
+ "The provided output file already exists. "
+ "To force overwriting the output file, set the --force or -F flag."
+)
+
+
+DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
+
+
+def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
+ """
+ Stream Doc objects from DocBin.
+ """
+ docbin = DocBin().from_disk(path)
+ for doc in docbin.get_docs(vocab):
+ yield doc
+
+
+def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
+ """
+ Stream "text" field from JSONL. If the field "text" is
+ not found it raises error.
+ """
+ for entry in srsly.read_jsonl(path):
+ if field not in entry:
+ msg.fail(
+ f"{path} does not contain the required '{field}' field.", exits=1
+ )
+ else:
+ yield entry[field]
+
+
+def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
+ """
+ Yields strings from text files in paths.
+ """
+ for path in paths:
+ with open(path, "r") as fin:
+ text = fin.read()
+ yield text
+
+
+@app.command("apply")
+def apply_cli(
+ # fmt: off
+ model: str = Arg(..., help="Model name or path"),
+ data_path: Path = Arg(..., help=path_help, exists=True),
+ output_file: Path = Arg(..., help=out_help, dir_okay=False),
+ code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
+ text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
+ force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
+ use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
+ batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
+ n_process: int = Opt(1, "--n-process", "-n", help="Number of processes to use.")
+):
+ """
+ Apply a trained pipeline to documents to get predictions.
+ Expects a loadable spaCy pipeline and path to the data, which
+ can be a directory or a file.
+ The data files can be provided in multiple formats:
+ 1. .spacy files
+ 2. .jsonl files with a specified "field" to read the text from.
+ 3. Files with any other extension are assumed to contain
+ a single plain text document.
+ DOCS: https://spacy.io/api/cli#apply
+ """
+ data_path = ensure_path(data_path)
+ output_file = ensure_path(output_file)
+ code_path = ensure_path(code_path)
+ if output_file.exists() and not force_overwrite:
+ msg.fail(force_msg, exits=1)
+ if not data_path.exists():
+ msg.fail(f"Couldn't find data path: {data_path}", exits=1)
+ import_code(code_path)
+ setup_gpu(use_gpu)
+ apply(data_path, output_file, model, text_key, batch_size, n_process)
+
+
+def apply(
+ data_path: Path,
+ output_file: Path,
+ model: str,
+ json_field: str,
+ batch_size: int,
+ n_process: int,
+):
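+ """
+ Collect the input files, stream them through the loaded pipeline
+ and serialize the annotated docs into a single DocBin on disk.
+ """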
+ docbin = DocBin(store_user_data=True)
+ paths = walk_directory(data_path)
+ if len(paths) == 0:
+ docbin.to_disk(output_file)
+ msg.warn("Did not find data to process,"
+ f" {data_path} seems to be an empty directory.")
+ return
+ nlp = load_model(model)
+ msg.good(f"Loaded model {model}")
+ vocab = nlp.vocab
+ streams: List[DocOrStrStream] = []
+ text_files = []
+ for path in paths:
+ if path.suffix == ".spacy":
+ streams.append(_stream_docbin(path, vocab))
+ elif path.suffix == ".jsonl":
+ streams.append(_stream_jsonl(path, json_field))
+ else:
+ text_files.append(path)
+ if len(text_files) > 0:
+ streams.append(_stream_texts(text_files))
+ datagen = cast(DocOrStrStream, chain(*streams))
+ for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
+ docbin.add(doc)
+ if output_file.suffix == "":
+ output_file = output_file.with_suffix(".spacy")
+ docbin.to_disk(output_file)
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 04eb7078f..7f365ae2c 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
+from typing import Callable, Iterable, Mapping, Optional, Any, Union
from enum import Enum
from pathlib import Path
from wasabi import Printer
@@ -7,7 +7,7 @@ import re
import sys
import itertools
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, walk_directory
from ..training import docs_to_json
from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@@ -189,33 +189,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
return None
-def walk_directory(path: Path, converter: str) -> List[Path]:
- if not path.is_dir():
- return [path]
- paths = [path]
- locs = []
- seen = set()
- for path in paths:
- if str(path) in seen:
- continue
- seen.add(str(path))
- if path.parts[-1].startswith("."):
- continue
- elif path.is_dir():
- paths.extend(path.iterdir())
- elif converter == "json" and not path.parts[-1].endswith("json"):
- continue
- elif converter == "conll" and not path.parts[-1].endswith("conll"):
- continue
- elif converter == "iob" and not path.parts[-1].endswith("iob"):
- continue
- else:
- locs.append(path)
- # It's good to sort these, in case the ordering messes up cache.
- locs.sort()
- return locs
-
-
def verify_cli_args(
msg: Printer,
input_path: Path,
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index a109c4a5a..6dd174902 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -101,8 +101,8 @@ def project_run(
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
- err_kwargs = {"exits": 1} if not dry else {}
- msg.fail(err, err_help, **err_kwargs)
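+ # On a dry run, just report the missing dependency without exiting.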
+ err_exits = 1 if not dry else None
+ msg.fail(err, err_help, exits=err_exits)
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
with working_dir(project_dir) as current_dir:
msg.divider(subcommand)
diff --git a/spacy/errors.py b/spacy/errors.py
index c097348ef..e800be1fa 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -336,6 +336,11 @@ class Errors(metaclass=ErrorsWithCodes):
"clear the existing vectors and resize the table.")
E074 = ("Error interpreting compiled match pattern: patterns are expected "
"to end with the attribute {attr}. Got: {bad_attr}.")
+ E079 = ("Error computing states in beam: number of predicted beams "
+ "({pbeams}) does not equal number of gold beams ({gbeams}).")
+ E080 = ("Duplicate state found in beam: {key}.")
+ E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+ "does not equal number of losses ({losses}).")
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
"match.")
diff --git a/spacy/lang/nl/stop_words.py b/spacy/lang/nl/stop_words.py
index a2c6198e7..cd4fdefdf 100644
--- a/spacy/lang/nl/stop_words.py
+++ b/spacy/lang/nl/stop_words.py
@@ -15,7 +15,7 @@
STOP_WORDS = set(
"""
-aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
+aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
afgelopen aldus alhoewel anderzijds
ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 0531d4ba5..2a2242aa4 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -350,9 +350,9 @@ class EditTreeLemmatizer(TrainablePipe):
tree = dict(tree)
if "orig" in tree:
- tree["orig"] = self.vocab.strings[tree["orig"]]
+ tree["orig"] = self.vocab.strings.add(tree["orig"])
if "orig" in tree:
- tree["subst"] = self.vocab.strings[tree["subst"]]
+ tree["subst"] = self.vocab.strings.add(tree["subst"])
trees.append(tree)
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index bdf933c10..d64be66f6 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -155,11 +155,8 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
-<<<<<<< HEAD
- save_activations (bool): save model activations in Doc when annotating.
-=======
scorer (Optional[Callable]): The scoring method.
->>>>>>> upstream/master
+ save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/textcategorizer#init
"""
diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py
index c334cc6eb..1f2d7d999 100644
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
# head before start
arr = doc.to_array(["HEAD"])
- arr[0] = -1
+ arr[0] = numpy.int32(-1).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr)
# head after end
arr = doc.to_array(["HEAD"])
- arr[0] = 5
+ arr[0] = numpy.int32(5).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr)
diff --git a/spacy/tests/doc/test_span_group.py b/spacy/tests/doc/test_span_group.py
index da3c24908..5e8bea127 100644
--- a/spacy/tests/doc/test_span_group.py
+++ b/spacy/tests/doc/test_span_group.py
@@ -1,7 +1,10 @@
+from typing import List
+
import pytest
from random import Random
from spacy.matcher import Matcher
-from spacy.tokens import Span, SpanGroup
+from spacy.tokens import Span, SpanGroup, Doc
+from spacy.util import filter_spans
@pytest.fixture
@@ -242,3 +245,13 @@ def test_span_group_extend(doc):
def test_span_group_dealloc(span_group):
with pytest.raises(AttributeError):
print(span_group.doc)
+
+
+@pytest.mark.issue(11975)
+def test_span_group_typing(doc: Doc):
+ """Tests whether typing of `SpanGroup` as `Iterable[Span]`-like object is accepted by mypy."""
+ span_group: SpanGroup = doc.spans["SPANS"]
+ spans: List[Span] = list(span_group)
+ for i, span in enumerate(span_group):
+ assert span == span_group[i] == spans[i]
+ filter_spans(span_group)
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index ad2e56729..5eeb55aa2 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -62,10 +62,45 @@ def test_initialize_from_labels():
nlp2 = Language()
lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
lemmatizer2.initialize(
- get_examples=lambda: train_examples,
+ # We want to check that the strings in replacement nodes are
+ # added to the string store, so use a single example to avoid
+ # adding them all through the examples themselves.
+ get_examples=lambda: train_examples[:1],
labels=lemmatizer.label_data,
)
assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
+ assert lemmatizer2.label_data == {
+ "trees": [
+ {"orig": "S", "subst": "s"},
+ {
+ "prefix_len": 1,
+ "suffix_len": 0,
+ "prefix_tree": 0,
+ "suffix_tree": 4294967295,
+ },
+ {"orig": "s", "subst": ""},
+ {
+ "prefix_len": 0,
+ "suffix_len": 1,
+ "prefix_tree": 4294967295,
+ "suffix_tree": 2,
+ },
+ {
+ "prefix_len": 0,
+ "suffix_len": 0,
+ "prefix_tree": 4294967295,
+ "suffix_tree": 4294967295,
+ },
+ {"orig": "E", "subst": "e"},
+ {
+ "prefix_len": 1,
+ "suffix_len": 0,
+ "prefix_tree": 5,
+ "suffix_tree": 4294967295,
+ },
+ ],
+ "labels": (1, 3, 4, 6),
+ }
def test_no_data():
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 2e706458f..c6768a3fd 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -5,6 +5,7 @@ from typing import Tuple, List, Dict, Any
import pkg_resources
import time
+import spacy
import numpy
import pytest
import srsly
@@ -32,6 +33,7 @@ from spacy.cli.package import _is_permitted_package_name
from spacy.cli.project.remote_storage import RemoteStorage
from spacy.cli.project.run import _check_requirements
from spacy.cli.validate import get_model_pkgs
+from spacy.cli.apply import apply
from spacy.cli.find_threshold import find_threshold
from spacy.lang.en import English
from spacy.lang.nl import Dutch
@@ -123,6 +125,25 @@ def test_issue7055():
assert "model" in filled_cfg["components"]["ner"]
+@pytest.mark.issue(11235)
+def test_issue11235():
+ """
+ Test that the cli handles interpolation in the directory names correctly when loading project config.
+ """
+ lang_var = "en"
+ variables = {"lang": lang_var}
+ commands = [{"name": "x", "script": ["hello ${vars.lang}"]}]
+ directories = ["cfg", "${vars.lang}_model"]
+ project = {"commands": commands, "vars": variables, "directories": directories}
+ with make_tempdir() as d:
+ srsly.write_yaml(d / "project.yml", project)
+ cfg = load_project_config(d)
+ # Check that the directories are interpolated and created correctly
+ assert os.path.exists(d / "cfg")
+ assert os.path.exists(d / f"{lang_var}_model")
+ assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
+
+
def test_cli_info():
nlp = Dutch()
nlp.add_pipe("textcat")
@@ -866,6 +887,82 @@ def test_span_length_freq_dist_output_must_be_correct():
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
+def test_applycli_empty_dir():
+ with make_tempdir() as data_path:
+ output = data_path / "test.spacy"
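+ # Applying to an empty directory should warn and write an empty DocBin.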
+ apply(data_path, output, "blank:en", "text", 1, 1)
+
+
+def test_applycli_docbin():
+ with make_tempdir() as data_path:
+ output = data_path / "testout.spacy"
+ nlp = spacy.blank("en")
+ doc = nlp("testing apply cli.")
+ # test empty DocBin case
+ docbin = DocBin()
+ docbin.to_disk(data_path / "testin.spacy")
+ apply(data_path, output, "blank:en", "text", 1, 1)
+ docbin.add(doc)
+ docbin.to_disk(data_path / "testin.spacy")
+ apply(data_path, output, "blank:en", "text", 1, 1)
+
+
+def test_applycli_jsonl():
+ with make_tempdir() as data_path:
+ output = data_path / "testout.spacy"
+ data = [{"field": "Testing apply cli.", "key": 234}]
+ data2 = [{"field": "234"}]
+ srsly.write_jsonl(data_path / "test.jsonl", data)
+ apply(data_path, output, "blank:en", "field", 1, 1)
+ srsly.write_jsonl(data_path / "test2.jsonl", data2)
+ apply(data_path, output, "blank:en", "field", 1, 1)
+
+
+def test_applycli_txt():
+ with make_tempdir() as data_path:
+ output = data_path / "testout.spacy"
+ with open(data_path / "test.foo", "w") as ftest:
+ ftest.write("Testing apply cli.")
+ apply(data_path, output, "blank:en", "text", 1, 1)
+
+
+def test_applycli_mixed():
+ with make_tempdir() as data_path:
+ output = data_path / "testout.spacy"
+ text = "Testing apply cli"
+ nlp = spacy.blank("en")
+ doc = nlp(text)
+ jsonl_data = [{"text": text}]
+ srsly.write_jsonl(data_path / "test.jsonl", jsonl_data)
+ docbin = DocBin()
+ docbin.add(doc)
+ docbin.to_disk(data_path / "testin.spacy")
+ with open(data_path / "test.txt", "w") as ftest:
+ ftest.write(text)
+ apply(data_path, output, "blank:en", "text", 1, 1)
+ # Check whether it worked
+ result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
+ assert len(result) == 3
+ for doc in result:
+ assert doc.text == text
+
+
+def test_applycli_user_data():
+ Doc.set_extension("ext", default=0)
+ val = ("ext", 0)
+ with make_tempdir() as data_path:
+ output = data_path / "testout.spacy"
+ nlp = spacy.blank("en")
+ doc = nlp("testing apply cli.")
+ doc._.ext = val
+ docbin = DocBin(store_user_data=True)
+ docbin.add(doc)
+ docbin.to_disk(data_path / "testin.spacy")
+ apply(data_path, output, "blank:en", "", 1, 1)
+ result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
+ assert result[0]._.ext == val
+
+
def test_local_remote_storage():
with make_tempdir() as d:
filename = "a.txt"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index b7506f745..25af6ca6a 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -359,6 +359,7 @@ cdef class Doc:
for annot in annotations:
if annot:
if annot is heads or annot is sent_starts or annot is ent_iobs:
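+ # Newer numpy versions refuse to assign negative values into an
+ # unsigned array; cast through int32 so the values wrap explicitly.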
+ annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = annot[i]
@@ -1573,6 +1574,7 @@ cdef class Doc:
for j, (attr, annot) in enumerate(token_annotations.items()):
if attr is HEAD:
+ annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
for i in range(len(words)):
array[i, j] = annot[i]
elif attr is MORPH:
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index abda49361..5168f3b03 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -93,8 +93,8 @@ class Span:
self,
start_idx: int,
end_idx: int,
- label: int = ...,
- kb_id: int = ...,
+ label: Union[int, str] = ...,
+ kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ...,
) -> Span: ...
@property
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 6f2d0379c..b605434fd 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -318,7 +318,7 @@ cdef class Span:
for ancestor in ancestors:
ancestor_i = ancestor.i - span_c.start
if ancestor_i in range(length):
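+ # Relative heads can be negative; cast via int32 before storing
+ # in the uint64 attribute array.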
- array[i, head_col] = ancestor_i - i
+ array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
# if there is no appropriate ancestor, define a new artificial root
value = array[i, head_col]
@@ -326,7 +326,7 @@ cdef class Span:
new_root = old_to_new_root.get(ancestor_i, None)
if new_root is not None:
# take the same artificial root as a previous token from the same sentence
- array[i, head_col] = new_root - i
+ array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
else:
# set this token as the new artificial root
array[i, head_col] = 0
diff --git a/spacy/tokens/span_group.pyi b/spacy/tokens/span_group.pyi
index 21cd124ab..0b4aa83aa 100644
--- a/spacy/tokens/span_group.pyi
+++ b/spacy/tokens/span_group.pyi
@@ -18,6 +18,7 @@ class SpanGroup:
def doc(self) -> Doc: ...
@property
def has_overlap(self) -> bool: ...
+ def __iter__(self): ...
def __len__(self) -> int: ...
def append(self, span: Span) -> None: ...
def extend(self, spans: Iterable[Span]) -> None: ...
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index 7caa01ee7..7325c1fa7 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -159,6 +159,16 @@ cdef class SpanGroup:
return self._concat(other)
return NotImplemented
+ def __iter__(self):
+ """
+ Iterate over the spans in this SpanGroup.
+ YIELDS (Span): A span in this SpanGroup.
+
+ DOCS: https://spacy.io/api/spangroup#iter
+ """
+ for i in range(self.c.size()):
+ yield self[i]
+
def append(self, Span span):
"""Add a span to the group. The span must refer to the same Doc
object as the span group.
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index dfd337b9e..95b0f0de9 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
if key not in IDS:
raise ValueError(Errors.E974.format(obj="token", key=key))
elif key in ["ORTH", "SPACY"]:
- pass
+ continue
elif key == "HEAD":
attrs.append(key)
- values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+ row = [h-i if h is not None else 0 for i, h in enumerate(value)]
elif key == "DEP":
attrs.append(key)
- values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+ row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
elif key == "SENT_START":
attrs.append(key)
- values.append([to_ternary_int(v) for v in value])
+ row = [to_ternary_int(v) for v in value]
elif key == "MORPH":
attrs.append(key)
- values.append([vocab.morphology.add(v) for v in value])
+ row = [vocab.morphology.add(v) for v in value]
else:
attrs.append(key)
if not all(isinstance(v, str) for v in value):
types = set([type(v) for v in value])
raise TypeError(Errors.E969.format(field=key, types=types)) from None
- values.append([vocab.strings.add(v) for v in value])
- array = numpy.asarray(values, dtype="uint64")
+ row = [vocab.strings.add(v) for v in value]
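+ # Negative entries (such as relative HEAD offsets) must be wrapped
+ # through int32 before they can be stored in a uint64 array.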
+ values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+ array = numpy.array(values, dtype=numpy.uint64)
return attrs, array.T
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 92a123241..275e37ee0 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -12,6 +12,7 @@ menu:
- ['train', 'train']
- ['pretrain', 'pretrain']
- ['evaluate', 'evaluate']
+ - ['apply', 'apply']
- ['find-threshold', 'find-threshold']
- ['assemble', 'assemble']
- ['package', 'package']
@@ -1162,6 +1163,37 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Training results and optional metrics and visualizations. |
+## apply {#apply new="3.5" tag="command"}
+
+Applies a trained pipeline to data and stores the resulting annotated documents
+in a `DocBin`. The input can be a single file or a directory. The recognized
+input formats are:
+
+1. `.spacy`
+2. `.jsonl` containing a user-specified `text_key`
+3. Files with any other extension are assumed to be plain text files containing
+ a single document.
+
+When a directory is provided, it is traversed recursively to collect all files.
+
+```cli
+$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
+```
+
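+For example, the following call (shown with `en_core_web_sm` as an
+illustrative pipeline and a hypothetical `./data` directory) annotates every
+file found under `./data` and saves the resulting docs to `predictions.spacy`:
+
+```cli
+$ python -m spacy apply en_core_web_sm ./data predictions.spacy
+```
+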
+| Name | Description |
+| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
+| `data_path` | Location of data to apply the pipeline to, in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
+| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
+| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--text-key`, `-tk` | The field to read the texts from in `.jsonl` files. Defaults to `text`. ~~Optional[str] \(option)~~ |
+| `--force-overwrite`, `-F` | If the provided `output-file` already exists, force `apply` to overwrite it. If this is `False` (the default), the command exits with an error instead. ~~bool (flag)~~ |
+| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
+| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
+| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
+
## find-threshold {#find-threshold new="3.5" tag="command"}
Runs prediction trials for a trained model with varying thresholds to maximize
diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md
index cd4086562..e13f25209 100644
--- a/website/docs/api/lexeme.md
+++ b/website/docs/api/lexeme.md
@@ -138,7 +138,7 @@ The L2 norm of the lexeme's vector representation.
| `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
| `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
| `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
-| `suffix_` | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~ |
+| `suffix_` | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~ |
| `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ |
| `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in lexeme.text)`. ~~bool~~ |
| `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ |
diff --git a/website/docs/api/spangroup.md b/website/docs/api/spangroup.md
index 2d1cf73c4..bd9659acb 100644
--- a/website/docs/api/spangroup.md
+++ b/website/docs/api/spangroup.md
@@ -202,6 +202,23 @@ already present in the current span group.
| `other` | The span group or spans to append. ~~Union[SpanGroup, Iterable[Span]]~~ |
| **RETURNS** | The span group. ~~SpanGroup~~ |
+## SpanGroup.\_\_iter\_\_ {#iter tag="method" new="3.5"}
+
+Iterate over the spans in this span group.
+
+> #### Example
+>
+> ```python
+> doc = nlp("Their goi ng home")
+> doc.spans["errors"] = [doc[0:1], doc[1:3]]
+> for error_span in doc.spans["errors"]:
+> print(error_span)
+> ```
+
+| Name | Description |
+| ---------- | ----------------------------------- |
+| **YIELDS** | A span in this span group. ~~Span~~ |
+
## SpanGroup.append {#append tag="method"}
Add a [`Span`](/api/span) object to the group. The span must refer to the same
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 2d8745d77..339e4085b 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -45,7 +45,7 @@
{ "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
{
"text": "Custom Solutions",
- "url": "https://explosion.ai/spacy-tailored-pipelines"
+ "url": "https://explosion.ai/custom-solutions"
}
]
}
diff --git a/website/meta/site.json b/website/meta/site.json
index 360a72178..fa79d3c69 100644
--- a/website/meta/site.json
+++ b/website/meta/site.json
@@ -51,7 +51,7 @@
{ "text": "Online Course", "url": "https://course.spacy.io" },
{
"text": "Custom Solutions",
- "url": "https://explosion.ai/spacy-tailored-pipelines"
+ "url": "https://explosion.ai/custom-solutions"
}
]
},
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 168a39a5f..84314328d 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1024,25 +1024,6 @@
"category": ["pipeline"],
"spacy_version": 2
},
- {
- "id": "spacy-sentence-segmenter",
- "title": "Sentence Segmenter",
- "slogan": "Custom sentence segmentation for spaCy",
- "code_example": [
- "from seg.newline.segmenter import NewLineSegmenter",
- "import spacy",
- "",
- "nlseg = NewLineSegmenter()",
- "nlp = spacy.load('en')",
- "nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
- "doc = nlp(my_doc_text)"
- ],
- "author": "tc64",
- "author_links": {
- "github": "tc64"
- },
- "category": ["pipeline"]
- },
{
"id": "spacy_cld",
"title": "spaCy-CLD",
@@ -1472,13 +1453,26 @@
"image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
"code_example": [
"import spacy",
- "import scattertext as st",
"",
- "nlp = spacy.load('en')",
- "corpus = st.CorpusFromPandas(convention_df,",
- " category_col='party',",
- " text_col='text',",
- " nlp=nlp).build()"
+ "from scattertext import SampleCorpora, produce_scattertext_explorer",
+ "from scattertext import produce_scattertext_html",
+ "from scattertext.CorpusFromPandas import CorpusFromPandas",
+ "",
+ "nlp = spacy.load('en_core_web_sm')",
+ "convention_df = SampleCorpora.ConventionData2012.get_data()",
+ "corpus = CorpusFromPandas(convention_df,",
+ " category_col='party',",
+ " text_col='text',",
+ " nlp=nlp).build()",
+ "",
+ "html = produce_scattertext_html(corpus,",
+ " category='democrat',",
+ " category_name='Democratic',",
+ " not_category_name='Republican',",
+ " minimum_term_frequency=5,",
+ " width_in_pixels=1000)",
+ "open('./simple.html', 'wb').write(html.encode('utf-8'))",
+ "print('Open ./simple.html in Chrome or Firefox.')"
],
"author": "Jason Kessler",
"author_links": {
diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index b7ae35f6e..c3aaa8a22 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -105,13 +105,13 @@ const Landing = ({ data }) => {
- to="https://explosion.ai/spacy-tailored-pipelines"
+ to="https://explosion.ai/custom-solutions"