Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 01:16:28 +03:00)

Commit 207565a788: Merge remote-tracking branch 'upstream/master' into chore/v4-merge-master-20221222

.github/azure-steps.yml (2 lines changed)
@@ -107,7 +107,7 @@ steps:
     displayName: "Run CPU tests"

   - script: |
-      python -m pip install --pre thinc-apple-ops
+      python -m pip install 'spacy[apple]'
       python -m pytest --pyargs spacy
     displayName: "Run CPU tests with thinc-apple-ops"
     condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
.github/workflows/lock.yml (8 lines changed)
@@ -15,11 +15,11 @@ jobs:
   action:
     runs-on: ubuntu-latest
     steps:
-      - uses: dessant/lock-threads@v3
+      - uses: dessant/lock-threads@v4
        with:
          process-only: 'issues'
          issue-inactive-days: '30'
          issue-comment: >
            This thread has been automatically locked since there
            has not been any recent activity after it was closed.
            Please open a new issue for related bugs.
README.md

@@ -14,7 +14,7 @@ parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
 production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
-open-source software, released under the MIT license.
+open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

 💫 **Version 3.4 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
@@ -46,6 +46,7 @@ open-source software, released under the MIT license.
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
 | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
+| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |

 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3

@@ -59,6 +60,7 @@ open-source software, released under the MIT license.
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md

 ## 💬 Where to ask questions

 The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
build-constraints.txt

@@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy==1.19.3; python_version=='3.9'
 numpy==1.21.3; python_version=='3.10'
-numpy; python_version>='3.11'
+numpy==1.23.2; python_version=='3.11'
+numpy; python_version>='3.12'
requirements.txt

@@ -6,7 +6,7 @@ preshed>=3.0.2,<3.1.0
 thinc>=9.0.0.dev0,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.9.1,<1.1.0
+wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.8.0
setup.cfg

@@ -39,7 +39,7 @@ install_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=9.0.0.dev0,<9.1.0
-    wasabi>=0.9.1,<1.1.0
+    wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies
spacy/cli/__init__.py

@@ -16,6 +16,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .debug_diff import debug_diff  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
+from .apply import apply  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
spacy/cli/_util.py

@@ -158,15 +158,15 @@ def load_project_config(
         sys.exit(1)
     validate_project_version(config)
     validate_project_commands(config)
+    if interpolate:
+        err = f"{PROJECT_FILE} validation error"
+        with show_validation_error(title=err, hint_fill=False):
+            config = substitute_project_variables(config, overrides)
     # Make sure directories defined in config exist
     for subdir in config.get("directories", []):
         dir_path = path / subdir
         if not dir_path.exists():
             dir_path.mkdir(parents=True)
-    if interpolate:
-        err = f"{PROJECT_FILE} validation error"
-        with show_validation_error(title=err, hint_fill=False):
-            config = substitute_project_variables(config, overrides)
     return config
@@ -582,6 +582,29 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
         local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")


+def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
+    if not path.is_dir():
+        return [path]
+    paths = [path]
+    locs = []
+    seen = set()
+    for path in paths:
+        if str(path) in seen:
+            continue
+        seen.add(str(path))
+        if path.parts[-1].startswith("."):
+            continue
+        elif path.is_dir():
+            paths.extend(path.iterdir())
+        elif suffix is not None and not path.parts[-1].endswith(suffix):
+            continue
+        else:
+            locs.append(path)
+    # It's good to sort these, in case the ordering messes up cache.
+    locs.sort()
+    return locs
+
+
 def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
     """Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
     as happens with `round(number, ndigits)`"""
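For orientation (not part of the diff): a minimal sketch of how the relocated `walk_directory` helper behaves once it lives in `spacy.cli._util`. Hidden files are skipped, directories are traversed recursively, and the optional `suffix` filters by file name ending. The directory layout in the comments is made up.

```python
from pathlib import Path
from spacy.cli._util import walk_directory

# Hypothetical layout:
#   corpus/notes.txt
#   corpus/batch1/part1.jsonl
#   corpus/.hidden/ignored.txt   <- skipped, name starts with "."
all_files = walk_directory(Path("corpus"))             # every non-hidden file, sorted
jsonl_only = walk_directory(Path("corpus"), ".jsonl")  # only files whose names end in .jsonl
print(all_files)
print(jsonl_only)
```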
spacy/cli/apply.py (new file, 143 lines)
@@ -0,0 +1,143 @@
+import tqdm
+import srsly
+
+from itertools import chain
+from pathlib import Path
+from typing import Optional, List, Iterable, cast, Union
+
+from wasabi import msg
+
+from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
+
+from ..tokens import Doc, DocBin
+from ..vocab import Vocab
+from ..util import ensure_path, load_model
+
+
+path_help = """Location of the documents to predict on.
+Can be a single file in .spacy format or a .jsonl file.
+Files with other extensions are treated as single plain text documents.
+If a directory is provided it is traversed recursively to grab
+all files to be processed.
+The files can be a mixture of .spacy, .jsonl and text files.
+If .jsonl is provided the specified field is going
+to be grabbed ("text" by default)."""
+
+out_help = "Path to save the resulting .spacy file"
+code_help = (
+    "Path to Python file with additional " "code (registered functions) to be imported"
+)
+gold_help = "Use gold preprocessing provided in the .spacy files"
+force_msg = (
+    "The provided output file already exists. "
+    "To force overwriting the output file, set the --force or -F flag."
+)
+
+
+DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
+
+
+def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
+    """
+    Stream Doc objects from DocBin.
+    """
+    docbin = DocBin().from_disk(path)
+    for doc in docbin.get_docs(vocab):
+        yield doc
+
+
+def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
+    """
+    Stream "text" field from JSONL. If the field "text" is
+    not found it raises error.
+    """
+    for entry in srsly.read_jsonl(path):
+        if field not in entry:
+            msg.fail(
+                f"{path} does not contain the required '{field}' field.", exits=1
+            )
+        else:
+            yield entry[field]
+
+
+def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
+    """
+    Yields strings from text files in paths.
+    """
+    for path in paths:
+        with open(path, "r") as fin:
+            text = fin.read()
+            yield text
+
+
+@app.command("apply")
+def apply_cli(
+    # fmt: off
+    model: str = Arg(..., help="Model name or path"),
+    data_path: Path = Arg(..., help=path_help, exists=True),
+    output_file: Path = Arg(..., help=out_help, dir_okay=False),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
+    text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
+    force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
+    batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
+    n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
+):
+    """
+    Apply a trained pipeline to documents to get predictions.
+    Expects a loadable spaCy pipeline and path to the data, which
+    can be a directory or a file.
+    The data files can be provided in multiple formats:
+    1. .spacy files
+    2. .jsonl files with a specified "field" to read the text from.
+    3. Files with any other extension are assumed to be containing
+    a single document.
+    DOCS: https://spacy.io/api/cli#apply
+    """
+    data_path = ensure_path(data_path)
+    output_file = ensure_path(output_file)
+    code_path = ensure_path(code_path)
+    if output_file.exists() and not force_overwrite:
+        msg.fail(force_msg, exits=1)
+    if not data_path.exists():
+        msg.fail(f"Couldn't find data path: {data_path}", exits=1)
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    apply(data_path, output_file, model, text_key, batch_size, n_process)
+
+
+def apply(
+    data_path: Path,
+    output_file: Path,
+    model: str,
+    json_field: str,
+    batch_size: int,
+    n_process: int,
+):
+    docbin = DocBin(store_user_data=True)
+    paths = walk_directory(data_path)
+    if len(paths) == 0:
+        docbin.to_disk(output_file)
+        msg.warn("Did not find data to process,"
+                 f" {data_path} seems to be an empty directory.")
+        return
+    nlp = load_model(model)
+    msg.good(f"Loaded model {model}")
+    vocab = nlp.vocab
+    streams: List[DocOrStrStream] = []
+    text_files = []
+    for path in paths:
+        if path.suffix == ".spacy":
+            streams.append(_stream_docbin(path, vocab))
+        elif path.suffix == ".jsonl":
+            streams.append(_stream_jsonl(path, json_field))
+        else:
+            text_files.append(path)
+    if len(text_files) > 0:
+        streams.append(_stream_texts(text_files))
+    datagen = cast(DocOrStrStream, chain(*streams))
+    for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
+        docbin.add(doc)
+    if output_file.suffix == "":
+        output_file = output_file.with_suffix(".spacy")
+    docbin.to_disk(output_file)
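A quick usage sketch (not part of the commit): running the new command over a directory of mixed inputs and then reading the predictions back. The directory name, pipeline name and text key below are assumptions for illustration.

```python
# Shell: annotate every .spacy/.jsonl/plain-text file under ./input with en_core_web_sm
#   python -m spacy apply en_core_web_sm ./input ./predictions.spacy --text-key text

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")  # only a vocab is needed to deserialize the output
docs = list(DocBin().from_disk("predictions.spacy").get_docs(nlp.vocab))
print(f"{len(docs)} annotated docs")
```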
spacy/cli/convert.py

@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
+from typing import Callable, Iterable, Mapping, Optional, Any, Union
 from enum import Enum
 from pathlib import Path
 from wasabi import Printer

@@ -7,7 +7,7 @@ import re
 import sys
 import itertools

-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, walk_directory
 from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@@ -189,33 +189,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
     return None


-def walk_directory(path: Path, converter: str) -> List[Path]:
-    if not path.is_dir():
-        return [path]
-    paths = [path]
-    locs = []
-    seen = set()
-    for path in paths:
-        if str(path) in seen:
-            continue
-        seen.add(str(path))
-        if path.parts[-1].startswith("."):
-            continue
-        elif path.is_dir():
-            paths.extend(path.iterdir())
-        elif converter == "json" and not path.parts[-1].endswith("json"):
-            continue
-        elif converter == "conll" and not path.parts[-1].endswith("conll"):
-            continue
-        elif converter == "iob" and not path.parts[-1].endswith("iob"):
-            continue
-        else:
-            locs.append(path)
-    # It's good to sort these, in case the ordering messes up cache.
-    locs.sort()
-    return locs
-
-
 def verify_cli_args(
     msg: Printer,
     input_path: Path,
spacy/cli/project/run.py

@@ -101,8 +101,8 @@ def project_run(
             if not (project_dir / dep).exists():
                 err = f"Missing dependency specified by command '{subcommand}': {dep}"
                 err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
-                err_kwargs = {"exits": 1} if not dry else {}
-                msg.fail(err, err_help, **err_kwargs)
+                err_exits = 1 if not dry else None
+                msg.fail(err, err_help, exits=err_exits)
     check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
     with working_dir(project_dir) as current_dir:
         msg.divider(subcommand)
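As context for the `err_exits` rewrite (a sketch, not from the diff): wasabi's `msg.fail` takes an `exits` argument, so passing `exits=None` prints the failure and keeps going, which is what dry runs rely on, while `exits=1` terminates the process. The command name and dependency path below are made up.

```python
from wasabi import msg

dry = True  # e.g. `spacy project run ... --dry`
err_exits = 1 if not dry else None
msg.fail(
    "Missing dependency specified by command 'train': corpus/train.spacy",
    "Maybe you forgot to run the 'project assets' command or a previous step?",
    exits=err_exits,  # None: just print; 1: exit after printing
)
```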
spacy/errors.py

@@ -336,6 +336,11 @@ class Errors(metaclass=ErrorsWithCodes):
             "clear the existing vectors and resize the table.")
    E074 = ("Error interpreting compiled match pattern: patterns are expected "
            "to end with the attribute {attr}. Got: {bad_attr}.")
+   E079 = ("Error computing states in beam: number of predicted beams "
+           "({pbeams}) does not equal number of gold beams ({gbeams}).")
+   E080 = ("Duplicate state found in beam: {key}.")
+   E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+           "does not equal number of losses ({losses}).")
    E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
            "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
            "match.")
spacy/lang/nl/stop_words.py

@@ -15,7 +15,7 @@

 STOP_WORDS = set(
 """
-aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
+aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
 afgelopen aldus alhoewel anderzijds

 ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven
spacy/pipeline/edit_tree_lemmatizer.py

@@ -350,9 +350,9 @@ class EditTreeLemmatizer(TrainablePipe):

             tree = dict(tree)
             if "orig" in tree:
-                tree["orig"] = self.vocab.strings[tree["orig"]]
+                tree["orig"] = self.vocab.strings.add(tree["orig"])
             if "orig" in tree:
-                tree["subst"] = self.vocab.strings[tree["subst"]]
+                tree["subst"] = self.vocab.strings.add(tree["subst"])

             trees.append(tree)
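Background for the `strings.add` change (a sketch, not part of the commit): indexing the `StringStore` with a string only computes the hash, while `add` also interns the string, so the hash can later be resolved back to text, which the lemmatizer needs when it rebuilds trees from `label_data`. The example string is arbitrary.

```python
from spacy.strings import StringStore

strings = StringStore()
h = strings["ing"]           # hash is computed, but "ing" is not stored
h2 = strings.add("ing")      # same hash, and the string is now retrievable
assert h == h2
assert strings[h2] == "ing"  # lookup by hash only works after add()
```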
spacy/tests/doc/test_array.py

@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)

     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
spacy/tests/doc/test_span_group.py

@@ -1,7 +1,10 @@
+from typing import List
+
 import pytest
 from random import Random
 from spacy.matcher import Matcher
-from spacy.tokens import Span, SpanGroup
+from spacy.tokens import Span, SpanGroup, Doc
+from spacy.util import filter_spans


 @pytest.fixture

@@ -242,3 +245,13 @@ def test_span_group_extend(doc):
 def test_span_group_dealloc(span_group):
     with pytest.raises(AttributeError):
         print(span_group.doc)
+
+
+@pytest.mark.issue(11975)
+def test_span_group_typing(doc: Doc):
+    """Tests whether typing of `SpanGroup` as `Iterable[Span]`-like object is accepted by mypy."""
+    span_group: SpanGroup = doc.spans["SPANS"]
+    spans: List[Span] = list(span_group)
+    for i, span in enumerate(span_group):
+        assert span == span_group[i] == spans[i]
+    filter_spans(span_group)
spacy/tests/pipeline/test_edit_tree_lemmatizer.py

@@ -62,10 +62,45 @@ def test_initialize_from_labels():
     nlp2 = Language()
     lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
     lemmatizer2.initialize(
-        get_examples=lambda: train_examples,
+        # We want to check that the strings in replacement nodes are
+        # added to the string store. Avoid that they get added through
+        # the examples.
+        get_examples=lambda: train_examples[:1],
         labels=lemmatizer.label_data,
     )
     assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
+    assert lemmatizer2.label_data == {
+        "trees": [
+            {"orig": "S", "subst": "s"},
+            {
+                "prefix_len": 1,
+                "suffix_len": 0,
+                "prefix_tree": 0,
+                "suffix_tree": 4294967295,
+            },
+            {"orig": "s", "subst": ""},
+            {
+                "prefix_len": 0,
+                "suffix_len": 1,
+                "prefix_tree": 4294967295,
+                "suffix_tree": 2,
+            },
+            {
+                "prefix_len": 0,
+                "suffix_len": 0,
+                "prefix_tree": 4294967295,
+                "suffix_tree": 4294967295,
+            },
+            {"orig": "E", "subst": "e"},
+            {
+                "prefix_len": 1,
+                "suffix_len": 0,
+                "prefix_tree": 5,
+                "suffix_tree": 4294967295,
+            },
+        ],
+        "labels": (1, 3, 4, 6),
+    }


 def test_no_data():
spacy/tests/test_cli.py

@@ -5,6 +5,7 @@ from typing import Tuple, List, Dict, Any
 import pkg_resources
 import time

+import spacy
 import numpy
 import pytest
 import srsly

@@ -32,6 +33,7 @@ from spacy.cli.package import _is_permitted_package_name
 from spacy.cli.project.remote_storage import RemoteStorage
 from spacy.cli.project.run import _check_requirements
 from spacy.cli.validate import get_model_pkgs
+from spacy.cli.apply import apply
 from spacy.cli.find_threshold import find_threshold
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch

@@ -123,6 +125,25 @@ def test_issue7055():
     assert "model" in filled_cfg["components"]["ner"]


+@pytest.mark.issue(11235)
+def test_issue11235():
+    """
+    Test that the cli handles interpolation in the directory names correctly when loading project config.
+    """
+    lang_var = "en"
+    variables = {"lang": lang_var}
+    commands = [{"name": "x", "script": ["hello ${vars.lang}"]}]
+    directories = ["cfg", "${vars.lang}_model"]
+    project = {"commands": commands, "vars": variables, "directories": directories}
+    with make_tempdir() as d:
+        srsly.write_yaml(d / "project.yml", project)
+        cfg = load_project_config(d)
+        # Check that the directories are interpolated and created correctly
+        assert os.path.exists(d / "cfg")
+        assert os.path.exists(d / f"{lang_var}_model")
+        assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
+
+
 def test_cli_info():
     nlp = Dutch()
     nlp.add_pipe("textcat")
@@ -866,6 +887,82 @@ def test_span_length_freq_dist_output_must_be_correct():
     assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]


+def test_applycli_empty_dir():
+    with make_tempdir() as data_path:
+        output = data_path / "test.spacy"
+        apply(data_path, output, "blank:en", "text", 1, 1)
+
+
+def test_applycli_docbin():
+    with make_tempdir() as data_path:
+        output = data_path / "testout.spacy"
+        nlp = spacy.blank("en")
+        doc = nlp("testing apply cli.")
+        # test empty DocBin case
+        docbin = DocBin()
+        docbin.to_disk(data_path / "testin.spacy")
+        apply(data_path, output, "blank:en", "text", 1, 1)
+        docbin.add(doc)
+        docbin.to_disk(data_path / "testin.spacy")
+        apply(data_path, output, "blank:en", "text", 1, 1)
+
+
+def test_applycli_jsonl():
+    with make_tempdir() as data_path:
+        output = data_path / "testout.spacy"
+        data = [{"field": "Testing apply cli.", "key": 234}]
+        data2 = [{"field": "234"}]
+        srsly.write_jsonl(data_path / "test.jsonl", data)
+        apply(data_path, output, "blank:en", "field", 1, 1)
+        srsly.write_jsonl(data_path / "test2.jsonl", data2)
+        apply(data_path, output, "blank:en", "field", 1, 1)
+
+
+def test_applycli_txt():
+    with make_tempdir() as data_path:
+        output = data_path / "testout.spacy"
+        with open(data_path / "test.foo", "w") as ftest:
+            ftest.write("Testing apply cli.")
+        apply(data_path, output, "blank:en", "text", 1, 1)
+
+
+def test_applycli_mixed():
+    with make_tempdir() as data_path:
+        output = data_path / "testout.spacy"
+        text = "Testing apply cli"
+        nlp = spacy.blank("en")
+        doc = nlp(text)
+        jsonl_data = [{"text": text}]
+        srsly.write_jsonl(data_path / "test.jsonl", jsonl_data)
+        docbin = DocBin()
+        docbin.add(doc)
+        docbin.to_disk(data_path / "testin.spacy")
+        with open(data_path / "test.txt", "w") as ftest:
+            ftest.write(text)
+        apply(data_path, output, "blank:en", "text", 1, 1)
+        # Check whether it worked
+        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
+        assert len(result) == 3
+        for doc in result:
+            assert doc.text == text
+
+
+def test_applycli_user_data():
+    Doc.set_extension("ext", default=0)
+    val = ("ext", 0)
+    with make_tempdir() as data_path:
+        output = data_path / "testout.spacy"
+        nlp = spacy.blank("en")
+        doc = nlp("testing apply cli.")
+        doc._.ext = val
+        docbin = DocBin(store_user_data=True)
+        docbin.add(doc)
+        docbin.to_disk(data_path / "testin.spacy")
+        apply(data_path, output, "blank:en", "", 1, 1)
+        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
+        assert result[0]._.ext == val
+
+
 def test_local_remote_storage():
     with make_tempdir() as d:
         filename = "a.txt"
spacy/tokens/doc.pyx

@@ -359,6 +359,7 @@ cdef class Doc:
             for annot in annotations:
                 if annot:
                     if annot is heads or annot is sent_starts or annot is ent_iobs:
+                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                         for i in range(len(words)):
                             if attrs.ndim == 1:
                                 attrs[i] = annot[i]

@@ -1573,6 +1574,7 @@ cdef class Doc:

         for j, (attr, annot) in enumerate(token_annotations.items()):
             if attr is HEAD:
+                annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                 for i in range(len(words)):
                     array[i, j] = annot[i]
             elif attr is MORPH:
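Why the extra `int32` to `uint64` round-trip above (an illustration, not from the diff): HEAD-style annotations are relative offsets that can be negative, but the attribute arrays are unsigned 64-bit. Casting through `int32` stores the two's-complement bit pattern, and casting back to a signed type recovers the offset.

```python
import numpy

stored = numpy.int32(-1).astype(numpy.uint64)  # the 64-bit two's-complement pattern of -1
recovered = stored.astype(numpy.int64)         # casting back to a signed dtype gives -1 again
assert int(recovered) == -1
```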
spacy/tokens/span.pyi

@@ -93,8 +93,8 @@ class Span:
         self,
         start_idx: int,
         end_idx: int,
-        label: int = ...,
-        kb_id: int = ...,
+        label: Union[int, str] = ...,
+        kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
     ) -> Span: ...
     @property
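The stub change mirrors what the runtime already accepts: `label` and `kb_id` may be given either as strings or as string-store hashes. A small illustration (the blank pipeline and label below are made up):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Apple is looking at buying a startup")
span_str = doc.char_span(0, 5, label="ORG")                          # str label
span_int = doc.char_span(0, 5, label=nlp.vocab.strings.add("ORG"))   # int (hash) label
assert span_str.label_ == span_int.label_ == "ORG"
```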
spacy/tokens/span.pyx

@@ -318,7 +318,7 @@ cdef class Span:
             for ancestor in ancestors:
                 ancestor_i = ancestor.i - span_c.start
                 if ancestor_i in range(length):
-                    array[i, head_col] = ancestor_i - i
+                    array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

             # if there is no appropriate ancestor, define a new artificial root
             value = array[i, head_col]

@@ -326,7 +326,7 @@ cdef class Span:
                 new_root = old_to_new_root.get(ancestor_i, None)
                 if new_root is not None:
                     # take the same artificial root as a previous token from the same sentence
-                    array[i, head_col] = new_root - i
+                    array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                 else:
                     # set this token as the new artificial root
                     array[i, head_col] = 0
spacy/tokens/span_group.pyi

@@ -18,6 +18,7 @@ class SpanGroup:
     def doc(self) -> Doc: ...
     @property
     def has_overlap(self) -> bool: ...
+    def __iter__(self): ...
     def __len__(self) -> int: ...
     def append(self, span: Span) -> None: ...
     def extend(self, spans: Iterable[Span]) -> None: ...
spacy/tokens/span_group.pyx

@@ -159,6 +159,16 @@ cdef class SpanGroup:
             return self._concat(other)
         return NotImplemented

+    def __iter__(self):
+        """
+        Iterate over the spans in this SpanGroup.
+        YIELDS (Span): A span in this SpanGroup.
+
+        DOCS: https://spacy.io/api/spangroup#iter
+        """
+        for i in range(self.c.size()):
+            yield self[i]
+
     def append(self, Span span):
         """Add a span to the group. The span must refer to the same Doc
         object as the span group.
spacy/training/example.pyx

@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
        if key not in IDS:
            raise ValueError(Errors.E974.format(obj="token", key=key))
        elif key in ["ORTH", "SPACY"]:
-           pass
+           continue
        elif key == "HEAD":
            attrs.append(key)
-           values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+           row = [h-i if h is not None else 0 for i, h in enumerate(value)]
        elif key == "DEP":
            attrs.append(key)
-           values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+           row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
        elif key == "SENT_START":
            attrs.append(key)
-           values.append([to_ternary_int(v) for v in value])
+           row = [to_ternary_int(v) for v in value]
        elif key == "MORPH":
            attrs.append(key)
-           values.append([vocab.morphology.add(v) for v in value])
+           row = [vocab.morphology.add(v) for v in value]
        else:
            attrs.append(key)
            if not all(isinstance(v, str) for v in value):
                types = set([type(v) for v in value])
                raise TypeError(Errors.E969.format(field=key, types=types)) from None
-           values.append([vocab.strings.add(v) for v in value])
-   array = numpy.asarray(values, dtype="uint64")
+           row = [vocab.strings.add(v) for v in value]
+       values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+   array = numpy.array(values, dtype=numpy.uint64)
    return attrs, array.T
website/docs/api/cli.md

@@ -12,6 +12,7 @@ menu:
   - ['train', 'train']
   - ['pretrain', 'pretrain']
   - ['evaluate', 'evaluate']
+  - ['apply', 'apply']
   - ['find-threshold', 'find-threshold']
   - ['assemble', 'assemble']
   - ['package', 'package']

@@ -1162,6 +1163,37 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES** | Training results and optional metrics and visualizations. |

+## apply {#apply new="3.5" tag="command"}
+
+Applies a trained pipeline to data and stores the resulting annotated documents
+in a `DocBin`. The input can be a single file or a directory. The recognized
+input formats are:
+
+1. `.spacy`
+2. `.jsonl` containing a user specified `text_key`
+3. Files with any other extension are assumed to be plain text files containing
+   a single document.
+
+When a directory is provided it is traversed recursively to collect all files.
+
+```cli
+$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
+```
+
+| Name | Description |
+| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
+| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
+| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
+| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
+| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
+| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
+| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
+| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |

 ## find-threshold {#find-threshold new="3.5" tag="command"}

 Runs prediction trials for a trained model with varying thresholds to maximize
website/docs/api/lexeme.md

@@ -138,7 +138,7 @@ The L2 norm of the lexeme's vector representation.
 | `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
 | `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
 | `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
-| `suffix_` | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~ |
+| `suffix_` | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~ |
 | `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ |
 | `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. ~~bool~~ |
 | `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ |
website/docs/api/spangroup.md

@@ -202,6 +202,23 @@ already present in the current span group.
 | `other` | The span group or spans to append. ~~Union[SpanGroup, Iterable[Span]]~~ |
 | **RETURNS** | The span group. ~~SpanGroup~~ |

+## SpanGroup.\_\_iter\_\_ {#iter tag="method" new="3.5"}
+
+Iterate over the spans in this span group.
+
+> #### Example
+>
+> ```python
+> doc = nlp("Their goi ng home")
+> doc.spans["errors"] = [doc[0:1], doc[1:3]]
+> for error_span in doc.spans["errors"]:
+>     print(error_span)
+> ```
+
+| Name | Description |
+| ---------- | ----------------------------------- |
+| **YIELDS** | A span in this span group. ~~Span~~ |
+
 ## SpanGroup.append {#append tag="method"}

 Add a [`Span`](/api/span) object to the group. The span must refer to the same
website navigation JSON (Custom Solutions links)

@@ -45,7 +45,7 @@
       { "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
       {
         "text": "Custom Solutions",
-        "url": "https://explosion.ai/spacy-tailored-pipelines"
+        "url": "https://explosion.ai/custom-solutions"
       }
     ]
   }

@@ -51,7 +51,7 @@
       { "text": "Online Course", "url": "https://course.spacy.io" },
       {
         "text": "Custom Solutions",
-        "url": "https://explosion.ai/spacy-tailored-pipelines"
+        "url": "https://explosion.ai/custom-solutions"
       }
     ]
   },
website/meta/universe.json

@@ -1024,25 +1024,6 @@
       "category": ["pipeline"],
       "spacy_version": 2
     },
-    {
-      "id": "spacy-sentence-segmenter",
-      "title": "Sentence Segmenter",
-      "slogan": "Custom sentence segmentation for spaCy",
-      "code_example": [
-        "from seg.newline.segmenter import NewLineSegmenter",
-        "import spacy",
-        "",
-        "nlseg = NewLineSegmenter()",
-        "nlp = spacy.load('en')",
-        "nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
-        "doc = nlp(my_doc_text)"
-      ],
-      "author": "tc64",
-      "author_links": {
-        "github": "tc64"
-      },
-      "category": ["pipeline"]
-    },
     {
       "id": "spacy_cld",
       "title": "spaCy-CLD",

@@ -1472,13 +1453,26 @@
       "image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
       "code_example": [
         "import spacy",
-        "import scattertext as st",
         "",
-        "nlp = spacy.load('en')",
-        "corpus = st.CorpusFromPandas(convention_df,",
-        " category_col='party',",
-        " text_col='text',",
-        " nlp=nlp).build()"
+        "from scattertext import SampleCorpora, produce_scattertext_explorer",
+        "from scattertext import produce_scattertext_html",
+        "from scattertext.CorpusFromPandas import CorpusFromPandas",
+        "",
+        "nlp = spacy.load('en_core_web_sm')",
+        "convention_df = SampleCorpora.ConventionData2012.get_data()",
+        "corpus = CorpusFromPandas(convention_df,",
+        " category_col='party',",
+        " text_col='text',",
+        " nlp=nlp).build()",
+        "",
+        "html = produce_scattertext_html(corpus,",
+        " category='democrat',",
+        " category_name='Democratic',",
+        " not_category_name='Republican',",
+        " minimum_term_frequency=5,",
+        " width_in_pixels=1000)",
+        "open('./simple.html', 'wb').write(html.encode('utf-8'))",
+        "print('Open ./simple.html in Chrome or Firefox.')"
       ],
       "author": "Jason Kessler",
       "author_links": {
website landing page component

@@ -105,13 +105,13 @@ const Landing = ({ data }) => {

         <LandingBannerGrid>
             <LandingBanner
-                to="https://explosion.ai/spacy-tailored-pipelines"
+                to="https://explosion.ai/custom-solutions"
                 button="Learn more"
                 background="#E4F4F9"
                 color="#1e1935"
                 small
             >
-                <Link to="https://explosion.ai/spacy-tailored-pipelines" hidden>
+                <Link to="https://explosion.ai/custom-solutions" hidden>
                     <img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" />
                 </Link>
                 <strong>