Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)

Merge remote-tracking branch 'upstream/master' into chore/v4-merge-master-20221222

This commit is contained in: commit 207565a788

.github/azure-steps.yml (vendored): 2 changed lines

@@ -107,7 +107,7 @@ steps:
    displayName: "Run CPU tests"

  - script: |
      python -m pip install --pre thinc-apple-ops
      python -m pip install 'spacy[apple]'
      python -m pytest --pyargs spacy
    displayName: "Run CPU tests with thinc-apple-ops"
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))

.github/workflows/lock.yml (vendored): 2 changed lines

@@ -15,7 +15,7 @@ jobs:
  action:
    runs-on: ubuntu-latest
    steps:
      - uses: dessant/lock-threads@v3
      - uses: dessant/lock-threads@v4
        with:
          process-only: 'issues'
          issue-inactive-days: '30'

@@ -14,7 +14,7 @@ parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a
production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license.
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

💫 **Version 3.4 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)

@@ -46,6 +46,7 @@ open-source software, released under the MIT license.
| 🛠 **[Changelog]**                                                                                                                                                                                                         | Changes and version history.                                                                                                                                                                                                                                                                                                 |
| 💝 **[Contribute]**                                                                                                                                                                                                       | How to contribute to the spaCy project and code base.                                                                                                                                                                                                                                                                        |
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |

[spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3

@@ -59,6 +60,7 @@ open-source software, released under the MIT license.
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md


## 💬 Where to ask questions

The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).

@@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11'
numpy==1.23.2; python_version=='3.11'
numpy; python_version>='3.12'

@@ -6,7 +6,7 @@ preshed>=3.0.2,<3.1.0
thinc>=9.0.0.dev0,<9.1.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0

@@ -39,7 +39,7 @@ install_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    thinc>=9.0.0.dev0,<9.1.0
    wasabi>=0.9.1,<1.1.0
    wasabi>=0.9.1,<1.2.0
    srsly>=2.4.3,<3.0.0
    catalogue>=2.0.6,<2.1.0
    # Third-party dependencies

@@ -16,6 +16,7 @@ from .debug_config import debug_config  # noqa: F401
from .debug_model import debug_model  # noqa: F401
from .debug_diff import debug_diff  # noqa: F401
from .evaluate import evaluate  # noqa: F401
from .apply import apply  # noqa: F401
from .convert import convert  # noqa: F401
from .init_pipeline import init_pipeline_cli  # noqa: F401
from .init_config import init_config, fill_config  # noqa: F401

@@ -158,15 +158,15 @@ def load_project_config(
        sys.exit(1)
    validate_project_version(config)
    validate_project_commands(config)
    if interpolate:
        err = f"{PROJECT_FILE} validation error"
        with show_validation_error(title=err, hint_fill=False):
            config = substitute_project_variables(config, overrides)
    # Make sure directories defined in config exist
    for subdir in config.get("directories", []):
        dir_path = path / subdir
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    if interpolate:
        err = f"{PROJECT_FILE} validation error"
        with show_validation_error(title=err, hint_fill=False):
            config = substitute_project_variables(config, overrides)
    return config

@@ -582,6 +582,29 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
            local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")


def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
    if not path.is_dir():
        return [path]
    paths = [path]
    locs = []
    seen = set()
    for path in paths:
        if str(path) in seen:
            continue
        seen.add(str(path))
        if path.parts[-1].startswith("."):
            continue
        elif path.is_dir():
            paths.extend(path.iterdir())
        elif suffix is not None and not path.parts[-1].endswith(suffix):
            continue
        else:
            locs.append(path)
    # It's good to sort these, in case the ordering messes up cache.
    locs.sort()
    return locs


def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
    """Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
    as happens with `round(number, ndigits)`"""

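For orientation (not part of the diff): a minimal sketch of how the shared helper above is meant to be called, with a hypothetical `corpus/` directory.

```python
from pathlib import Path

from spacy.cli._util import walk_directory

# Recursively collect all non-hidden files under corpus/ ...
all_files = walk_directory(Path("corpus"))
# ... or only files whose names end in the given suffix.
jsonl_files = walk_directory(Path("corpus"), suffix=".jsonl")
```
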
spacy/cli/apply.py (new file): 143 lines added

@@ -0,0 +1,143 @@
import tqdm
import srsly

from itertools import chain
from pathlib import Path
from typing import Optional, List, Iterable, cast, Union

from wasabi import msg

from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory

from ..tokens import Doc, DocBin
from ..vocab import Vocab
from ..util import ensure_path, load_model


path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file.
Files with other extensions are treated as single plain text documents.
If a directory is provided it is traversed recursively to grab
all files to be processed.
The files can be a mixture of .spacy, .jsonl and text files.
If .jsonl is provided the specified field is going
to be grabbed ("text" by default)."""

out_help = "Path to save the resulting .spacy file"
code_help = (
    "Path to Python file with additional " "code (registered functions) to be imported"
)
gold_help = "Use gold preprocessing provided in the .spacy files"
force_msg = (
    "The provided output file already exists. "
    "To force overwriting the output file, set the --force or -F flag."
)


DocOrStrStream = Union[Iterable[str], Iterable[Doc]]


def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
    """
    Stream Doc objects from DocBin.
    """
    docbin = DocBin().from_disk(path)
    for doc in docbin.get_docs(vocab):
        yield doc


def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
    """
    Stream "text" field from JSONL. If the field "text" is
    not found it raises error.
    """
    for entry in srsly.read_jsonl(path):
        if field not in entry:
            msg.fail(
                f"{path} does not contain the required '{field}' field.", exits=1
            )
        else:
            yield entry[field]


def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
    """
    Yields strings from text files in paths.
    """
    for path in paths:
        with open(path, "r") as fin:
            text = fin.read()
            yield text


@app.command("apply")
def apply_cli(
    # fmt: off
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help=path_help, exists=True),
    output_file: Path = Arg(..., help=out_help, dir_okay=False),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
    text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
    force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
    batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
    n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
):
    """
    Apply a trained pipeline to documents to get predictions.
    Expects a loadable spaCy pipeline and path to the data, which
    can be a directory or a file.
    The data files can be provided in multiple formats:
        1. .spacy files
        2. .jsonl files with a specified "field" to read the text from.
        3. Files with any other extension are assumed to be containing
           a single document.
    DOCS: https://spacy.io/api/cli#apply
    """
    data_path = ensure_path(data_path)
    output_file = ensure_path(output_file)
    code_path = ensure_path(code_path)
    if output_file.exists() and not force_overwrite:
        msg.fail(force_msg, exits=1)
    if not data_path.exists():
        msg.fail(f"Couldn't find data path: {data_path}", exits=1)
    import_code(code_path)
    setup_gpu(use_gpu)
    apply(data_path, output_file, model, text_key, batch_size, n_process)


def apply(
    data_path: Path,
    output_file: Path,
    model: str,
    json_field: str,
    batch_size: int,
    n_process: int,
):
    docbin = DocBin(store_user_data=True)
    paths = walk_directory(data_path)
    if len(paths) == 0:
        docbin.to_disk(output_file)
        msg.warn("Did not find data to process,"
                 f" {data_path} seems to be an empty directory.")
        return
    nlp = load_model(model)
    msg.good(f"Loaded model {model}")
    vocab = nlp.vocab
    streams: List[DocOrStrStream] = []
    text_files = []
    for path in paths:
        if path.suffix == ".spacy":
            streams.append(_stream_docbin(path, vocab))
        elif path.suffix == ".jsonl":
            streams.append(_stream_jsonl(path, json_field))
        else:
            text_files.append(path)
    if len(text_files) > 0:
        streams.append(_stream_texts(text_files))
    datagen = cast(DocOrStrStream, chain(*streams))
    for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
        docbin.add(doc)
    if output_file.suffix == "":
        output_file = output_file.with_suffix(".spacy")
    docbin.to_disk(output_file)

@@ -1,4 +1,4 @@
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
from typing import Callable, Iterable, Mapping, Optional, Any, Union
from enum import Enum
from pathlib import Path
from wasabi import Printer

@@ -7,7 +7,7 @@ import re
import sys
import itertools

from ._util import app, Arg, Opt
from ._util import app, Arg, Opt, walk_directory
from ..training import docs_to_json
from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs

@@ -189,33 +189,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
    return None


def walk_directory(path: Path, converter: str) -> List[Path]:
    if not path.is_dir():
        return [path]
    paths = [path]
    locs = []
    seen = set()
    for path in paths:
        if str(path) in seen:
            continue
        seen.add(str(path))
        if path.parts[-1].startswith("."):
            continue
        elif path.is_dir():
            paths.extend(path.iterdir())
        elif converter == "json" and not path.parts[-1].endswith("json"):
            continue
        elif converter == "conll" and not path.parts[-1].endswith("conll"):
            continue
        elif converter == "iob" and not path.parts[-1].endswith("iob"):
            continue
        else:
            locs.append(path)
    # It's good to sort these, in case the ordering messes up cache.
    locs.sort()
    return locs


def verify_cli_args(
    msg: Printer,
    input_path: Path,

@@ -101,8 +101,8 @@ def project_run(
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, err_help, **err_kwargs)
                err_exits = 1 if not dry else None
                msg.fail(err, err_help, exits=err_exits)
        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
        with working_dir(project_dir) as current_dir:
            msg.divider(subcommand)

@@ -336,6 +336,11 @@ class Errors(metaclass=ErrorsWithCodes):
            "clear the existing vectors and resize the table.")
    E074 = ("Error interpreting compiled match pattern: patterns are expected "
            "to end with the attribute {attr}. Got: {bad_attr}.")
    E079 = ("Error computing states in beam: number of predicted beams "
            "({pbeams}) does not equal number of gold beams ({gbeams}).")
    E080 = ("Duplicate state found in beam: {key}.")
    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
            "does not equal number of losses ({losses}).")
    E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
            "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
            "match.")

@@ -15,7 +15,7 @@

STOP_WORDS = set(
    """
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
afgelopen aldus alhoewel anderzijds

ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven

@@ -350,9 +350,9 @@ class EditTreeLemmatizer(TrainablePipe):

            tree = dict(tree)
            if "orig" in tree:
                tree["orig"] = self.vocab.strings[tree["orig"]]
                tree["orig"] = self.vocab.strings.add(tree["orig"])
            if "orig" in tree:
                tree["subst"] = self.vocab.strings[tree["subst"]]
                tree["subst"] = self.vocab.strings.add(tree["subst"])

            trees.append(tree)

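For background on why the lookup was switched to `add` (a small sketch, not part of the commit): a plain `StringStore` lookup hashes a string without interning it, so strings from deserialized edit trees would not resolve from their hashes later on.

```python
from spacy.strings import StringStore

strings = StringStore()
key = strings["ing"]      # computes the hash but does not store the string
assert "ing" not in strings
key = strings.add("ing")  # interns the string so the hash resolves later
assert "ing" in strings
```
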
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

    # head before start
    arr = doc.to_array(["HEAD"])
    arr[0] = -1
    arr[0] = numpy.int32(-1).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)

    # head after end
    arr = doc.to_array(["HEAD"])
    arr[0] = 5
    arr[0] = numpy.int32(5).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)

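A note on the casts exercised by this test (illustration only, not part of the diff): the attribute array handed to `Doc.from_array` is unsigned 64-bit, so a negative relative head offset is written as its 32-bit two's-complement bit pattern, and newer NumPy versions warn about or reject assigning a negative Python int into an unsigned array directly.

```python
import numpy

head = numpy.int32(-1).astype(numpy.uint64)  # 4294967295, the bit pattern of int32 -1
assert head.astype(numpy.int32) == -1        # round-trips back to the signed offset
```
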
@@ -1,7 +1,10 @@
from typing import List

import pytest
from random import Random
from spacy.matcher import Matcher
from spacy.tokens import Span, SpanGroup
from spacy.tokens import Span, SpanGroup, Doc
from spacy.util import filter_spans


@pytest.fixture

@@ -242,3 +245,13 @@ def test_span_group_extend(doc):
def test_span_group_dealloc(span_group):
    with pytest.raises(AttributeError):
        print(span_group.doc)


@pytest.mark.issue(11975)
def test_span_group_typing(doc: Doc):
    """Tests whether typing of `SpanGroup` as `Iterable[Span]`-like object is accepted by mypy."""
    span_group: SpanGroup = doc.spans["SPANS"]
    spans: List[Span] = list(span_group)
    for i, span in enumerate(span_group):
        assert span == span_group[i] == spans[i]
    filter_spans(span_group)

@@ -62,10 +62,45 @@ def test_initialize_from_labels():
    nlp2 = Language()
    lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
    lemmatizer2.initialize(
        get_examples=lambda: train_examples,
        # We want to check that the strings in replacement nodes are
        # added to the string store. Avoid that they get added through
        # the examples.
        get_examples=lambda: train_examples[:1],
        labels=lemmatizer.label_data,
    )
    assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
    assert lemmatizer2.label_data == {
        "trees": [
            {"orig": "S", "subst": "s"},
            {
                "prefix_len": 1,
                "suffix_len": 0,
                "prefix_tree": 0,
                "suffix_tree": 4294967295,
            },
            {"orig": "s", "subst": ""},
            {
                "prefix_len": 0,
                "suffix_len": 1,
                "prefix_tree": 4294967295,
                "suffix_tree": 2,
            },
            {
                "prefix_len": 0,
                "suffix_len": 0,
                "prefix_tree": 4294967295,
                "suffix_tree": 4294967295,
            },
            {"orig": "E", "subst": "e"},
            {
                "prefix_len": 1,
                "suffix_len": 0,
                "prefix_tree": 5,
                "suffix_tree": 4294967295,
            },
        ],
        "labels": (1, 3, 4, 6),
    }


def test_no_data():

@@ -5,6 +5,7 @@ from typing import Tuple, List, Dict, Any
import pkg_resources
import time

import spacy
import numpy
import pytest
import srsly

@@ -32,6 +33,7 @@ from spacy.cli.package import _is_permitted_package_name
from spacy.cli.project.remote_storage import RemoteStorage
from spacy.cli.project.run import _check_requirements
from spacy.cli.validate import get_model_pkgs
from spacy.cli.apply import apply
from spacy.cli.find_threshold import find_threshold
from spacy.lang.en import English
from spacy.lang.nl import Dutch

@@ -123,6 +125,25 @@ def test_issue7055():
    assert "model" in filled_cfg["components"]["ner"]


@pytest.mark.issue(11235)
def test_issue11235():
    """
    Test that the cli handles interpolation in the directory names correctly when loading project config.
    """
    lang_var = "en"
    variables = {"lang": lang_var}
    commands = [{"name": "x", "script": ["hello ${vars.lang}"]}]
    directories = ["cfg", "${vars.lang}_model"]
    project = {"commands": commands, "vars": variables, "directories": directories}
    with make_tempdir() as d:
        srsly.write_yaml(d / "project.yml", project)
        cfg = load_project_config(d)
        # Check that the directories are interpolated and created correctly
        assert os.path.exists(d / "cfg")
        assert os.path.exists(d / f"{lang_var}_model")
    assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"


def test_cli_info():
    nlp = Dutch()
    nlp.add_pipe("textcat")

@@ -866,6 +887,82 @@ def test_span_length_freq_dist_output_must_be_correct():
    assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]


def test_applycli_empty_dir():
    with make_tempdir() as data_path:
        output = data_path / "test.spacy"
        apply(data_path, output, "blank:en", "text", 1, 1)


def test_applycli_docbin():
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        nlp = spacy.blank("en")
        doc = nlp("testing apply cli.")
        # test empty DocBin case
        docbin = DocBin()
        docbin.to_disk(data_path / "testin.spacy")
        apply(data_path, output, "blank:en", "text", 1, 1)
        docbin.add(doc)
        docbin.to_disk(data_path / "testin.spacy")
        apply(data_path, output, "blank:en", "text", 1, 1)


def test_applycli_jsonl():
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        data = [{"field": "Testing apply cli.", "key": 234}]
        data2 = [{"field": "234"}]
        srsly.write_jsonl(data_path / "test.jsonl", data)
        apply(data_path, output, "blank:en", "field", 1, 1)
        srsly.write_jsonl(data_path / "test2.jsonl", data2)
        apply(data_path, output, "blank:en", "field", 1, 1)


def test_applycli_txt():
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        with open(data_path / "test.foo", "w") as ftest:
            ftest.write("Testing apply cli.")
        apply(data_path, output, "blank:en", "text", 1, 1)


def test_applycli_mixed():
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        text = "Testing apply cli"
        nlp = spacy.blank("en")
        doc = nlp(text)
        jsonl_data = [{"text": text}]
        srsly.write_jsonl(data_path / "test.jsonl", jsonl_data)
        docbin = DocBin()
        docbin.add(doc)
        docbin.to_disk(data_path / "testin.spacy")
        with open(data_path / "test.txt", "w") as ftest:
            ftest.write(text)
        apply(data_path, output, "blank:en", "text", 1, 1)
        # Check whether it worked
        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
        assert len(result) == 3
        for doc in result:
            assert doc.text == text


def test_applycli_user_data():
    Doc.set_extension("ext", default=0)
    val = ("ext", 0)
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        nlp = spacy.blank("en")
        doc = nlp("testing apply cli.")
        doc._.ext = val
        docbin = DocBin(store_user_data=True)
        docbin.add(doc)
        docbin.to_disk(data_path / "testin.spacy")
        apply(data_path, output, "blank:en", "", 1, 1)
        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
        assert result[0]._.ext == val


def test_local_remote_storage():
    with make_tempdir() as d:
        filename = "a.txt"

@@ -359,6 +359,7 @@ cdef class Doc:
            for annot in annotations:
                if annot:
                    if annot is heads or annot is sent_starts or annot is ent_iobs:
                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                        for i in range(len(words)):
                            if attrs.ndim == 1:
                                attrs[i] = annot[i]

@@ -1573,6 +1574,7 @@ cdef class Doc:

            for j, (attr, annot) in enumerate(token_annotations.items()):
                if attr is HEAD:
                    annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                    for i in range(len(words)):
                        array[i, j] = annot[i]
                elif attr is MORPH:

@@ -93,8 +93,8 @@ class Span:
        self,
        start_idx: int,
        end_idx: int,
        label: int = ...,
        kb_id: int = ...,
        label: Union[int, str] = ...,
        kb_id: Union[int, str] = ...,
        vector: Optional[Floats1d] = ...,
    ) -> Span: ...
    @property

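A runtime illustration of what the widened stub describes (a sketch; the text and the "ORG" label are arbitrary examples): string labels were already accepted by `char_span`, the type annotation now matches.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Apple is looking at buying a startup")
span = doc[0:5]
# With the updated stub, a plain string label type-checks as well as an int hash.
ent = span.char_span(0, 5, label="ORG")
print(ent.text, ent.label_)
```
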
@@ -318,7 +318,7 @@ cdef class Span:
                    for ancestor in ancestors:
                        ancestor_i = ancestor.i - span_c.start
                        if ancestor_i in range(length):
                            array[i, head_col] = ancestor_i - i
                            array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

                # if there is no appropriate ancestor, define a new artificial root
                value = array[i, head_col]

@@ -326,7 +326,7 @@ cdef class Span:
                    new_root = old_to_new_root.get(ancestor_i, None)
                    if new_root is not None:
                        # take the same artificial root as a previous token from the same sentence
                        array[i, head_col] = new_root - i
                        array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                    else:
                        # set this token as the new artificial root
                        array[i, head_col] = 0

@@ -18,6 +18,7 @@ class SpanGroup:
    def doc(self) -> Doc: ...
    @property
    def has_overlap(self) -> bool: ...
    def __iter__(self): ...
    def __len__(self) -> int: ...
    def append(self, span: Span) -> None: ...
    def extend(self, spans: Iterable[Span]) -> None: ...

@@ -159,6 +159,16 @@ cdef class SpanGroup:
            return self._concat(other)
        return NotImplemented

    def __iter__(self):
        """
        Iterate over the spans in this SpanGroup.
        YIELDS (Span): A span in this SpanGroup.

        DOCS: https://spacy.io/api/spangroup#iter
        """
        for i in range(self.c.size()):
            yield self[i]

    def append(self, Span span):
        """Add a span to the group. The span must refer to the same Doc
        object as the span group.

@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
        if key not in IDS:
            raise ValueError(Errors.E974.format(obj="token", key=key))
        elif key in ["ORTH", "SPACY"]:
            pass
            continue
        elif key == "HEAD":
            attrs.append(key)
            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
        elif key == "DEP":
            attrs.append(key)
            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
        elif key == "SENT_START":
            attrs.append(key)
            values.append([to_ternary_int(v) for v in value])
            row = [to_ternary_int(v) for v in value]
        elif key == "MORPH":
            attrs.append(key)
            values.append([vocab.morphology.add(v) for v in value])
            row = [vocab.morphology.add(v) for v in value]
        else:
            attrs.append(key)
            if not all(isinstance(v, str) for v in value):
                types = set([type(v) for v in value])
                raise TypeError(Errors.E969.format(field=key, types=types)) from None
            values.append([vocab.strings.add(v) for v in value])
    array = numpy.asarray(values, dtype="uint64")
            row = [vocab.strings.add(v) for v in value]
        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
    array = numpy.array(values, dtype=numpy.uint64)
    return attrs, array.T

@@ -12,6 +12,7 @@ menu:
  - ['train', 'train']
  - ['pretrain', 'pretrain']
  - ['evaluate', 'evaluate']
  - ['apply', 'apply']
  - ['find-threshold', 'find-threshold']
  - ['assemble', 'assemble']
  - ['package', 'package']

@@ -1162,6 +1163,37 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
| `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
| **CREATES**                               | Training results and optional metrics and visualizations.                                                                                                                            |

## apply {#apply new="3.5" tag="command"}

Applies a trained pipeline to data and stores the resulting annotated documents
in a `DocBin`. The input can be a single file or a directory. The recognized
input formats are:

1. `.spacy`
2. `.jsonl` containing a user specified `text_key`
3. Files with any other extension are assumed to be plain text files containing
   a single document.

When a directory is provided it is traversed recursively to collect all files.

```cli
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
```

| Name                                      | Description                                                                                                                                                                           |
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model`                                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                  |
| `data_path`                               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                 |
| `output-file`, `-o`                       | Output `DocBin` path. ~~str (positional)~~                                                                                                                                           |
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--text-key`, `-tk`                       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                            |
| `--force-overwrite`, `-F`                 | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                    |
| `--gpu-id`, `-g`                          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
| `--batch-size`, `-b`                      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                  |
| `--n-process`, `-n`                       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                         |
| `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
| **CREATES**                               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                             |

## find-threshold {#find-threshold new="3.5" tag="command"}

Runs prediction trials for a trained model with varying tresholds to maximize

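As a companion to the documentation above (illustrative only; the output path `out.spacy` and the blank English pipeline are assumed): once `spacy apply` has written its output, the annotations can be read back from the `DocBin`.

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
# Load the file produced by `spacy apply` and reconstruct the annotated Docs.
doc_bin = DocBin().from_disk("out.spacy")
for doc in doc_bin.get_docs(nlp.vocab):
    print(doc.text, [(ent.text, ent.label_) for ent in doc.ents])
```
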
@@ -138,7 +138,7 @@ The L2 norm of the lexeme's vector representation.
| `prefix`         | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~                                                                                                                                                                                            |
| `prefix_`        | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~                                                                                                                                                                                            |
| `suffix`         | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~                                                                                                                                                                                              |
| `suffix_`        | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~                                                                                                                                                                                            |
| `suffix_`        | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~                                                                                                                                                                                            |
| `is_alpha`       | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~                                                                                                                                                                    |
| `is_ascii`       | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. ~~bool~~                                                                                                                                                     |
| `is_digit`       | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~                                                                                                                                                                                   |

@@ -202,6 +202,23 @@ already present in the current span group.
| `other`     | The span group or spans to append. ~~Union[SpanGroup, Iterable[Span]]~~ |
| **RETURNS** | The span group. ~~SpanGroup~~                                           |

## SpanGroup.\_\_iter\_\_ {#iter tag="method" new="3.5"}

Iterate over the spans in this span group.

> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = [doc[0:1], doc[1:3]]
> for error_span in doc.spans["errors"]:
>     print(error_span)
> ```

| Name       | Description                         |
| ---------- | ----------------------------------- |
| **YIELDS** | A span in this span group. ~~Span~~ |

## SpanGroup.append {#append tag="method"}

Add a [`Span`](/api/span) object to the group. The span must refer to the same

@@ -45,7 +45,7 @@
                    { "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
                    {
                        "text": "Custom Solutions",
                        "url": "https://explosion.ai/spacy-tailored-pipelines"
                        "url": "https://explosion.ai/custom-solutions"
                    }
                ]
            }

@@ -51,7 +51,7 @@
                { "text": "Online Course", "url": "https://course.spacy.io" },
                {
                    "text": "Custom Solutions",
                    "url": "https://explosion.ai/spacy-tailored-pipelines"
                    "url": "https://explosion.ai/custom-solutions"
                }
            ]
        },

@@ -1024,25 +1024,6 @@
            "category": ["pipeline"],
            "spacy_version": 2
        },
        {
            "id": "spacy-sentence-segmenter",
            "title": "Sentence Segmenter",
            "slogan": "Custom sentence segmentation for spaCy",
            "code_example": [
                "from seg.newline.segmenter import NewLineSegmenter",
                "import spacy",
                "",
                "nlseg = NewLineSegmenter()",
                "nlp = spacy.load('en')",
                "nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
                "doc = nlp(my_doc_text)"
            ],
            "author": "tc64",
            "author_links": {
                "github": "tc64"
            },
            "category": ["pipeline"]
        },
        {
            "id": "spacy_cld",
            "title": "spaCy-CLD",

@@ -1472,13 +1453,26 @@
            "image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
            "code_example": [
                "import spacy",
                "import scattertext as st",
                "",
                "nlp = spacy.load('en')",
                "corpus = st.CorpusFromPandas(convention_df,",
                "                             category_col='party',",
                "                             text_col='text',",
                "                             nlp=nlp).build()"
                "from scattertext import SampleCorpora, produce_scattertext_explorer",
                "from scattertext import produce_scattertext_html",
                "from scattertext.CorpusFromPandas import CorpusFromPandas",
                "",
                "nlp = spacy.load('en_core_web_sm')",
                "convention_df = SampleCorpora.ConventionData2012.get_data()",
                "corpus = CorpusFromPandas(convention_df,",
                "                          category_col='party',",
                "                          text_col='text',",
                "                          nlp=nlp).build()",
                "",
                "html = produce_scattertext_html(corpus,",
                "                                    category='democrat',",
                "                                    category_name='Democratic',",
                "                                    not_category_name='Republican',",
                "                                    minimum_term_frequency=5,",
                "                                    width_in_pixels=1000)",
                "open('./simple.html', 'wb').write(html.encode('utf-8'))",
                "print('Open ./simple.html in Chrome or Firefox.')"
            ],
            "author": "Jason Kessler",
            "author_links": {

@@ -105,13 +105,13 @@ const Landing = ({ data }) => {

            <LandingBannerGrid>
                <LandingBanner
                    to="https://explosion.ai/spacy-tailored-pipelines"
                    to="https://explosion.ai/custom-solutions"
                    button="Learn more"
                    background="#E4F4F9"
                    color="#1e1935"
                    small
                >
                    <Link to="https://explosion.ai/spacy-tailored-pipelines" hidden>
                    <Link to="https://explosion.ai/custom-solutions" hidden>
                        <img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" />
                    </Link>
                    <strong>