Merge branch 'master_copy' into website/migration

Commit 1bae1f9f60 by svlandeg, 2023-01-05 09:53:02 +01:00
25 changed files with 459 additions and 53 deletions


@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11'
numpy==1.23.2; python_version=='3.11'
numpy; python_version>='3.12'


@ -1,5 +1,5 @@
# Our libraries
spacy-legacy>=3.0.10,<3.1.0
spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0


@ -22,6 +22,7 @@ classifiers =
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Topic :: Scientific/Engineering
project_urls =
Release notes = https://github.com/explosion/spaCy/releases
@ -41,7 +42,7 @@ setup_requires =
thinc>=8.1.0,<8.2.0
install_requires =
# Our libraries
spacy-legacy>=3.0.10,<3.1.0
spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0


@ -16,6 +16,7 @@ from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .evaluate import evaluate # noqa: F401
from .apply import apply # noqa: F401
from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401


@ -582,6 +582,29 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif suffix is not None and not path.parts[-1].endswith(suffix):
continue
else:
locs.append(path)
# It's good to sort these, in case the ordering messes up cache.
locs.sort()
return locs
def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
"""Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
as happens with `round(number, ndigits)`"""

spacy/cli/apply.py (new file, 143 lines)

@ -0,0 +1,143 @@
import tqdm
import srsly
from itertools import chain
from pathlib import Path
from typing import Optional, List, Iterable, cast, Union
from wasabi import msg
from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
from ..tokens import Doc, DocBin
from ..vocab import Vocab
from ..util import ensure_path, load_model
path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file.
Files with other extensions are treated as single plain text documents.
If a directory is provided it is traversed recursively to grab
all files to be processed.
The files can be a mixture of .spacy, .jsonl and text files.
If .jsonl is provided the specified field is going
to be grabbed ("text" by default)."""
out_help = "Path to save the resulting .spacy file"
code_help = (
"Path to Python file with additional " "code (registered functions) to be imported"
)
gold_help = "Use gold preprocessing provided in the .spacy files"
force_msg = (
"The provided output file already exists. "
"To force overwriting the output file, set the --force or -F flag."
)
DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
"""
Stream Doc objects from DocBin.
"""
docbin = DocBin().from_disk(path)
for doc in docbin.get_docs(vocab):
yield doc
def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
"""
Stream "text" field from JSONL. If the field "text" is
not found it raises error.
"""
for entry in srsly.read_jsonl(path):
if field not in entry:
msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
else:
yield entry[field]
def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
"""
Yields strings from text files in paths.
"""
for path in paths:
with open(path, "r") as fin:
text = fin.read()
yield text
@app.command("apply")
def apply_cli(
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help=path_help, exists=True),
output_file: Path = Arg(..., help=out_help, dir_okay=False),
code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
):
"""
Apply a trained pipeline to documents to get predictions.
Expects a loadable spaCy pipeline and path to the data, which
can be a directory or a file.
The data files can be provided in multiple formats:
1. .spacy files
2. .jsonl files with a specified "field" to read the text from.
3. Files with any other extension are assumed to be containing
a single document.
DOCS: https://spacy.io/api/cli#apply
"""
data_path = ensure_path(data_path)
output_file = ensure_path(output_file)
code_path = ensure_path(code_path)
if output_file.exists() and not force_overwrite:
msg.fail(force_msg, exits=1)
if not data_path.exists():
msg.fail(f"Couldn't find data path: {data_path}", exits=1)
import_code(code_path)
setup_gpu(use_gpu)
apply(data_path, output_file, model, text_key, batch_size, n_process)
def apply(
data_path: Path,
output_file: Path,
model: str,
json_field: str,
batch_size: int,
n_process: int,
):
docbin = DocBin(store_user_data=True)
paths = walk_directory(data_path)
if len(paths) == 0:
docbin.to_disk(output_file)
msg.warn(
"Did not find data to process,"
f" {data_path} seems to be an empty directory."
)
return
nlp = load_model(model)
msg.good(f"Loaded model {model}")
vocab = nlp.vocab
streams: List[DocOrStrStream] = []
text_files = []
for path in paths:
if path.suffix == ".spacy":
streams.append(_stream_docbin(path, vocab))
elif path.suffix == ".jsonl":
streams.append(_stream_jsonl(path, json_field))
else:
text_files.append(path)
if len(text_files) > 0:
streams.append(_stream_texts(text_files))
datagen = cast(DocOrStrStream, chain(*streams))
for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
docbin.add(doc)
if output_file.suffix == "":
output_file = output_file.with_suffix(".spacy")
docbin.to_disk(output_file)


@ -1,4 +1,4 @@
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
from typing import Callable, Iterable, Mapping, Optional, Any, Union
from enum import Enum
from pathlib import Path
from wasabi import Printer
@ -7,7 +7,7 @@ import re
import sys
import itertools
from ._util import app, Arg, Opt
from ._util import app, Arg, Opt, walk_directory
from ..training import docs_to_json
from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@ -189,33 +189,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
return None
def walk_directory(path: Path, converter: str) -> List[Path]:
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif converter == "json" and not path.parts[-1].endswith("json"):
continue
elif converter == "conll" and not path.parts[-1].endswith("conll"):
continue
elif converter == "iob" and not path.parts[-1].endswith("iob"):
continue
else:
locs.append(path)
# It's good to sort these, in case the ordering messes up cache.
locs.sort()
return locs
def verify_cli_args(
msg: Printer,
input_path: Path,


@ -36,7 +36,7 @@ def render(
jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers


@ -94,7 +94,7 @@ class SpanRenderer:
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
@ -510,7 +510,7 @@ class EntityRenderer:
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):


@ -962,6 +962,7 @@ class Errors(metaclass=ErrorsWithCodes):
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
"knowledge base, use `InMemoryLookupKB`.")
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
# Deprecated model shortcuts, only used in errors and warnings


@ -15,7 +15,7 @@
STOP_WORDS = set(
"""
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
afgelopen aldus alhoewel anderzijds
ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven


@ -170,7 +170,7 @@ def prioritize_existing_ents_filter(
@registry.misc("spacy.prioritize_existing_ents_filter.v1")
def make_preverse_existing_ents_filter():
def make_preserve_existing_ents_filter():
return prioritize_existing_ents_filter


@ -74,7 +74,7 @@ subword_features = true
default_config={
"threshold": 0.0,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
"scorer": {"@scorers": "spacy.textcat_scorer.v2"},
},
default_score_weights={
"cats_score": 1.0,
@ -117,7 +117,7 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
)
@registry.scorers("spacy.textcat_scorer.v1")
@registry.scorers("spacy.textcat_scorer.v2")
def make_textcat_scorer():
return textcat_score


@ -1,7 +1,10 @@
from typing import List
import pytest
from random import Random
from spacy.matcher import Matcher
from spacy.tokens import Span, SpanGroup
from spacy.tokens import Span, SpanGroup, Doc
from spacy.util import filter_spans
@pytest.fixture
@ -240,3 +243,13 @@ def test_span_group_extend(doc):
def test_span_group_dealloc(span_group):
with pytest.raises(AttributeError):
print(span_group.doc)
@pytest.mark.issue(11975)
def test_span_group_typing(doc: Doc):
"""Tests whether typing of `SpanGroup` as `Iterable[Span]`-like object is accepted by mypy."""
span_group: SpanGroup = doc.spans["SPANS"]
spans: List[Span] = list(span_group)
for i, span in enumerate(span_group):
assert span == span_group[i] == spans[i]
filter_spans(span_group)


@ -895,3 +895,22 @@ def test_textcat_multi_threshold():
scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
@pytest.mark.parametrize(
"component_name,scorer", [("textcat", "spacy.textcat_scorer.v1")]
)
def test_textcat_legacy_scorers(component_name, scorer):
"""Check that legacy scorers are registered and produce the expected score
keys."""
nlp = English()
nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})
train_examples = []
for text, annotations in TRAIN_DATA_SINGLE_LABEL:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
nlp.initialize(get_examples=lambda: train_examples)
# score the model (it's not actually trained but that doesn't matter)
scores = nlp.evaluate(train_examples)
assert 0 <= scores["cats_score"] <= 1


@ -5,6 +5,7 @@ from typing import Tuple, List, Dict, Any
import pkg_resources
import time
import spacy
import numpy
import pytest
import srsly
@ -32,6 +33,7 @@ from spacy.cli.package import _is_permitted_package_name
from spacy.cli.project.remote_storage import RemoteStorage
from spacy.cli.project.run import _check_requirements
from spacy.cli.validate import get_model_pkgs
from spacy.cli.apply import apply
from spacy.cli.find_threshold import find_threshold
from spacy.lang.en import English
from spacy.lang.nl import Dutch
@ -885,6 +887,82 @@ def test_span_length_freq_dist_output_must_be_correct():
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
def test_applycli_empty_dir():
with make_tempdir() as data_path:
output = data_path / "test.spacy"
apply(data_path, output, "blank:en", "text", 1, 1)
def test_applycli_docbin():
with make_tempdir() as data_path:
output = data_path / "testout.spacy"
nlp = spacy.blank("en")
doc = nlp("testing apply cli.")
# test empty DocBin case
docbin = DocBin()
docbin.to_disk(data_path / "testin.spacy")
apply(data_path, output, "blank:en", "text", 1, 1)
docbin.add(doc)
docbin.to_disk(data_path / "testin.spacy")
apply(data_path, output, "blank:en", "text", 1, 1)
def test_applycli_jsonl():
with make_tempdir() as data_path:
output = data_path / "testout.spacy"
data = [{"field": "Testing apply cli.", "key": 234}]
data2 = [{"field": "234"}]
srsly.write_jsonl(data_path / "test.jsonl", data)
apply(data_path, output, "blank:en", "field", 1, 1)
srsly.write_jsonl(data_path / "test2.jsonl", data2)
apply(data_path, output, "blank:en", "field", 1, 1)
def test_applycli_txt():
with make_tempdir() as data_path:
output = data_path / "testout.spacy"
with open(data_path / "test.foo", "w") as ftest:
ftest.write("Testing apply cli.")
apply(data_path, output, "blank:en", "text", 1, 1)
def test_applycli_mixed():
with make_tempdir() as data_path:
output = data_path / "testout.spacy"
text = "Testing apply cli"
nlp = spacy.blank("en")
doc = nlp(text)
jsonl_data = [{"text": text}]
srsly.write_jsonl(data_path / "test.jsonl", jsonl_data)
docbin = DocBin()
docbin.add(doc)
docbin.to_disk(data_path / "testin.spacy")
with open(data_path / "test.txt", "w") as ftest:
ftest.write(text)
apply(data_path, output, "blank:en", "text", 1, 1)
# Check whether it worked
result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
assert len(result) == 3
for doc in result:
assert doc.text == text
def test_applycli_user_data():
Doc.set_extension("ext", default=0)
val = ("ext", 0)
with make_tempdir() as data_path:
output = data_path / "testout.spacy"
nlp = spacy.blank("en")
doc = nlp("testing apply cli.")
doc._.ext = val
docbin = DocBin(store_user_data=True)
docbin.add(doc)
docbin.to_disk(data_path / "testin.spacy")
apply(data_path, output, "blank:en", "", 1, 1)
result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
assert result[0]._.ext == val
def test_local_remote_storage():
with make_tempdir() as d:
filename = "a.txt"


@ -95,8 +95,8 @@ class Span:
self,
start_idx: int,
end_idx: int,
label: int = ...,
kb_id: int = ...,
label: Union[int, str] = ...,
kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ...,
) -> Span: ...
@property


@ -18,6 +18,7 @@ class SpanGroup:
def doc(self) -> Doc: ...
@property
def has_overlap(self) -> bool: ...
def __iter__(self): ...
def __len__(self) -> int: ...
def append(self, span: Span) -> None: ...
def extend(self, spans: Iterable[Span]) -> None: ...


@ -158,6 +158,16 @@ cdef class SpanGroup:
return self._concat(other)
return NotImplemented
def __iter__(self):
"""
Iterate over the spans in this SpanGroup.
YIELDS (Span): A span in this SpanGroup.
DOCS: https://spacy.io/api/spangroup#iter
"""
for i in range(self.c.size()):
yield self[i]
def append(self, Span span):
"""Add a span to the group. The span must refer to the same Doc
object as the span group.


@ -26,6 +26,8 @@ def setup_table(
return final_cols, final_widths, ["r" for _ in final_widths]
# We cannot rename this method as it's directly imported
# and used by external packages such as spacy-loggers.
@registry.loggers("spacy.ConsoleLogger.v2")
def console_logger(
progress_bar: bool = False,
@ -33,7 +35,27 @@ def console_logger(
output_file: Optional[Union[str, Path]] = None,
):
"""The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
progress_bar (bool): Whether the logger should print the progress bar.
progress_bar (bool): Whether the logger should print a progress bar tracking the steps till the next evaluation pass.
console_output (bool): Whether the logger should print the logs on the console.
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
"""
return console_logger_v3(
progress_bar=None if progress_bar is False else "eval",
console_output=console_output,
output_file=output_file,
)
@registry.loggers("spacy.ConsoleLogger.v3")
def console_logger_v3(
progress_bar: Optional[str] = None,
console_output: bool = True,
output_file: Optional[Union[str, Path]] = None,
):
"""The ConsoleLogger.v3 prints out training logs in the console and/or saves them to a jsonl file.
progress_bar (Optional[str]): Type of progress bar to show in the console. Allowed values:
train - Tracks the number of steps from the beginning of training until the full training run is complete (training.max_steps is reached).
eval - Tracks the number of steps between the previous and next evaluation (training.eval_frequency is reached).
console_output (bool): Whether the logger should print the logs on the console.
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
"""
@ -70,6 +92,7 @@ def console_logger(
for name, proc in nlp.pipeline
if hasattr(proc, "is_trainable") and proc.is_trainable
]
max_steps = nlp.config["training"]["max_steps"]
eval_frequency = nlp.config["training"]["eval_frequency"]
score_weights = nlp.config["training"]["score_weights"]
score_cols = [col for col, value in score_weights.items() if value is not None]
@ -84,6 +107,13 @@ def console_logger(
write(msg.row(table_header, widths=table_widths, spacing=spacing))
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
progress = None
expected_progress_types = ("train", "eval")
if progress_bar is not None and progress_bar not in expected_progress_types:
raise ValueError(
Errors.E1048.format(
unexpected=progress_bar, expected=expected_progress_types
)
)
def log_step(info: Optional[Dict[str, Any]]) -> None:
nonlocal progress
@ -141,11 +171,23 @@ def console_logger(
)
)
if progress_bar:
if progress_bar == "train":
total = max_steps
desc = f"Last Eval Epoch: {info['epoch']}"
initial = info["step"]
else:
total = eval_frequency
desc = f"Epoch {info['epoch']+1}"
initial = 0
# Set disable=None, so that it disables on non-TTY
progress = tqdm.tqdm(
total=eval_frequency, disable=None, leave=False, file=stderr
total=total,
disable=None,
leave=False,
file=stderr,
initial=initial,
)
progress.set_description(f"Epoch {info['epoch']+1}")
progress.set_description(desc)
def finalize() -> None:
if output_stream:


@ -12,6 +12,7 @@ menu:
- ['train', 'train']
- ['pretrain', 'pretrain']
- ['evaluate', 'evaluate']
- ['apply', 'apply']
- ['find-threshold', 'find-threshold']
- ['assemble', 'assemble']
- ['package', 'package']
@ -1162,6 +1163,38 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Training results and optional metrics and visualizations. |
## apply {id="apply", version="3.5", tag="command"}
Applies a trained pipeline to data and stores the resulting annotated documents
in a `DocBin`. The input can be a single file or a directory. The recognized
input formats are:
1. `.spacy`
2. `.jsonl` containing a user specified `text_key`
3. Files with any other extension are assumed to be plain text files containing
a single document.
When a directory is provided it is traversed recursively to collect all files.
```cli
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
```
| Name | Description |
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
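The output is a regular [`DocBin`](/api/docbin) that can be loaded back for further processing. As a minimal sketch (the file name `predictions.spacy` and the blank English pipeline are illustrative, not produced by the command):

```python
import spacy
from spacy.tokens import DocBin

# Load the DocBin written by `spacy apply` and deserialize the Doc objects
# with a vocab; a blank pipeline of the matching language is enough here.
nlp = spacy.blank("en")
doc_bin = DocBin().from_disk("predictions.spacy")
for doc in doc_bin.get_docs(nlp.vocab):
    print(doc.text, [(ent.text, ent.label_) for ent in doc.ents])
```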
## find-threshold {id="find-threshold",version="3.5",tag="command"}
Runs prediction trials for a trained model with varying thresholds to maximize


@ -138,7 +138,7 @@ The L2 norm of the lexeme's vector representation.
| `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
| `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
| `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
| `suffix_` | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~ |
| `suffix_` | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~ |
| `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ |
| `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. ~~bool~~ |
| `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ |


@ -202,6 +202,24 @@ already present in the current span group.
| `other` | The span group or spans to append. ~~Union[SpanGroup, Iterable[Span]]~~ |
| **RETURNS** | The span group. ~~SpanGroup~~ |
## SpanGroup.\_\_iter\_\_ {id="iter",tag="method",version="3.5"}
Iterate over the spans in this span group.
> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = [doc[0:1], doc[1:3]]
> for error_span in doc.spans["errors"]:
> print(error_span)
> ```
| Name | Description |
| ---------- | ----------------------------------- |
| **YIELDS** | A span in this span group. ~~Span~~ |
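Because the group can be consumed like any `Iterable[Span]`, it also works with helpers such as `spacy.util.filter_spans`. A short sketch, reusing the spans from the example above:

```python
from spacy.util import filter_spans

doc = nlp("Their goi ng home")
doc.spans["errors"] = [doc[0:1], doc[1:3]]
# Iterating materializes the spans; filter_spans accepts the group directly
spans = list(doc.spans["errors"])
filtered = filter_spans(doc.spans["errors"])
```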
## SpanGroup.append {id="append",tag="method"}
Add a [`Span`](/api/span) object to the group. The span must refer to the same


@ -265,7 +265,7 @@ Render a dependency parse tree or named entity visualization.
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ |
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `page` | Render markup as full HTML page. Defaults to `False`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
@ -512,7 +512,7 @@ a [Weights & Biases](https://www.wandb.com/) dashboard.
Instead of using one of the built-in loggers, you can
[implement your own](/usage/training#custom-logging).
#### spacy.ConsoleLogger.v2 {id="ConsoleLogger",tag="registered function"}
#### spacy.ConsoleLogger.v2 {tag="registered function"}
> #### Example config
>
@ -564,10 +564,32 @@ start decreasing across epochs.
</Accordion>
| Name | Description |
| ---------------- | --------------------------------------------------------------------- |
| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ |
| `console_output` | Whether the logger should print the logs on the console. ~~bool~~ |
| `output_file` | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ |
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- |
| `progress_bar` | Whether the logger should print a progress bar tracking the steps till the next evaluation pass (default: `False`). ~~bool~~ |
| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ |
| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ |
#### spacy.ConsoleLogger.v3 {id="ConsoleLogger",tag="registered function"}
> #### Example config
>
> ```ini
> [training.logger]
> @loggers = "spacy.ConsoleLogger.v3"
> progress_bar = "eval"
> console_output = true
> output_file = "training_log.jsonl"
> ```
Writes the results of a training step to the console in a tabular format and
optionally saves them to a `jsonl` file.
| Name | Description |
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `progress_bar` | Type of progress bar to show in the console: `"train"`, `"eval"` or `None`. The bar tracks the number of steps until `training.max_steps` and `training.eval_frequency` are reached respectively (default: `None`). ~~Optional[str]~~ |
| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ |
| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ |
## Readers {id="readers"}


@ -4047,6 +4047,33 @@
"author_links": {
"github": "yasufumy"
}
},
{
"id": "spacy-pythainlp",
"title": "spaCy-PyThaiNLP",
"slogan": "PyThaiNLP for spaCy",
"description": "This package wraps the PyThaiNLP library to add support for Thai to spaCy.",
"github": "PyThaiNLP/spaCy-PyThaiNLP",
"code_example": [
"import spacy",
"import spacy_pythainlp.core",
"",
"nlp = spacy.blank('th')",
"nlp.add_pipe('pythainlp')",
"doc = nlp('ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน ผมอยากไปเที่ยว')",
"",
"print(list(doc.sents))",
"# output: [ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน , ผมอยากไปเที่ยว]"
],
"code_language": "python",
"author": "Wannaphong Phatthiyaphaibun",
"author_links": {
"twitter": "@wannaphong_p",
"github": "wannaphong",
"website": "https://iam.wannaphong.com/"
},
"category": ["pipeline", "research"],
"tags": ["Thai"]
}
],