From 7a2c58864cfb24b78c28643e22ce8c9686e1f1bf Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 26 Aug 2022 17:23:10 +0900 Subject: [PATCH 1/6] Move deps outside explosion to "third-party" (#11381) --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 708300b04..bf4890a68 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,9 +50,9 @@ install_requires = wasabi>=0.9.1,<1.1.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 + # Third-party dependencies typer>=0.3.0,<0.5.0 pathy>=0.3.5 - # Third-party dependencies tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 From ba3320097948cd5056fc068cfc1a9cc1b2d89cf2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 26 Aug 2022 16:07:16 +0200 Subject: [PATCH 2/6] Remove pathy from pyproject.toml (#11383) --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 317c5fdbe..7abd7a96f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,6 @@ requires = [ "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", "thinc>=8.1.0,<8.2.0", - "pathy", "numpy>=1.15.0", ] build-backend = "setuptools.build_meta" From 6723d76f24a55f24ef1632ac8be46567a984d0ef Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Mon, 29 Aug 2022 10:23:05 +0200 Subject: [PATCH 3/6] Add ConsoleLogger.v2 (#11214) * Init * Change logger to ConsoleLogger.v2 * adjust naming * More naming adjustments * Fix output_file reference error * ignore type * Add basic test for logger * Hopefully fix mypy issue * mypy ignore line * Update mypy line Co-authored-by: Adriane Boyd * Update test method name Co-authored-by: Adriane Boyd * Change file saving logic * Fix finalize method * increase spacy-legacy version in requirements * Update docs * small adjustments Co-authored-by: Adriane Boyd --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/tests/training/test_logger.py | 30 ++++++++ spacy/training/loggers.py | 102 +++++++++++++++++++++------- website/docs/api/legacy.md | 53 +++++++++++++++ website/docs/api/top-level.md | 57 +++++++++------- 6 files changed, 198 insertions(+), 48 deletions(-) create mode 100644 spacy/tests/training/test_logger.py diff --git a/requirements.txt b/requirements.txt index 437dd415a..3b8d66e0e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Our libraries -spacy-legacy>=3.0.9,<3.1.0 +spacy-legacy>=3.0.10,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/setup.cfg b/setup.cfg index bf4890a68..5fd820a96 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ setup_requires = thinc>=8.1.0,<8.2.0 install_requires = # Our libraries - spacy-legacy>=3.0.9,<3.1.0 + spacy-legacy>=3.0.10,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 diff --git a/spacy/tests/training/test_logger.py b/spacy/tests/training/test_logger.py new file mode 100644 index 000000000..0dfd0cbf4 --- /dev/null +++ b/spacy/tests/training/test_logger.py @@ -0,0 +1,30 @@ +import pytest +import spacy + +from spacy.training import loggers + + +@pytest.fixture() +def nlp(): + nlp = spacy.blank("en") + nlp.add_pipe("ner") + return nlp + + +@pytest.fixture() +def info(): + return { + "losses": {"ner": 100}, + "other_scores": {"ENTS_F": 0.85, "ENTS_P": 0.90, "ENTS_R": 0.80}, + "epoch": 100, + "step": 125, + "score": 85, + } + + +def test_console_logger(nlp, info): + console_logger = loggers.console_logger( + progress_bar=True, console_output=True, output_file=None + ) + log_step, finalize = console_logger(nlp) + log_step(info) diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index edd0f1959..408ea7140 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -1,10 +1,13 @@ -from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO +from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO, Union from wasabi import Printer +from pathlib import Path import tqdm import sys +import srsly from ..util import registry from ..errors import Errors +from .. import util if TYPE_CHECKING: from ..language import Language # noqa: F401 @@ -23,13 +26,44 @@ def setup_table( return final_cols, final_widths, ["r" for _ in final_widths] -@registry.loggers("spacy.ConsoleLogger.v1") -def console_logger(progress_bar: bool = False): +@registry.loggers("spacy.ConsoleLogger.v2") +def console_logger( + progress_bar: bool = False, + console_output: bool = True, + output_file: Optional[Union[str, Path]] = None, +): + """The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file. + progress_bar (bool): Whether the logger should print the progress bar. + console_output (bool): Whether the logger should print the logs on the console. + output_file (Optional[Union[str, Path]]): The file to save the training logs to. + """ + _log_exist = False + if output_file: + output_file = util.ensure_path(output_file) # type: ignore + if output_file.exists(): # type: ignore + _log_exist = True + if not output_file.parents[0].exists(): # type: ignore + output_file.parents[0].mkdir(parents=True) # type: ignore + def setup_printer( nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]: write = lambda text: print(text, file=stdout, flush=True) msg = Printer(no_print=True) + + nonlocal output_file + output_stream = None + if _log_exist: + write( + msg.warn( + f"Saving logs is disabled because {output_file} already exists." + ) + ) + output_file = None + elif output_file: + write(msg.info(f"Saving results to {output_file}")) + output_stream = open(output_file, "w", encoding="utf-8") + # ensure that only trainable components are logged logged_pipes = [ name @@ -40,13 +74,15 @@ def console_logger(progress_bar: bool = False): score_weights = nlp.config["training"]["score_weights"] score_cols = [col for col, value in score_weights.items() if value is not None] loss_cols = [f"Loss {pipe}" for pipe in logged_pipes] - spacing = 2 - table_header, table_widths, table_aligns = setup_table( - cols=["E", "#"] + loss_cols + score_cols + ["Score"], - widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6], - ) - write(msg.row(table_header, widths=table_widths, spacing=spacing)) - write(msg.row(["-" * width for width in table_widths], spacing=spacing)) + + if console_output: + spacing = 2 + table_header, table_widths, table_aligns = setup_table( + cols=["E", "#"] + loss_cols + score_cols + ["Score"], + widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6], + ) + write(msg.row(table_header, widths=table_widths, spacing=spacing)) + write(msg.row(["-" * width for width in table_widths], spacing=spacing)) progress = None def log_step(info: Optional[Dict[str, Any]]) -> None: @@ -57,12 +93,15 @@ def console_logger(progress_bar: bool = False): if progress is not None: progress.update(1) return - losses = [ - "{0:.2f}".format(float(info["losses"][pipe_name])) - for pipe_name in logged_pipes - ] + + losses = [] + log_losses = {} + for pipe_name in logged_pipes: + losses.append("{0:.2f}".format(float(info["losses"][pipe_name]))) + log_losses[pipe_name] = float(info["losses"][pipe_name]) scores = [] + log_scores = {} for col in score_cols: score = info["other_scores"].get(col, 0.0) try: @@ -73,6 +112,7 @@ def console_logger(progress_bar: bool = False): if col != "speed": score *= 100 scores.append("{0:.2f}".format(score)) + log_scores[str(col)] = score data = ( [info["epoch"], info["step"]] @@ -80,20 +120,36 @@ def console_logger(progress_bar: bool = False): + scores + ["{0:.2f}".format(float(info["score"]))] ) + + if output_stream: + # Write to log file per log_step + log_data = { + "epoch": info["epoch"], + "step": info["step"], + "losses": log_losses, + "scores": log_scores, + "score": float(info["score"]), + } + output_stream.write(srsly.json_dumps(log_data) + "\n") + if progress is not None: progress.close() - write( - msg.row(data, widths=table_widths, aligns=table_aligns, spacing=spacing) - ) - if progress_bar: - # Set disable=None, so that it disables on non-TTY - progress = tqdm.tqdm( - total=eval_frequency, disable=None, leave=False, file=stderr + if console_output: + write( + msg.row( + data, widths=table_widths, aligns=table_aligns, spacing=spacing + ) ) - progress.set_description(f"Epoch {info['epoch']+1}") + if progress_bar: + # Set disable=None, so that it disables on non-TTY + progress = tqdm.tqdm( + total=eval_frequency, disable=None, leave=False, file=stderr + ) + progress.set_description(f"Epoch {info['epoch']+1}") def finalize() -> None: - pass + if output_stream: + output_stream.close() return log_step, finalize diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md index 31d178b67..d9167c76f 100644 --- a/website/docs/api/legacy.md +++ b/website/docs/api/legacy.md @@ -248,6 +248,59 @@ added to an existing vectors table. See more details in ## Loggers {#loggers} +These functions are available from `@spacy.registry.loggers`. + +### spacy.ConsoleLogger.v1 {#ConsoleLogger_v1} + +> #### Example config +> +> ```ini +> [training.logger] +> @loggers = "spacy.ConsoleLogger.v1" +> progress_bar = true +> ``` + +Writes the results of a training step to the console in a tabular format. + + + +```cli +$ python -m spacy train config.cfg +``` + +``` +ℹ Using CPU +ℹ Loading config and nlp from: config.cfg +ℹ Pipeline: ['tok2vec', 'tagger'] +ℹ Start training +ℹ Training. Initial learn rate: 0.0 + +E # LOSS TOK2VEC LOSS TAGGER TAG_ACC SCORE +--- ------ ------------ ----------- ------- ------ + 0 0 0.00 86.20 0.22 0.00 + 0 200 3.08 18968.78 34.00 0.34 + 0 400 31.81 22539.06 33.64 0.34 + 0 600 92.13 22794.91 43.80 0.44 + 0 800 183.62 21541.39 56.05 0.56 + 0 1000 352.49 25461.82 65.15 0.65 + 0 1200 422.87 23708.82 71.84 0.72 + 0 1400 601.92 24994.79 76.57 0.77 + 0 1600 662.57 22268.02 80.20 0.80 + 0 1800 1101.50 28413.77 82.56 0.83 + 0 2000 1253.43 28736.36 85.00 0.85 + 0 2200 1411.02 28237.53 87.42 0.87 + 0 2400 1605.35 28439.95 88.70 0.89 +``` + +Note that the cumulative loss keeps increasing within one epoch, but should +start decreasing across epochs. + + + +| Name | Description | +| -------------- | --------------------------------------------------------- | +| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ | + Logging utilities for spaCy are implemented in the [`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the functions are typically available from `@spacy.registry.loggers`. diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 1e1925442..c3dc42f1a 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -275,8 +275,8 @@ Render a dependency parse tree or named entity visualization. ### displacy.parse_deps {#displacy.parse_deps tag="method" new="2"} -Generate dependency parse in `{'words': [], 'arcs': []}` format. -For use with the `manual=True` argument in `displacy.render`. +Generate dependency parse in `{'words': [], 'arcs': []}` format. For use with +the `manual=True` argument in `displacy.render`. > #### Example > @@ -297,8 +297,8 @@ For use with the `manual=True` argument in `displacy.render`. ### displacy.parse_ents {#displacy.parse_ents tag="method" new="2"} -Generate named entities in `[{start: i, end: i, label: 'label'}]` format. -For use with the `manual=True` argument in `displacy.render`. +Generate named entities in `[{start: i, end: i, label: 'label'}]` format. For +use with the `manual=True` argument in `displacy.render`. > #### Example > @@ -319,8 +319,8 @@ For use with the `manual=True` argument in `displacy.render`. ### displacy.parse_spans {#displacy.parse_spans tag="method" new="2"} -Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format. -For use with the `manual=True` argument in `displacy.render`. +Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format. For +use with the `manual=True` argument in `displacy.render`. > #### Example > @@ -505,7 +505,7 @@ finished. To log each training step, a and the accuracy scores on the development set. The built-in, default logger is the ConsoleLogger, which prints results to the -console in tabular format. The +console in tabular format and saves them to a `jsonl` file. The [spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as a dependency of spaCy, enables other loggers, such as one that sends results to a [Weights & Biases](https://www.wandb.com/) dashboard. @@ -513,16 +513,20 @@ a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers, you can [implement your own](/usage/training#custom-logging). -#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"} +#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"} > #### Example config > > ```ini > [training.logger] -> @loggers = "spacy.ConsoleLogger.v1" +> @loggers = "spacy.ConsoleLogger.v2" +> progress_bar = true +> console_output = true +> output_file = "training_log.jsonl" > ``` -Writes the results of a training step to the console in a tabular format. +Writes the results of a training step to the console in a tabular format and +saves them to a `jsonl` file. @@ -536,22 +540,23 @@ $ python -m spacy train config.cfg ℹ Pipeline: ['tok2vec', 'tagger'] ℹ Start training ℹ Training. Initial learn rate: 0.0 +ℹ Saving results to training_log.jsonl E # LOSS TOK2VEC LOSS TAGGER TAG_ACC SCORE --- ------ ------------ ----------- ------- ------ - 1 0 0.00 86.20 0.22 0.00 - 1 200 3.08 18968.78 34.00 0.34 - 1 400 31.81 22539.06 33.64 0.34 - 1 600 92.13 22794.91 43.80 0.44 - 1 800 183.62 21541.39 56.05 0.56 - 1 1000 352.49 25461.82 65.15 0.65 - 1 1200 422.87 23708.82 71.84 0.72 - 1 1400 601.92 24994.79 76.57 0.77 - 1 1600 662.57 22268.02 80.20 0.80 - 1 1800 1101.50 28413.77 82.56 0.83 - 1 2000 1253.43 28736.36 85.00 0.85 - 1 2200 1411.02 28237.53 87.42 0.87 - 1 2400 1605.35 28439.95 88.70 0.89 + 0 0 0.00 86.20 0.22 0.00 + 0 200 3.08 18968.78 34.00 0.34 + 0 400 31.81 22539.06 33.64 0.34 + 0 600 92.13 22794.91 43.80 0.44 + 0 800 183.62 21541.39 56.05 0.56 + 0 1000 352.49 25461.82 65.15 0.65 + 0 1200 422.87 23708.82 71.84 0.72 + 0 1400 601.92 24994.79 76.57 0.77 + 0 1600 662.57 22268.02 80.20 0.80 + 0 1800 1101.50 28413.77 82.56 0.83 + 0 2000 1253.43 28736.36 85.00 0.85 + 0 2200 1411.02 28237.53 87.42 0.87 + 0 2400 1605.35 28439.95 88.70 0.89 ``` Note that the cumulative loss keeps increasing within one epoch, but should @@ -559,6 +564,12 @@ start decreasing across epochs. +| Name | Description | +| ---------------- | --------------------------------------------------------------------- | +| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ | +| `console_output` | Whether the logger should print the logs on the console. ~~bool~~ | +| `output_file` | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ | + ## Readers {#readers} ### File readers {#file-readers source="github.com/explosion/srsly" new="3"} From 5ae63b1fbd549fdfc0f7399c0b9656d4a6681544 Mon Sep 17 00:00:00 2001 From: "Patrick J. Burns" Date: Tue, 30 Aug 2022 08:04:54 -0400 Subject: [PATCH 4/6] Add Latin language support (#11349) * Add lang folder for la (Latin) * Add Latin lang classes * Add minimal tokenizer exceptions * Add minimal stopwords * Add minimal lex_attrs * Update stopwords, tokenizer exceptions * Add la tests; register la_tokenizer in conftest.py * Update spacy/lang/la/lex_attrs.py Remove duplicate form in Latin lex_attrs Co-authored-by: Sofie Van Landeghem * Update natto-py version spec (#11222) * Update natto-py version spec * Update setup.cfg Co-authored-by: Adriane Boyd Co-authored-by: Adriane Boyd * Add scorer to textcat API docs config settings (#11263) * Update docs for pipeline initialize() methods (#11221) * Update documentation for dependency parser * Update documentation for trainable_lemmatizer * Update documentation for entity_linker * Update documentation for ner * Update documentation for morphologizer * Update documentation for senter * Update documentation for spancat * Update documentation for tagger * Update documentation for textcat * Update documentation for tok2vec * Run prettier on edited files * Apply similar changes in transformer docs * Remove need to say annotated example explicitly I removed the need to say "Must contain at least one annotated Example" because it's often a given that Examples will contain some gold-standard annotation. * Run prettier on transformer docs * chore: add 'concepCy' to spacy universe (#11255) * chore: add 'concepCy' to spacy universe * docs: add 'slogan' to concepCy * Support full prerelease versions in the compat table (#11228) * Support full prerelease versions in the compat table * Fix types * adding spans to doc_annotation in Example.to_dict (#11261) * adding spans to doc_annotation in Example.to_dict * to_dict compatible with from_dict: tuples instead of spans * use strings for label and kb_id * Simplify test * Update data formats docs Co-authored-by: Stefanie Wolf Co-authored-by: Adriane Boyd * Fix regex invalid escape sequences (#11276) * Add W605 to the errors raised by flake8 in the CI (#11283) * Clean up automated label-based issue handling (#11284) * Clean up automated label-based issue handline 1. upgrade tiangolo/issue-manager to latest 2. move needs-more-info to tiangolo 3. change needs-more-info close time to 7 days 4. delete old needs-more-info config * Use old, longer message * Fix label name * Fix Dutch noun chunks to skip overlapping spans (#11275) * Add test for overlapping noun chunks * Skip overlapping noun chunks * Update spacy/tests/lang/nl/test_noun_chunks.py Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem * Docs: displaCy documentation - data types, `parse_{deps,ents,spans}`, spans example (#10950) * add in spans example and parse references * rm autoformatter * rm extra ents copy * TypedDict draft * type fixes * restore non-documentation files * docs update * fix spans example * fix hyperlinks * add parse example * example fix + argument fix * fix api arg in docs * fix bad variable replacement * fix spacing in style Co-authored-by: Sofie Van Landeghem * fix spacing on table * fix spacing on table * rm temp files Co-authored-by: Sofie Van Landeghem * include span_ruler for default warning filter (#11333) * Add uk pipelines to website (#11332) * Check for . in factory names (#11336) * Make fixes for PR #11349 * Fix roman numeral coverage in #11349 Co-authored-by: Patrick J. Burns Co-authored-by: Sofie Van Landeghem Co-authored-by: Paul O'Leary McCann Co-authored-by: Adriane Boyd Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Co-authored-by: Jules Belveze <32683010+JulesBelveze@users.noreply.github.com> Co-authored-by: stefawolf Co-authored-by: Stefanie Wolf Co-authored-by: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> --- spacy/lang/la/__init__.py | 18 +++++++++++++ spacy/lang/la/lex_attrs.py | 32 +++++++++++++++++++++++ spacy/lang/la/stop_words.py | 37 +++++++++++++++++++++++++++ spacy/lang/la/tokenizer_exceptions.py | 30 ++++++++++++++++++++++ spacy/tests/conftest.py | 5 ++++ spacy/tests/lang/la/__init__.py | 0 spacy/tests/lang/la/test_exception.py | 7 +++++ spacy/tests/lang/la/test_text.py | 33 ++++++++++++++++++++++++ website/docs/api/top-level.md | 2 +- 9 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 spacy/lang/la/__init__.py create mode 100644 spacy/lang/la/lex_attrs.py create mode 100644 spacy/lang/la/stop_words.py create mode 100644 spacy/lang/la/tokenizer_exceptions.py create mode 100644 spacy/tests/lang/la/__init__.py create mode 100644 spacy/tests/lang/la/test_exception.py create mode 100644 spacy/tests/lang/la/test_text.py diff --git a/spacy/lang/la/__init__.py b/spacy/lang/la/__init__.py new file mode 100644 index 000000000..5f2cccee3 --- /dev/null +++ b/spacy/lang/la/__init__.py @@ -0,0 +1,18 @@ +from ...language import Language, BaseDefaults +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS + + +class LatinDefaults(BaseDefaults): + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS + + +class Latin(Language): + lang = "la" + Defaults = LatinDefaults + + +__all__ = ["Latin"] diff --git a/spacy/lang/la/lex_attrs.py b/spacy/lang/la/lex_attrs.py new file mode 100644 index 000000000..9348a811a --- /dev/null +++ b/spacy/lang/la/lex_attrs.py @@ -0,0 +1,32 @@ +from ...attrs import LIKE_NUM +import re + +# cf. Goyvaerts/Levithan 2009; case-insensitive, allow 4 +roman_numerals_compile = re.compile(r'(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$') + +_num_words = set( + """ +unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem +""".split() +) + +_ordinal_words = set( + """ +primus prima primum secundus secunda secundum tertius tertia tertium +""".split() +) + + +def like_num(text): + if text.isdigit(): + return True + if roman_numerals_compile.match(text): + return True + if text.lower() in _num_words: + return True + if text.lower() in _ordinal_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/la/stop_words.py b/spacy/lang/la/stop_words.py new file mode 100644 index 000000000..8b590bb67 --- /dev/null +++ b/spacy/lang/la/stop_words.py @@ -0,0 +1,37 @@ +# Corrected Perseus list, cf. https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin + +STOP_WORDS = set( + """ +ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem + +cum cur + +de deinde dum + +ego enim ergo es est et etiam etsi ex + +fio + +haud hic + +iam idem igitur ille in infra inter interim ipse is ita + +magis modo mox + +nam ne nec necque neque nisi non nos + +o ob + +per possum post pro + +quae quam quare qui quia quicumque quidem quilibet quis quisnam quisquam quisque quisquis quo quoniam + +sed si sic sive sub sui sum super suus + +tam tamen trans tu tum + +ubi uel uero + +vel vero +""".split() +) diff --git a/spacy/lang/la/tokenizer_exceptions.py b/spacy/lang/la/tokenizer_exceptions.py new file mode 100644 index 000000000..905304188 --- /dev/null +++ b/spacy/lang/la/tokenizer_exceptions.py @@ -0,0 +1,30 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...symbols import ORTH +from ...util import update_exc + + +## TODO: Look into systematically handling u/v +_exc = { + "mecum": [{ORTH: "me"}, {ORTH: "cum"}], + "tecum": [{ORTH: "te"}, {ORTH: "cum"}], + "nobiscum": [{ORTH: "nobis"}, {ORTH: "cum"}], + "vobiscum": [{ORTH: "vobis"}, {ORTH: "cum"}], + "uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}], +} + +for orth in [ + + 'A.', 'Agr.', 'Ap.', 'C.', 'Cn.', 'D.', 'F.', 'K.', 'L.', "M'.", 'M.', 'Mam.', 'N.', 'Oct.', + 'Opet.', 'P.', 'Paul.', 'Post.', 'Pro.', 'Q.', 'S.', 'Ser.', 'Sert.', 'Sex.', 'St.', 'Sta.', + 'T.', 'Ti.', 'V.', 'Vol.', 'Vop.', 'U.', 'Uol.', 'Uop.', + + 'Ian.', 'Febr.', 'Mart.', 'Apr.', 'Mai.', 'Iun.', 'Iul.', 'Aug.', 'Sept.', 'Oct.', 'Nov.', 'Nou.', + 'Dec.', + + 'Non.', 'Id.', 'A.D.', + + 'Coll.', 'Cos.', 'Ord.', 'Pl.', 'S.C.', 'Suff.', 'Trib.', +]: + _exc[orth] = [{ORTH: orth}] + +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 5193bd301..0395ba7ca 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -256,6 +256,11 @@ def ko_tokenizer_tokenizer(): return nlp.tokenizer +@pytest.fixture(scope="module") +def la_tokenizer(): + return get_lang_class("la")().tokenizer + + @pytest.fixture(scope="session") def lb_tokenizer(): return get_lang_class("lb")().tokenizer diff --git a/spacy/tests/lang/la/__init__.py b/spacy/tests/lang/la/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/la/test_exception.py b/spacy/tests/lang/la/test_exception.py new file mode 100644 index 000000000..04bc1d489 --- /dev/null +++ b/spacy/tests/lang/la/test_exception.py @@ -0,0 +1,7 @@ +import pytest + +def test_la_tokenizer_handles_exc_in_text(la_tokenizer): + text = "scio te omnia facturum, ut nobiscum quam primum sis" + tokens = la_tokenizer(text) + assert len(tokens) == 11 + assert tokens[6].text == "nobis" diff --git a/spacy/tests/lang/la/test_text.py b/spacy/tests/lang/la/test_text.py new file mode 100644 index 000000000..11676b92b --- /dev/null +++ b/spacy/tests/lang/la/test_text.py @@ -0,0 +1,33 @@ +import pytest +from spacy.lang.la.lex_attrs import like_num + +@pytest.mark.parametrize( + "text,match", + [ + ("IIII", True), + ("VI", True), + ("vi", True), + ("IV", True), + ("iv", True), + ("IX", True), + ("ix", True), + ("MMXXII", True), + ("0", True), + ("1", True), + ("quattuor", True), + ("decem", True), + ("tertius", True), + ("canis", False), + ("MMXX11", False), + (",", False), + ], +) +def test_lex_attrs_like_number(la_tokenizer, text, match): + tokens = la_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + +@pytest.mark.parametrize("word", ["quinque"]) +def test_la_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index c3dc42f1a..724f2775e 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -451,7 +451,7 @@ factories. | Registry name | Description | | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. | -| `augmenters` | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. | +| `augmenters` | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. | | `batchers` | Registry for training and evaluation [data batchers](#batchers). | | `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. | | `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). | From 3f4b4b7b4fa2df6c5d888cdc97efb71093d3fb6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 30 Aug 2022 14:21:02 +0200 Subject: [PATCH 5/6] Fix `test_{prefer,require}_gpu` (#11390) * Fix `test_{prefer,require}_gpu` These tests assumed that GPUs are only supported with CuPy, but since Thinc 8.1 we also support Metal Performance Shaders. * test_misc: arrange thinc imports to be together --- spacy/tests/test_misc.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index d8743d322..1c9b045ac 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -10,7 +10,8 @@ from spacy.ml._precomputable_affine import _backprop_precomputable_affine_paddin from spacy.util import dot_to_object, SimpleFrozenList, import_file from spacy.util import to_ternary_int from thinc.api import Config, Optimizer, ConfigValidationError -from thinc.api import set_current_ops +from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps +from thinc.compat import has_cupy_gpu, has_torch_mps_gpu from spacy.training.batchers import minibatch_by_words from spacy.lang.en import English from spacy.lang.nl import Dutch @@ -18,7 +19,6 @@ from spacy.language import DEFAULT_CONFIG_PATH from spacy.schemas import ConfigSchemaTraining, TokenPattern, TokenPatternSchema from pydantic import ValidationError -from thinc.api import get_current_ops, NumpyOps, CupyOps from .util import get_random_doc, make_tempdir @@ -111,26 +111,25 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): def test_prefer_gpu(): current_ops = get_current_ops() - try: - import cupy # noqa: F401 - - prefer_gpu() + if has_cupy_gpu: + assert prefer_gpu() assert isinstance(get_current_ops(), CupyOps) - except ImportError: + elif has_torch_mps_gpu: + assert prefer_gpu() + assert isinstance(get_current_ops(), MPSOps) + else: assert not prefer_gpu() set_current_ops(current_ops) def test_require_gpu(): current_ops = get_current_ops() - try: - import cupy # noqa: F401 - + if has_cupy_gpu: require_gpu() assert isinstance(get_current_ops(), CupyOps) - except ImportError: - with pytest.raises(ValueError): - require_gpu() + elif has_torch_mps_gpu: + require_gpu() + assert isinstance(get_current_ops(), MPSOps) set_current_ops(current_ops) From 8fc0efc502da2f02076575e0887cb585d0e0f391 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 31 Aug 2022 09:02:34 +0200 Subject: [PATCH 6/6] Allow string argument for disable/enable/exclude (#11406) * adding unit test for spacy.load with disable/exclude string arg * allow pure strings in from_config * update docs * upstream type adjustements * docs update * make docstring more consistent * Update spacy/language.py Co-authored-by: Adriane Boyd * two more cleanups * fix type in internal method Co-authored-by: Adriane Boyd --- spacy/__init__.py | 12 ++--- spacy/language.py | 32 +++++++----- spacy/tests/pipeline/test_pipe_methods.py | 11 +++++ spacy/util.py | 60 +++++++++++------------ website/docs/api/language.md | 27 +++++----- website/docs/api/top-level.md | 58 +++++++++++----------- 6 files changed, 112 insertions(+), 88 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 069215fda..d60f46b96 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -31,21 +31,21 @@ def load( name: Union[str, Path], *, vocab: Union[Vocab, bool] = True, - disable: Iterable[str] = util.SimpleFrozenList(), - enable: Iterable[str] = util.SimpleFrozenList(), - exclude: Iterable[str] = util.SimpleFrozenList(), + disable: Union[str, Iterable[str]] = util.SimpleFrozenList(), + enable: Union[str, Iterable[str]] = util.SimpleFrozenList(), + exclude: Union[str, Iterable[str]] = util.SimpleFrozenList(), config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), ) -> Language: """Load a spaCy model from an installed package or a local path. name (str): Package name or model path. vocab (Vocab): A Vocab object. If True, a vocab is created. - disable (Iterable[str]): Names of pipeline components to disable. Disabled + disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling nlp.enable_pipe. - enable (Iterable[str]): Names of pipeline components to enable. All other + enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other pipes will be disabled (but can be enabled later using nlp.enable_pipe). - exclude (Iterable[str]): Names of pipeline components to exclude. Excluded + exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. diff --git a/spacy/language.py b/spacy/language.py index e89ae142b..ec330753c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1063,7 +1063,7 @@ class Language: """ if enable is None and disable is None: raise ValueError(Errors.E991) - if disable is not None and isinstance(disable, str): + if isinstance(disable, str): disable = [disable] if enable is not None: if isinstance(enable, str): @@ -1698,9 +1698,9 @@ class Language: config: Union[Dict[str, Any], Config] = {}, *, vocab: Union[Vocab, bool] = True, - disable: Iterable[str] = SimpleFrozenList(), - enable: Iterable[str] = SimpleFrozenList(), - exclude: Iterable[str] = SimpleFrozenList(), + disable: Union[str, Iterable[str]] = SimpleFrozenList(), + enable: Union[str, Iterable[str]] = SimpleFrozenList(), + exclude: Union[str, Iterable[str]] = SimpleFrozenList(), meta: Dict[str, Any] = SimpleFrozenDict(), auto_fill: bool = True, validate: bool = True, @@ -1711,12 +1711,12 @@ class Language: config (Dict[str, Any] / Config): The loaded config. vocab (Vocab): A Vocab object. If True, a vocab is created. - disable (Iterable[str]): Names of pipeline components to disable. + disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling nlp.enable_pipe. - enable (Iterable[str]): Names of pipeline components to enable. All other + enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other pipes will be disabled (and can be enabled using `nlp.enable_pipe`). - exclude (Iterable[str]): Names of pipeline components to exclude. + exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded components won't be loaded. meta (Dict[str, Any]): Meta overrides for nlp.meta. auto_fill (bool): Automatically fill in missing values in config based @@ -1727,6 +1727,12 @@ class Language: DOCS: https://spacy.io/api/language#from_config """ + if isinstance(disable, str): + disable = [disable] + if isinstance(enable, str): + enable = [enable] + if isinstance(exclude, str): + exclude = [exclude] if auto_fill: config = Config( cls.default_config, section_order=CONFIG_SECTION_ORDER @@ -2031,25 +2037,29 @@ class Language: @staticmethod def _resolve_component_status( - disable: Iterable[str], enable: Iterable[str], pipe_names: Collection[str] + disable: Union[str, Iterable[str]], + enable: Union[str, Iterable[str]], + pipe_names: Iterable[str], ) -> Tuple[str, ...]: """Derives whether (1) `disable` and `enable` values are consistent and (2) resolves those to a single set of disabled components. Raises an error in case of inconsistency. - disable (Iterable[str]): Names of components or serialization fields to disable. - enable (Iterable[str]): Names of pipeline components to enable. + disable (Union[str, Iterable[str]]): Name(s) of component(s) or serialization fields to disable. + enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. pipe_names (Iterable[str]): Names of all pipeline components. RETURNS (Tuple[str, ...]): Names of components to exclude from pipeline w.r.t. specified includes and excludes. """ - if disable is not None and isinstance(disable, str): + if isinstance(disable, str): disable = [disable] to_disable = disable if enable: + if isinstance(enable, str): + enable = [enable] to_disable = [ pipe_name for pipe_name in pipe_names if pipe_name not in enable ] diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 6f00a1cd9..b946061f6 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -618,6 +618,7 @@ def test_load_disable_enable() -> None: base_nlp.to_disk(tmp_dir) to_disable = ["parser", "tagger"] to_enable = ["tagger", "parser"] + single_str = "tagger" # Setting only `disable`. nlp = spacy.load(tmp_dir, disable=to_disable) @@ -632,6 +633,16 @@ def test_load_disable_enable() -> None: ] ) + # Loading with a string representing one component + nlp = spacy.load(tmp_dir, exclude=single_str) + assert single_str not in nlp.component_names + + nlp = spacy.load(tmp_dir, disable=single_str) + assert single_str in nlp.component_names + assert single_str not in nlp.pipe_names + assert nlp._disabled == {single_str} + assert nlp.disabled == [single_str] + # Testing consistent enable/disable combination. nlp = spacy.load( tmp_dir, diff --git a/spacy/util.py b/spacy/util.py index d170fc15b..4e1a62d05 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -398,9 +398,9 @@ def load_model( name: Union[str, Path], *, vocab: Union["Vocab", bool] = True, - disable: Iterable[str] = SimpleFrozenList(), - enable: Iterable[str] = SimpleFrozenList(), - exclude: Iterable[str] = SimpleFrozenList(), + disable: Union[str, Iterable[str]] = SimpleFrozenList(), + enable: Union[str, Iterable[str]] = SimpleFrozenList(), + exclude: Union[str, Iterable[str]] = SimpleFrozenList(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Load a model from a package or data path. @@ -408,9 +408,9 @@ def load_model( name (str): Package name or model path. vocab (Vocab / True): Optional vocab to pass in on initialization. If True, a new Vocab object will be created. - disable (Iterable[str]): Names of pipeline components to disable. - enable (Iterable[str]): Names of pipeline components to enable. All others will be disabled. - exclude (Iterable[str]): Names of pipeline components to exclude. + disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. + enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All others will be disabled. + exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. RETURNS (Language): The loaded nlp object. @@ -440,9 +440,9 @@ def load_model_from_package( name: str, *, vocab: Union["Vocab", bool] = True, - disable: Iterable[str] = SimpleFrozenList(), - enable: Iterable[str] = SimpleFrozenList(), - exclude: Iterable[str] = SimpleFrozenList(), + disable: Union[str, Iterable[str]] = SimpleFrozenList(), + enable: Union[str, Iterable[str]] = SimpleFrozenList(), + exclude: Union[str, Iterable[str]] = SimpleFrozenList(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Load a model from an installed package. @@ -450,12 +450,12 @@ def load_model_from_package( name (str): The package name. vocab (Vocab / True): Optional vocab to pass in on initialization. If True, a new Vocab object will be created. - disable (Iterable[str]): Names of pipeline components to disable. Disabled + disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling nlp.enable_pipe. - enable (Iterable[str]): Names of pipeline components to enable. All other + enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other pipes will be disabled (and can be enabled using `nlp.enable_pipe`). - exclude (Iterable[str]): Names of pipeline components to exclude. Excluded + exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. @@ -470,9 +470,9 @@ def load_model_from_path( *, meta: Optional[Dict[str, Any]] = None, vocab: Union["Vocab", bool] = True, - disable: Iterable[str] = SimpleFrozenList(), - enable: Iterable[str] = SimpleFrozenList(), - exclude: Iterable[str] = SimpleFrozenList(), + disable: Union[str, Iterable[str]] = SimpleFrozenList(), + enable: Union[str, Iterable[str]] = SimpleFrozenList(), + exclude: Union[str, Iterable[str]] = SimpleFrozenList(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Load a model from a data directory path. Creates Language class with @@ -482,12 +482,12 @@ def load_model_from_path( meta (Dict[str, Any]): Optional model meta. vocab (Vocab / True): Optional vocab to pass in on initialization. If True, a new Vocab object will be created. - disable (Iterable[str]): Names of pipeline components to disable. Disabled + disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling nlp.enable_pipe. - enable (Iterable[str]): Names of pipeline components to enable. All other + enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other pipes will be disabled (and can be enabled using `nlp.enable_pipe`). - exclude (Iterable[str]): Names of pipeline components to exclude. Excluded + exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. @@ -516,9 +516,9 @@ def load_model_from_config( *, meta: Dict[str, Any] = SimpleFrozenDict(), vocab: Union["Vocab", bool] = True, - disable: Iterable[str] = SimpleFrozenList(), - enable: Iterable[str] = SimpleFrozenList(), - exclude: Iterable[str] = SimpleFrozenList(), + disable: Union[str, Iterable[str]] = SimpleFrozenList(), + enable: Union[str, Iterable[str]] = SimpleFrozenList(), + exclude: Union[str, Iterable[str]] = SimpleFrozenList(), auto_fill: bool = False, validate: bool = True, ) -> "Language": @@ -529,12 +529,12 @@ def load_model_from_config( meta (Dict[str, Any]): Optional model meta. vocab (Vocab / True): Optional vocab to pass in on initialization. If True, a new Vocab object will be created. - disable (Iterable[str]): Names of pipeline components to disable. Disabled + disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling nlp.enable_pipe. - enable (Iterable[str]): Names of pipeline components to enable. All other + enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other pipes will be disabled (and can be enabled using `nlp.enable_pipe`). - exclude (Iterable[str]): Names of pipeline components to exclude. Excluded + exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded components won't be loaded. auto_fill (bool): Whether to auto-fill config with missing defaults. validate (bool): Whether to show config validation errors. @@ -616,9 +616,9 @@ def load_model_from_init_py( init_file: Union[Path, str], *, vocab: Union["Vocab", bool] = True, - disable: Iterable[str] = SimpleFrozenList(), - enable: Iterable[str] = SimpleFrozenList(), - exclude: Iterable[str] = SimpleFrozenList(), + disable: Union[str, Iterable[str]] = SimpleFrozenList(), + enable: Union[str, Iterable[str]] = SimpleFrozenList(), + exclude: Union[str, Iterable[str]] = SimpleFrozenList(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Helper function to use in the `load()` method of a model package's @@ -626,12 +626,12 @@ def load_model_from_init_py( vocab (Vocab / True): Optional vocab to pass in on initialization. If True, a new Vocab object will be created. - disable (Iterable[str]): Names of pipeline components to disable. Disabled + disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling nlp.enable_pipe. - enable (Iterable[str]): Names of pipeline components to enable. All other + enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other pipes will be disabled (and can be enabled using `nlp.enable_pipe`). - exclude (Iterable[str]): Names of pipeline components to exclude. Excluded + exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 9a413efaf..ed763e36a 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -63,17 +63,18 @@ spaCy loads a model under the hood based on its > nlp = Language.from_config(config) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ | -| _keyword-only_ | | -| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ | -| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ | -| `exclude` | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | -| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ | -| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ | -| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | -| **RETURNS** | The initialized object. ~~Language~~ | +| Name | Description | +| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ | +| _keyword-only_ | | +| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ | +| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | +| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | +| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | +| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ | +| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ | +| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | +| **RETURNS** | The initialized object. ~~Language~~ | ## Language.component {#component tag="classmethod" new="3"} @@ -695,8 +696,8 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------ | | _keyword-only_ | | -| `disable` | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~ | -| `enable` | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ | +| `disable` | Name(s) of pipeline component(s) to disable. ~~Optional[Union[str, Iterable[str]]]~~ | +| `enable` | Name(s) of pipeline component(s) that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ | | **RETURNS** | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ | ## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 724f2775e..220b2d6e9 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -45,16 +45,16 @@ specified separately using the new `exclude` keyword argument. > nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"]) > ``` -| Name | Description | -| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ | -| _keyword-only_ | | -| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | -| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ | -| `enable` | Names of pipeline components to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~List[str]~~ | -| `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | -| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | -| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ | +| Name | Description | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | +| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | +| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ | +| `exclude` 3 | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | +| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | +| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ | Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's [`config.cfg`](/api/data-formats#config), uses the language and pipeline @@ -1049,15 +1049,16 @@ and create a `Language` object. The model data will then be loaded in via > nlp = util.load_model("/path/to/data") > ``` -| Name | Description | -| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `name` | Package name or path. ~~str~~ | -| _keyword-only_ | | -| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | -| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ | -| `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | -| `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | -| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ | +| Name | Description | +| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Package name or path. ~~str~~ | +| _keyword-only_ | | +| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | +| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | +| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | +| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | +| `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | +| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ | ### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"} @@ -1073,15 +1074,16 @@ A helper function to use in the `load()` method of a pipeline package's > return load_model_from_init_py(__file__, **overrides) > ``` -| Name | Description | -| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `init_file` | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ | -| _keyword-only_ | | -| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | -| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ | -| `exclude` 3 | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ | -| `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | -| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ | +| Name | Description | +| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `init_file` | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | +| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | +| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | +| `exclude` 3 | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | +| `config` 3 | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | +| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ | ### util.load_config {#util.load_config tag="function" new="3"}