diff --git a/.github/contributors/dardoria.md b/.github/contributors/dardoria.md new file mode 100644 index 000000000..0c7202fca --- /dev/null +++ b/.github/contributors/dardoria.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. 
The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Boian Tzonev | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 18.02.2021 | +| GitHub username | dardoria | +| Website (optional) | | diff --git a/requirements.txt b/requirements.txt index 85fc6a62c..01a3be120 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ wasabi>=0.8.1,<1.1.0 srsly>=2.4.0,<3.0.0 catalogue>=2.0.1,<2.1.0 typer>=0.3.0,<0.4.0 -pathy +pathy>=0.3.5 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 @@ -21,11 +21,11 @@ jinja2 setuptools packaging>=20.0 importlib_metadata>=0.20; python_version < "3.8" -typing_extensions>=3.7.4; python_version < "3.8" +typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8" # Development dependencies cython>=0.25 pytest>=5.2.0 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.5.0,<3.6.0 -hypothesis +hypothesis>=3.27.0,<7.0.0 diff --git a/setup.cfg b/setup.cfg index 6f8572381..482c1fbdd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = srsly>=2.4.0,<3.0.0 catalogue>=2.0.1,<2.1.0 typer>=0.3.0,<0.4.0 - pathy + pathy>=0.3.5 # Third-party dependencies tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 @@ -58,7 +58,7 @@ install_requires = setuptools packaging>=20.0 importlib_metadata>=0.20; python_version < "3.8" - typing_extensions>=3.7.4; python_version < "3.8" + typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8" [options.entry_points] console_scripts = diff --git a/setup.py b/setup.py index df21202fa..fcc124a43 100755 --- a/setup.py +++ b/setup.py @@ -204,7 +204,7 @@ def setup_package(): for name in MOD_NAMES: mod_path = name.replace(".", "/") + ".pyx" ext = Extension( - name, [mod_path], language="c++", extra_compile_args=["-std=c++11"] + name, [mod_path], language="c++", include_dirs=include_dirs, extra_compile_args=["-std=c++11"] ) ext_modules.append(ext) print("Cythonizing sources") @@ -216,7 +216,6 @@ def setup_package(): version=about["__version__"], 
ext_modules=ext_modules, cmdclass={"build_ext": build_ext_subclass}, - include_dirs=include_dirs, package_data={"": ["*.pyx", "*.pxd", "*.pxi"]}, ) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 86b3ab356..228cc622a 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -11,6 +11,7 @@ from click.parser import split_arg_string from typer.main import get_command from contextlib import contextmanager from thinc.api import Config, ConfigValidationError, require_gpu +from thinc.util import has_cupy, gpu_is_available from configparser import InterpolationError import os @@ -510,3 +511,5 @@ def setup_gpu(use_gpu: int) -> None: require_gpu(use_gpu) else: msg.info("Using CPU") + if has_cupy and gpu_is_available(): + msg.info("To switch to GPU 0, use the option: --gpu-id 0") diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 1eacd6399..d13a4fc80 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -22,7 +22,7 @@ from ..training.converters import conllu_to_docs CONVERTERS = { "conllubio": conllu_to_docs, "conllu": conllu_to_docs, - "conll": conllu_to_docs, + "conll": conll_ner_to_docs, "ner": conll_ner_to_docs, "iob": iob_to_docs, "json": json_to_docs, diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 02f9b6528..368af8d49 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -132,7 +132,7 @@ def evaluate( if displacy_path: factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] - docs = [ex.predicted for ex in dev_dataset] + docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit])) render_deps = "parser" in factory_names render_ents = "ner" in factory_names render_parses( diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 89c8ec2d4..38fc23272 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -16,7 +16,11 @@ gpu_allocator = null [nlp] lang = "{{ lang }}" +{%- if "tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or (("textcat" in components or "textcat_multilabel" in components) and optimize == "accuracy") -%} {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %} +{%- else -%} +{%- set full_pipeline = components %} +{%- endif %} pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }} batch_size = {{ 128 if hardware == "gpu" else 1000 }} diff --git a/spacy/cli/templates/quickstart_training_recommendations.yml b/spacy/cli/templates/quickstart_training_recommendations.yml index e0b591eb5..dac3a26c1 100644 --- a/spacy/cli/templates/quickstart_training_recommendations.yml +++ b/spacy/cli/templates/quickstart_training_recommendations.yml @@ -22,21 +22,21 @@ ar: bg: word_vectors: null transformer: - efficiency: - name: iarfmoose/roberta-base-bulgarian - size_factor: 3 - accuracy: - name: iarfmoose/roberta-base-bulgarian - size_factor: 3 + efficiency: + name: iarfmoose/roberta-base-bulgarian + size_factor: 3 + accuracy: + name: iarfmoose/roberta-base-bulgarian + size_factor: 3 bn: word_vectors: null transformer: - efficiency: - name: sagorsarker/bangla-bert-base - size_factor: 3 - accuracy: - name: sagorsarker/bangla-bert-base - size_factor: 3 + efficiency: + name: sagorsarker/bangla-bert-base + size_factor: 3 + accuracy: + name: sagorsarker/bangla-bert-base + size_factor: 3 da: word_vectors: da_core_news_lg transformer: diff --git a/spacy/errors.py 
b/spacy/errors.py index 79a15fbaa..2ebc49e8c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -321,7 +321,8 @@ class Errors: "https://spacy.io/api/top-level#util.filter_spans") E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A " "token can only be part of one entity, so make sure the entities " - "you're setting don't overlap.") + "you're setting don't overlap. To work with overlapping entities, " + "consider using doc.spans instead.") E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore " "settings: {opts}") E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}") @@ -486,6 +487,15 @@ class Errors: E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.") # New errors added in v3.x + + E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to " + "a list of spans, with each span represented by a tuple (start_char, end_char). " + "The tuple can be optionally extended with a label and a KB ID.") + E880 = ("The 'wandb' library could not be found - did you install it? " + "Alternatively, specify the 'ConsoleLogger' in the 'training.logger' " + "config section, instead of the 'WandbLogger'.") + E885 = ("entity_linker.set_kb received an invalid 'kb_loader' argument: expected " + "a callable function, but got: {arg_type}") E886 = ("Can't replace {name} -> {tok2vec} listeners: path '{path}' not " "found in config for component '{name}'.") E887 = ("Can't replace {name} -> {tok2vec} listeners: the paths to replace " diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py index a30f49ce7..6fa539a28 100644 --- a/spacy/lang/bg/__init__.py +++ b/spacy/lang/bg/__init__.py @@ -1,9 +1,21 @@ from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .lex_attrs import LEX_ATTRS +from ..tokenizer_exceptions import BASE_EXCEPTIONS + from ...language import Language +from ...attrs import LANG +from ...util import update_exc class BulgarianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "bg" + + lex_attr_getters.update(LEX_ATTRS) + stop_words = STOP_WORDS + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) class Bulgarian(Language): diff --git a/spacy/lang/bg/lex_attrs.py b/spacy/lang/bg/lex_attrs.py new file mode 100644 index 000000000..62b69d6cc --- /dev/null +++ b/spacy/lang/bg/lex_attrs.py @@ -0,0 +1,88 @@ +from ...attrs import LIKE_NUM + + +_num_words = [ + "нула", + "едно", + "един", + "една", + "две", + "три", + "четири", + "пет", + "шест", + "седем", + "осем", + "девет", + "десет", + "единадесет", + "единайсет", + "дванадесет", + "дванайсет", + "тринадесет", + "тринайсет", + "четиринадесет", + "четиринайсет", + "петнадесет", + "петнайсет", + "шестнадесет", + "шестнайсет", + "седемнадесет", + "седемнайсет", + "осемнадесет", + "осемнайсет", + "деветнадесет", + "деветнайсет", + "двадесет", + "двайсет", + "тридесет", + "трийсет", + "четиридесет", + "четиресет", + "петдесет", + "шестдесет", + "шейсет", + "седемдесет", + "осемдесет", + "деветдесет", + "сто", + "двеста", + "триста", + "четиристотин", + "петстотин", + "шестстотин", + "седемстотин", + "осемстотин", + "деветстотин", + "хиляда", + "милион", + "милиона", + "милиард", + "милиарда", + "трилион", + "трилиона", + "билион", + "билиона", + "квадрилион", + "квадрилиона", + "квинтилион", + "квинтилиона", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = 
text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text.lower() in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/bg/tokenizer_exceptions.py b/spacy/lang/bg/tokenizer_exceptions.py new file mode 100644 index 000000000..defa00ef7 --- /dev/null +++ b/spacy/lang/bg/tokenizer_exceptions.py @@ -0,0 +1,68 @@ +from ...symbols import ORTH, NORM + + +_exc = {} + + +_abbr_exc = [ + {ORTH: "м", NORM: "метър"}, + {ORTH: "мм", NORM: "милиметър"}, + {ORTH: "см", NORM: "сантиметър"}, + {ORTH: "дм", NORM: "дециметър"}, + {ORTH: "км", NORM: "километър"}, + {ORTH: "кг", NORM: "килограм"}, + {ORTH: "мг", NORM: "милиграм"}, + {ORTH: "г", NORM: "грам"}, + {ORTH: "т", NORM: "тон"}, + {ORTH: "хл", NORM: "хектолиър"}, + {ORTH: "дкл", NORM: "декалитър"}, + {ORTH: "л", NORM: "литър"}, +] +for abbr in _abbr_exc: + _exc[abbr[ORTH]] = [abbr] + +_abbr_line_exc = [ + {ORTH: "г-жа", NORM: "госпожа"}, + {ORTH: "г-н", NORM: "господин"}, + {ORTH: "г-ца", NORM: "госпожица"}, + {ORTH: "д-р", NORM: "доктор"}, + {ORTH: "о-в", NORM: "остров"}, + {ORTH: "п-в", NORM: "полуостров"}, +] + +for abbr in _abbr_line_exc: + _exc[abbr[ORTH]] = [abbr] + +_abbr_dot_exc = [ + {ORTH: "акад.", NORM: "академик"}, + {ORTH: "ал.", NORM: "алинея"}, + {ORTH: "арх.", NORM: "архитект"}, + {ORTH: "бл.", NORM: "блок"}, + {ORTH: "бр.", NORM: "брой"}, + {ORTH: "бул.", NORM: "булевард"}, + {ORTH: "в.", NORM: "век"}, + {ORTH: "г.", NORM: "година"}, + {ORTH: "гр.", NORM: "град"}, + {ORTH: "ж.р.", NORM: "женски род"}, + {ORTH: "инж.", NORM: "инженер"}, + {ORTH: "лв.", NORM: "лев"}, + {ORTH: "м.р.", NORM: "мъжки род"}, + {ORTH: "мат.", NORM: "математика"}, + {ORTH: "мед.", NORM: "медицина"}, + {ORTH: "пл.", NORM: "площад"}, + {ORTH: "проф.", NORM: "професор"}, + {ORTH: "с.", NORM: "село"}, + {ORTH: "с.р.", NORM: "среден род"}, + {ORTH: "св.", NORM: "свети"}, + {ORTH: "сп.", NORM: "списание"}, + {ORTH: "стр.", NORM: "страница"}, + {ORTH: "ул.", NORM: "улица"}, + {ORTH: "чл.", NORM: "член"}, + +] + +for abbr in _abbr_dot_exc: + _exc[abbr[ORTH]] = [abbr] + + +TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index b7a2fc8e4..e4689815e 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -23,8 +23,6 @@ class RussianLemmatizer(Lemmatizer): mode: str = "pymorphy2", overwrite: bool = False, ) -> None: - super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) - try: from pymorphy2 import MorphAnalyzer except ImportError: @@ -34,6 +32,7 @@ class RussianLemmatizer(Lemmatizer): ) from None if RussianLemmatizer._morph is None: RussianLemmatizer._morph = MorphAnalyzer() + super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) def pymorphy2_lemmatize(self, token: Token) -> List[str]: string = token.text diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 56d9c75c0..0b4435a21 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -7,6 +7,8 @@ from ...vocab import Vocab class UkrainianLemmatizer(RussianLemmatizer): + _morph = None + def __init__( self, vocab: Vocab, @@ -16,7 +18,6 @@ class UkrainianLemmatizer(RussianLemmatizer): mode: str = "pymorphy2", overwrite: bool = False, ) -> None: - super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) try: from pymorphy2 import MorphAnalyzer except ImportError: @@ -27,3 +28,4 @@ 
class UkrainianLemmatizer(RussianLemmatizer): ) from None if UkrainianLemmatizer._morph is None: UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk") + super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) diff --git a/spacy/language.py b/spacy/language.py index 5bd1545d2..2a9b50bcc 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -684,12 +684,12 @@ class Language: # TODO: handle errors and mismatches (vectors etc.) if not isinstance(source, self.__class__): raise ValueError(Errors.E945.format(name=source_name, source=type(source))) - if not source.has_pipe(source_name): + if not source_name in source.component_names: raise KeyError( Errors.E944.format( name=source_name, model=f"{source.meta['lang']}_{source.meta['name']}", - opts=", ".join(source.pipe_names), + opts=", ".join(source.component_names), ) ) pipe = source.get_pipe(source_name) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index f37203b1b..21e1c53b9 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -8,7 +8,7 @@ from ...kb import KnowledgeBase, Candidate, get_candidates from ...vocab import Vocab -@registry.architectures.register("spacy.EntityLinker.v1") +@registry.architectures("spacy.EntityLinker.v1") def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model: with Model.define_operators({">>": chain, "**": clone}): token_width = tok2vec.get_dim("nO") @@ -25,7 +25,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model: return model -@registry.misc.register("spacy.KBFromFile.v1") +@registry.misc("spacy.KBFromFile.v1") def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]: def kb_from_file(vocab): kb = KnowledgeBase(vocab, entity_vector_length=1) @@ -35,7 +35,7 @@ def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]: return kb_from_file -@registry.misc.register("spacy.EmptyKB.v1") +@registry.misc("spacy.EmptyKB.v1") def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]: def empty_kb_factory(vocab): return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length) @@ -43,6 +43,6 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]: return empty_kb_factory -@registry.misc.register("spacy.CandidateGenerator.v1") +@registry.misc("spacy.CandidateGenerator.v1") def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]: return get_candidates diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 7c0589bff..8aa0f3c2b 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -16,7 +16,7 @@ if TYPE_CHECKING: from ...tokens import Doc # noqa: F401 -@registry.architectures.register("spacy.PretrainVectors.v1") +@registry.architectures("spacy.PretrainVectors.v1") def create_pretrain_vectors( maxout_pieces: int, hidden_size: int, loss: str ) -> Callable[["Vocab", Model], Model]: @@ -40,7 +40,7 @@ def create_pretrain_vectors( return create_vectors_objective -@registry.architectures.register("spacy.PretrainCharacters.v1") +@registry.architectures("spacy.PretrainCharacters.v1") def create_pretrain_characters( maxout_pieces: int, hidden_size: int, n_characters: int ) -> Callable[["Vocab", Model], Model]: diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index da53f562e..861094209 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -10,7 +10,7 @@ from ..tb_framework import TransitionModel from ...tokens import Doc 
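Aside on the registration style used throughout this diff: registrations are switched from `@registry.architectures.register("...")` to calling the registry directly, `@registry.architectures("...")` (and likewise for `registry.misc` and `registry.readers`). Below is a minimal sketch of the new style, using a hypothetical architecture name rather than one of spaCy's built-ins:

```python
from thinc.api import Linear, Model
from spacy.util import registry


# "my_org.DemoLinear.v1" is a made-up name for illustration only.
@registry.architectures("my_org.DemoLinear.v1")
def build_demo_linear(nr_out: int, nr_in: int) -> Model:
    # Return a plain thinc layer; real architectures typically wire a
    # tok2vec embedding into a task-specific head.
    return Linear(nr_out, nr_in)


# The registered function can then be looked up by name, for example
# when a training config is resolved.
assert registry.architectures.get("my_org.DemoLinear.v1") is build_demo_linear
```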
-@registry.architectures.register("spacy.TransitionBasedParser.v1") +@registry.architectures("spacy.TransitionBasedParser.v1") def transition_parser_v1( tok2vec: Model[List[Doc], List[Floats2d]], state_type: Literal["parser", "ner"], @@ -31,7 +31,7 @@ def transition_parser_v1( ) -@registry.architectures.register("spacy.TransitionBasedParser.v2") +@registry.architectures("spacy.TransitionBasedParser.v2") def transition_parser_v2( tok2vec: Model[List[Doc], List[Floats2d]], state_type: Literal["parser", "ner"], diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 09405214c..87944e305 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -6,7 +6,7 @@ from ...util import registry from ...tokens import Doc -@registry.architectures.register("spacy.Tagger.v1") +@registry.architectures("spacy.Tagger.v1") def build_tagger_model( tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None ) -> Model[List[Doc], List[Floats2d]]: diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 0234530e6..a1855c5a0 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -15,7 +15,7 @@ from ...tokens import Doc from .tok2vec import get_tok2vec_width -@registry.architectures.register("spacy.TextCatCNN.v1") +@registry.architectures("spacy.TextCatCNN.v1") def build_simple_cnn_text_classifier( tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None ) -> Model[List[Doc], Floats2d]: @@ -41,7 +41,7 @@ def build_simple_cnn_text_classifier( return model -@registry.architectures.register("spacy.TextCatBOW.v1") +@registry.architectures("spacy.TextCatBOW.v1") def build_bow_text_classifier( exclusive_classes: bool, ngram_size: int, @@ -60,7 +60,7 @@ def build_bow_text_classifier( return model -@registry.architectures.register("spacy.TextCatEnsemble.v2") +@registry.architectures("spacy.TextCatEnsemble.v2") def build_text_classifier_v2( tok2vec: Model[List[Doc], List[Floats2d]], linear_model: Model[List[Doc], Floats2d], @@ -112,7 +112,7 @@ def init_ensemble_textcat(model, X, Y) -> Model: return model -@registry.architectures.register("spacy.TextCatLowData.v1") +@registry.architectures("spacy.TextCatLowData.v1") def build_text_classifier_lowdata( width: int, dropout: Optional[float], nO: Optional[int] = None ) -> Model[List[Doc], Floats2d]: diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index c4bd6b0d7..5790af631 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -14,7 +14,7 @@ from ...pipeline.tok2vec import Tok2VecListener from ...attrs import intify_attr -@registry.architectures.register("spacy.Tok2VecListener.v1") +@registry.architectures("spacy.Tok2VecListener.v1") def tok2vec_listener_v1(width: int, upstream: str = "*"): tok2vec = Tok2VecListener(upstream_name=upstream, width=width) return tok2vec @@ -31,7 +31,7 @@ def get_tok2vec_width(model: Model): return nO -@registry.architectures.register("spacy.HashEmbedCNN.v1") +@registry.architectures("spacy.HashEmbedCNN.v1") def build_hash_embed_cnn_tok2vec( *, width: int, @@ -87,7 +87,7 @@ def build_hash_embed_cnn_tok2vec( ) -@registry.architectures.register("spacy.Tok2Vec.v2") +@registry.architectures("spacy.Tok2Vec.v2") def build_Tok2Vec_model( embed: Model[List[Doc], List[Floats2d]], encode: Model[List[Floats2d], List[Floats2d]], @@ -108,7 +108,7 @@ def build_Tok2Vec_model( return tok2vec -@registry.architectures.register("spacy.MultiHashEmbed.v1") +@registry.architectures("spacy.MultiHashEmbed.v1") def MultiHashEmbed( width: int, 
attrs: List[Union[str, int]], @@ -182,7 +182,7 @@ def MultiHashEmbed( return model -@registry.architectures.register("spacy.CharacterEmbed.v1") +@registry.architectures("spacy.CharacterEmbed.v1") def CharacterEmbed( width: int, rows: int, @@ -255,7 +255,7 @@ def CharacterEmbed( return model -@registry.architectures.register("spacy.MaxoutWindowEncoder.v2") +@registry.architectures("spacy.MaxoutWindowEncoder.v2") def MaxoutWindowEncoder( width: int, window_size: int, maxout_pieces: int, depth: int ) -> Model[List[Floats2d], List[Floats2d]]: @@ -287,7 +287,7 @@ def MaxoutWindowEncoder( return with_array(model, pad=receptive_field) -@registry.architectures.register("spacy.MishWindowEncoder.v2") +@registry.architectures("spacy.MishWindowEncoder.v2") def MishWindowEncoder( width: int, window_size: int, depth: int ) -> Model[List[Floats2d], List[Floats2d]]: @@ -310,7 +310,7 @@ def MishWindowEncoder( return with_array(model) -@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") +@registry.architectures("spacy.TorchBiLSTMEncoder.v1") def BiLSTMEncoder( width: int, depth: int, dropout: float ) -> Model[List[Floats2d], List[Floats2d]]: diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 262bcf677..630057c3f 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -45,6 +45,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] default_config={ "model": DEFAULT_NEL_MODEL, "labels_discard": [], + "n_sents": 0, "incl_prior": True, "incl_context": True, "entity_vector_length": 64, @@ -62,6 +63,7 @@ def make_entity_linker( model: Model, *, labels_discard: Iterable[str], + n_sents: int, incl_prior: bool, incl_context: bool, entity_vector_length: int, @@ -73,6 +75,7 @@ def make_entity_linker( representations. Given a batch of Doc objects, it should return a single array, with one row per item in the batch. labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction. + n_sents (int): The number of neighbouring sentences to take into account. incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. incl_context (bool): Whether or not to include the local context in the model. entity_vector_length (int): Size of encoding vectors in the KB. @@ -84,6 +87,7 @@ def make_entity_linker( model, name, labels_discard=labels_discard, + n_sents=n_sents, incl_prior=incl_prior, incl_context=incl_context, entity_vector_length=entity_vector_length, @@ -106,6 +110,7 @@ class EntityLinker(TrainablePipe): name: str = "entity_linker", *, labels_discard: Iterable[str], + n_sents: int, incl_prior: bool, incl_context: bool, entity_vector_length: int, @@ -118,6 +123,7 @@ class EntityLinker(TrainablePipe): name (str): The component instance name, used to add entries to the losses during training. labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction. + n_sents (int): The number of neighbouring sentences to take into account. incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. incl_context (bool): Whether or not to include the local context in the model. entity_vector_length (int): Size of encoding vectors in the KB. 
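The `entity_linker` changes above and below expose `n_sents`, `incl_prior`, `incl_context` and `labels_discard` as attributes on the pipe rather than as `cfg` entries, and `set_kb` now checks that it receives a callable. A rough usage sketch under those assumptions; the knowledge base here is an empty placeholder:

```python
from spacy.lang.en import English
from spacy.kb import KnowledgeBase

nlp = English()
# n_sents controls how many neighbouring sentences are used as context.
entity_linker = nlp.add_pipe(
    "entity_linker", config={"n_sents": 2, "incl_prior": False}
)


def create_kb(vocab):
    # Placeholder KB; a real pipeline would add entities and aliases here.
    return KnowledgeBase(vocab, entity_vector_length=64)


# set_kb expects a callable that takes the vocab and returns a KnowledgeBase;
# passing anything else now raises the new E885 error.
entity_linker.set_kb(create_kb)
assert entity_linker.n_sents == 2 and entity_linker.incl_prior is False
```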
@@ -129,25 +135,24 @@ class EntityLinker(TrainablePipe): self.vocab = vocab self.model = model self.name = name - cfg = { - "labels_discard": list(labels_discard), - "incl_prior": incl_prior, - "incl_context": incl_context, - "entity_vector_length": entity_vector_length, - } + self.labels_discard = list(labels_discard) + self.n_sents = n_sents + self.incl_prior = incl_prior + self.incl_context = incl_context self.get_candidates = get_candidates - self.cfg = dict(cfg) + self.cfg = {} self.distance = CosineDistance(normalize=False) # how many neightbour sentences to take into account - self.n_sents = cfg.get("n_sents", 0) # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. self.kb = empty_kb(entity_vector_length)(self.vocab) def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will create it using this object's vocab.""" + if not callable(kb_loader): + raise ValueError(Errors.E885.format(arg_type=type(kb_loader))) + self.kb = kb_loader(self.vocab) - self.cfg["entity_vector_length"] = self.kb.entity_vector_length def validate_kb(self) -> None: # Raise an error if the knowledge base is not initialized. @@ -309,14 +314,13 @@ class EntityLinker(TrainablePipe): sent_doc = doc[start_token:end_token].as_doc() # currently, the context is the same for each entity in a sentence (should be refined) xp = self.model.ops.xp - if self.cfg.get("incl_context"): + if self.incl_context: sentence_encoding = self.model.predict([sent_doc])[0] sentence_encoding_t = sentence_encoding.T sentence_norm = xp.linalg.norm(sentence_encoding_t) for ent in sent.ents: entity_count += 1 - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: + if ent.label_ in self.labels_discard: # ignoring this entity - setting to NIL final_kb_ids.append(self.NIL) else: @@ -334,13 +338,13 @@ class EntityLinker(TrainablePipe): prior_probs = xp.asarray( [c.prior_prob for c in candidates] ) - if not self.cfg.get("incl_prior"): + if not self.incl_prior: prior_probs = xp.asarray( [0.0 for _ in candidates] ) scores = prior_probs # add in similarity from the context - if self.cfg.get("incl_context"): + if self.incl_context: entity_encodings = xp.asarray( [c.entity_vector for c in candidates] ) diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index b10cdd8e8..60102efcb 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -66,26 +66,12 @@ class Sentencizer(Pipe): """ error_handler = self.get_error_handler() try: - self._call(doc) + tags = self.predict([doc]) + self.set_annotations([doc], tags) return doc except Exception as e: error_handler(self.name, self, [doc], e) - def _call(self, doc): - start = 0 - seen_period = False - for i, token in enumerate(doc): - is_in_punct_chars = token.text in self.punct_chars - token.is_sent_start = i == 0 - if seen_period and not token.is_punct and not is_in_punct_chars: - doc[start].is_sent_start = True - start = token.i - seen_period = False - elif is_in_punct_chars: - seen_period = True - if start < len(doc): - doc[start].is_sent_start = True - def predict(self, docs): """Apply the pipe to a batch of docs, without modifying them. diff --git a/spacy/scorer.py b/spacy/scorer.py index f20a0d786..f28cb5639 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -314,6 +314,9 @@ class Scorer: getter (Callable[[Doc, str], Iterable[Span]]): Defaults to getattr. 
If provided, getter(doc, attr) should return the spans for the individual doc. + has_annotation (Optional[Callable[[Doc], bool]]) should return whether a `Doc` + has annotation for this `attr`. Docs without annotation are skipped for + scoring purposes. RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under the keys attr_p/r/f and the per-type PRF scores under attr_per_type. @@ -324,7 +327,7 @@ class Scorer: for example in examples: pred_doc = example.predicted gold_doc = example.reference - # Option to handle docs without sents + # Option to handle docs without annotation for this attribute if has_annotation is not None: if not has_annotation(gold_doc): continue @@ -531,27 +534,28 @@ class Scorer: gold_span = gold_ent_by_offset.get( (pred_ent.start_char, pred_ent.end_char), None ) - label = gold_span.label_ - if label not in f_per_type: - f_per_type[label] = PRFScore() - gold = gold_span.kb_id_ - # only evaluating entities that overlap between gold and pred, - # to disentangle the performance of the NEL from the NER - if gold is not None: - pred = pred_ent.kb_id_ - if gold in negative_labels and pred in negative_labels: - # ignore true negatives - pass - elif gold == pred: - f_per_type[label].tp += 1 - elif gold in negative_labels: - f_per_type[label].fp += 1 - elif pred in negative_labels: - f_per_type[label].fn += 1 - else: - # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN - f_per_type[label].fp += 1 - f_per_type[label].fn += 1 + if gold_span is not None: + label = gold_span.label_ + if label not in f_per_type: + f_per_type[label] = PRFScore() + gold = gold_span.kb_id_ + # only evaluating entities that overlap between gold and pred, + # to disentangle the performance of the NEL from the NER + if gold is not None: + pred = pred_ent.kb_id_ + if gold in negative_labels and pred in negative_labels: + # ignore true negatives + pass + elif gold == pred: + f_per_type[label].tp += 1 + elif gold in negative_labels: + f_per_type[label].fp += 1 + elif pred in negative_labels: + f_per_type[label].fn += 1 + else: + # a wrong prediction (e.g. 
Q42 != Q3) counts as both a FP as well as a FN + f_per_type[label].fp += 1 + f_per_type[label].fn += 1 micro_prf = PRFScore() for label_prf in f_per_type.values(): micro_prf.tp += label_prf.tp diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 9209a840c..04e254c50 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -39,6 +39,11 @@ def ar_tokenizer(): return get_lang_class("ar")().tokenizer +@pytest.fixture(scope="session") +def bg_tokenizer(): + return get_lang_class("bg")().tokenizer + + @pytest.fixture(scope="session") def bn_tokenizer(): return get_lang_class("bn")().tokenizer diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 3ccdcc228..c27139d2f 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1,3 +1,5 @@ +import weakref + import pytest import numpy import logging @@ -663,3 +665,10 @@ def test_span_groups(en_tokenizer): assert doc.spans["hi"].has_overlap del doc.spans["hi"] assert "hi" not in doc.spans + + +def test_doc_spans_copy(en_tokenizer): + doc1 = en_tokenizer("Some text about Colombia and the Czech Republic") + assert weakref.ref(doc1) == doc1.spans.doc_ref + doc2 = doc1.copy() + assert weakref.ref(doc2) == doc2.spans.doc_ref diff --git a/spacy/tests/lang/bg/test_text.py b/spacy/tests/lang/bg/test_text.py new file mode 100644 index 000000000..3d35ba997 --- /dev/null +++ b/spacy/tests/lang/bg/test_text.py @@ -0,0 +1,30 @@ +import pytest +from spacy.lang.bg.lex_attrs import like_num + +@pytest.mark.parametrize( + "word,match", + [ + ("10", True), + ("1", True), + ("10000", True), + ("1.000", True), + ("бројка", False), + ("999,23", True), + ("едно", True), + ("две", True), + ("цифра", False), + ("единайсет", True), + ("десет", True), + ("сто", True), + ("брой", False), + ("хиляда", True), + ("милион", True), + (",", False), + ("милиарда", True), + ("билион", True), + ], +) +def test_bg_lex_attrs_like_number(bg_tokenizer, word, match): + tokens = bg_tokenizer(word) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 8ba2d0d3e..4883cceb8 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -230,7 +230,7 @@ def test_el_pipe_configuration(nlp): def get_lowercased_candidates(kb, span): return kb.get_alias_candidates(span.text.lower()) - @registry.misc.register("spacy.LowercaseCandidateGenerator.v1") + @registry.misc("spacy.LowercaseCandidateGenerator.v1") def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]: return get_lowercased_candidates @@ -250,6 +250,14 @@ def test_el_pipe_configuration(nlp): assert doc[2].ent_kb_id_ == "Q2" +def test_nel_nsents(nlp): + """Test that n_sents can be set through the configuration""" + entity_linker = nlp.add_pipe("entity_linker", config={}) + assert entity_linker.n_sents == 0 + entity_linker = nlp.replace_pipe("entity_linker", "entity_linker", config={"n_sents": 2}) + assert entity_linker.n_sents == 2 + + def test_vocab_serialization(nlp): """Test that string information is retained across storage""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 6a21ddfaa..9af8395a6 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -83,9 +83,9 @@ def test_replace_last_pipe(nlp): def 
test_replace_pipe_config(nlp): nlp.add_pipe("entity_linker") nlp.add_pipe("sentencizer") - assert nlp.get_pipe("entity_linker").cfg["incl_prior"] is True + assert nlp.get_pipe("entity_linker").incl_prior is True nlp.replace_pipe("entity_linker", "entity_linker", config={"incl_prior": False}) - assert nlp.get_pipe("entity_linker").cfg["incl_prior"] is False + assert nlp.get_pipe("entity_linker").incl_prior is False @pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")]) diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py index cee48522d..8435b32e1 100644 --- a/spacy/tests/regression/test_issue7029.py +++ b/spacy/tests/regression/test_issue7029.py @@ -61,7 +61,6 @@ def test_issue7029(): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] - nlp.select_pipes(enable=["tok2vec", "tagger"]) docs1 = list(nlp.pipe(texts, batch_size=1)) docs2 = list(nlp.pipe(texts, batch_size=4)) assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] diff --git a/spacy/tests/regression/test_issue7056.py b/spacy/tests/regression/test_issue7056.py index 64a420b84..541144877 100644 --- a/spacy/tests/regression/test_issue7056.py +++ b/spacy/tests/regression/test_issue7056.py @@ -1,5 +1,3 @@ -import pytest - from spacy.tokens.doc import Doc from spacy.vocab import Vocab from spacy.pipeline._parser_internals.arc_eager import ArcEager diff --git a/spacy/tests/regression/test_issue7062.py b/spacy/tests/regression/test_issue7062.py new file mode 100644 index 000000000..88e5d2520 --- /dev/null +++ b/spacy/tests/regression/test_issue7062.py @@ -0,0 +1,54 @@ +from spacy.kb import KnowledgeBase +from spacy.training import Example +from spacy.lang.en import English + + +# fmt: off +TRAIN_DATA = [ + ("Russ Cochran his reprints include EC Comics.", + {"links": {(0, 12): {"Q2146908": 1.0}}, + "entities": [(0, 12, "PERSON")], + "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}) +] +# fmt: on + + +def test_partial_links(): + # Test that having some entities on the doc without gold links, doesn't crash + nlp = English() + vector_length = 3 + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) + return mykb + + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, + {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]} + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + + # this will run the pipeline on the examples and shouldn't crash + results = nlp.evaluate(train_examples) + assert "PERSON" in results["ents_per_type"] + assert "PERSON" in results["nel_f_per_type"] + assert "ORG" in results["ents_per_type"] + assert "ORG" not in 
results["nel_f_per_type"] diff --git a/spacy/tests/regression/test_issue7065.py b/spacy/tests/regression/test_issue7065.py new file mode 100644 index 000000000..897687d19 --- /dev/null +++ b/spacy/tests/regression/test_issue7065.py @@ -0,0 +1,18 @@ +from spacy.lang.en import English + + +def test_issue7065(): + text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." + nlp = English() + nlp.add_pipe("sentencizer") + ruler = nlp.add_pipe("entity_ruler") + patterns = [{"label": "THING", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]}] + ruler.add_patterns(patterns) + + doc = nlp(text) + sentences = [s for s in doc.sents] + assert len(sentences) == 2 + sent0 = sentences[0] + ent = doc.ents[0] + assert ent.start < sent0.end < ent.end + assert sentences.index(ent.sent) == 0 diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 6709defb8..86f726c43 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -160,7 +160,7 @@ subword_features = false """ -@registry.architectures.register("my_test_parser") +@registry.architectures("my_test_parser") def my_parser(): tok2vec = build_Tok2Vec_model( MultiHashEmbed( diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 352c335ea..fb04d31a3 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -108,7 +108,7 @@ def test_serialize_subclassed_kb(): super().__init__(vocab, entity_vector_length) self.custom_field = custom_field - @registry.misc.register("spacy.CustomKB.v1") + @registry.misc("spacy.CustomKB.v1") def custom_kb( entity_vector_length: int, custom_field: int ) -> Callable[["Vocab"], KnowledgeBase]: diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py index 31b2a2d2f..c9e451471 100644 --- a/spacy/tests/test_architectures.py +++ b/spacy/tests/test_architectures.py @@ -4,12 +4,12 @@ from thinc.api import Linear from catalogue import RegistryError -@registry.architectures.register("my_test_function") -def create_model(nr_in, nr_out): - return Linear(nr_in, nr_out) - - def test_get_architecture(): + + @registry.architectures("my_test_function") + def create_model(nr_in, nr_out): + return Linear(nr_in, nr_out) + arch = registry.architectures.get("my_test_function") assert arch is create_model with pytest.raises(RegistryError): diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index e694baa40..58bebc4ca 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -7,7 +7,7 @@ from spacy import util from spacy import prefer_gpu, require_gpu, require_cpu from spacy.ml._precomputable_affine import PrecomputableAffine from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding -from spacy.util import dot_to_object, SimpleFrozenList +from spacy.util import dot_to_object, SimpleFrozenList, import_file from thinc.api import Config, Optimizer, ConfigValidationError from spacy.training.batchers import minibatch_by_words from spacy.lang.en import English @@ -17,7 +17,7 @@ from spacy.schemas import ConfigSchemaTraining from thinc.api import get_current_ops, NumpyOps, CupyOps -from .util import get_random_doc +from .util import get_random_doc, make_tempdir @pytest.fixture @@ -347,3 +347,35 @@ def test_resolve_dot_names(): errors = e.value.errors assert len(errors) == 1 assert 
errors[0]["loc"] == ["training", "xyz"] + + +def test_import_code(): + code_str = """ +from spacy import Language + +class DummyComponent: + def __init__(self, vocab, name): + pass + + def initialize(self, get_examples, *, nlp, dummy_param: int): + pass + +@Language.factory( + "dummy_component", +) +def make_dummy_component( + nlp: Language, name: str +): + return DummyComponent(nlp.vocab, name) +""" + + with make_tempdir() as temp_dir: + code_path = os.path.join(temp_dir, "code.py") + with open(code_path, "w") as fileh: + fileh.write(code_str) + + import_file("python_code", code_path) + config = {"initialize": {"components": {"dummy_component": {"dummy_param": 1}}}} + nlp = English.from_config(config) + nlp.add_pipe("dummy_component") + nlp.initialize() diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py index be3419b82..b8fbaf606 100644 --- a/spacy/tests/training/test_new_example.py +++ b/spacy/tests/training/test_new_example.py @@ -196,6 +196,104 @@ def test_Example_from_dict_with_entities_invalid(annots): assert len(list(example.reference.ents)) == 0 +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [ + (7, 15, "LOC"), + (11, 15, "LOC"), + (20, 26, "LOC"), + ], # overlapping + } + ], +) +def test_Example_from_dict_with_entities_overlapping(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + with pytest.raises(ValueError): + Example.from_dict(predicted, annots) + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "spans": { + "cities": [(7, 15, "LOC"), (20, 26, "LOC")], + "people": [(0, 1, "PERSON")], + }, + } + ], +) +def test_Example_from_dict_with_spans(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + assert len(list(example.reference.ents)) == 0 + assert len(list(example.reference.spans["cities"])) == 2 + assert len(list(example.reference.spans["people"])) == 1 + for span in example.reference.spans["cities"]: + assert span.label_ == "LOC" + for span in example.reference.spans["people"]: + assert span.label_ == "PERSON" + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "spans": { + "cities": [(7, 15, "LOC"), (11, 15, "LOC"), (20, 26, "LOC")], + "people": [(0, 1, "PERSON")], + }, + } + ], +) +def test_Example_from_dict_with_spans_overlapping(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + assert len(list(example.reference.ents)) == 0 + assert len(list(example.reference.spans["cities"])) == 3 + assert len(list(example.reference.spans["people"])) == 1 + for span in example.reference.spans["cities"]: + assert span.label_ == "LOC" + for span in example.reference.spans["people"]: + assert span.label_ == "PERSON" + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "spans": [(0, 1, "PERSON")], + }, + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "spans": {"cities": (7, 15, "LOC")}, + }, + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "spans": {"cities": [7, 11]}, + }, + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "spans": {"cities": [[7]]}, + }, + ], +) +def test_Example_from_dict_with_spans_invalid(annots): + vocab = Vocab() + predicted = 
Doc(vocab, words=annots["words"]) + with pytest.raises(ValueError): + Example.from_dict(predicted, annots) + + @pytest.mark.parametrize( "annots", [ diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 1c698abcc..f53660818 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -27,7 +27,7 @@ def test_readers(): factory = "textcat" """ - @registry.readers.register("myreader.v1") + @registry.readers("myreader.v1") def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]: annots = {"cats": {"POS": 1.0, "NEG": 0.0}} diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index 1aefa2b7c..64090925d 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -1,7 +1,8 @@ from .doc import Doc from .token import Token from .span import Span +from .span_group import SpanGroup from ._serialize import DocBin from .morphanalysis import MorphAnalysis -__all__ = ["Doc", "Token", "Span", "DocBin", "MorphAnalysis"] +__all__ = ["Doc", "Token", "Span", "SpanGroup", "DocBin", "MorphAnalysis"] diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py index 7b2d2d5b5..9ee1ad02f 100644 --- a/spacy/tokens/_dict_proxies.py +++ b/spacy/tokens/_dict_proxies.py @@ -33,8 +33,10 @@ class SpanGroups(UserDict): def _make_span_group(self, name: str, spans: Iterable["Span"]) -> SpanGroup: return SpanGroup(self.doc_ref(), name=name, spans=spans) - def copy(self) -> "SpanGroups": - return SpanGroups(self.doc_ref()).from_bytes(self.to_bytes()) + def copy(self, doc: "Doc" = None) -> "SpanGroups": + if doc is None: + doc = self.doc_ref() + return SpanGroups(doc).from_bytes(self.to_bytes()) def to_bytes(self) -> bytes: # We don't need to serialize this as a dict, because the groups diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index d276228da..850036483 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1188,7 +1188,7 @@ cdef class Doc: other.user_span_hooks = dict(self.user_span_hooks) other.length = self.length other.max_length = self.max_length - other.spans = self.spans.copy() + other.spans = self.spans.copy(doc=other) buff_size = other.max_length + (PADDING*2) assert buff_size > 0 tokens = other.mem.alloc(buff_size, sizeof(TokenC)) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 31eb1385b..06d86d2ac 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -357,7 +357,12 @@ cdef class Span: @property def sent(self): - """RETURNS (Span): The sentence span that the span is a part of.""" + """Obtain the sentence that contains this span. If the given span + crosses sentence boundaries, return only the first sentence + to which it belongs. + + RETURNS (Span): The sentence span that the span is a part of. 
+ """ if "sent" in self.doc.user_span_hooks: return self.doc.user_span_hooks["sent"](self) # Use `sent_start` token attribute to find sentence boundaries @@ -367,8 +372,8 @@ cdef class Span: start = self.start while self.doc.c[start].sent_start != 1 and start > 0: start += -1 - # Find end of the sentence - end = self.end + # Find end of the sentence - can be within the entity + end = self.start + 1 while end < self.doc.length and self.doc.c[end].sent_start != 1: end += 1 n += 1 diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index dc1c74e8a..9cf825bf9 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -22,6 +22,8 @@ cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) if "entities" in doc_annot: _add_entities_to_doc(output, doc_annot["entities"]) + if "spans" in doc_annot: + _add_spans_to_doc(output, doc_annot["spans"]) if array.size: output = output.from_array(attrs, array) # links are currently added with ENT_KB_ID on the token level @@ -314,13 +316,11 @@ def _annot2array(vocab, tok_annot, doc_annot): for key, value in doc_annot.items(): if value: - if key == "entities": + if key in ["entities", "cats", "spans"]: pass elif key == "links": ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value) tok_annot["ENT_KB_ID"] = ent_kb_ids - elif key == "cats": - pass else: raise ValueError(Errors.E974.format(obj="doc", key=key)) @@ -351,6 +351,29 @@ def _annot2array(vocab, tok_annot, doc_annot): return attrs, array.T +def _add_spans_to_doc(doc, spans_data): + if not isinstance(spans_data, dict): + raise ValueError(Errors.E879) + for key, span_list in spans_data.items(): + spans = [] + if not isinstance(span_list, list): + raise ValueError(Errors.E879) + for span_tuple in span_list: + if not isinstance(span_tuple, (list, tuple)) or len(span_tuple) < 2: + raise ValueError(Errors.E879) + start_char = span_tuple[0] + end_char = span_tuple[1] + label = 0 + kb_id = 0 + if len(span_tuple) > 2: + label = span_tuple[2] + if len(span_tuple) > 3: + kb_id = span_tuple[3] + span = doc.char_span(start_char, end_char, label=label, kb_id=kb_id) + spans.append(span) + doc.spans[key] = spans + + def _add_entities_to_doc(doc, ner_data): if ner_data is None: return @@ -397,7 +420,7 @@ def _fix_legacy_dict_data(example_dict): pass elif key == "ids": pass - elif key in ("cats", "links"): + elif key in ("cats", "links", "spans"): doc_dict[key] = value elif key in ("ner", "entities"): doc_dict["entities"] = value diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 79459a89b..8acf2783c 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -103,7 +103,11 @@ def console_logger(progress_bar: bool = False): @registry.loggers("spacy.WandbLogger.v1") def wandb_logger(project_name: str, remove_config_values: List[str] = []): - import wandb + try: + import wandb + from wandb import init, log, join # test that these are available + except ImportError: + raise ImportError(Errors.E880) console = console_logger(progress_bar=False) diff --git a/spacy/util.py b/spacy/util.py index aa9bf301e..bcb51fe7d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -70,7 +70,7 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co logger = logging.getLogger("spacy") logger_stream_handler = logging.StreamHandler() -logger_stream_handler.setFormatter(logging.Formatter("%(message)s")) 
+logger_stream_handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")) logger.addHandler(logger_stream_handler) @@ -1454,9 +1454,10 @@ def is_cython_func(func: Callable) -> bool: if hasattr(func, attr): # function or class instance return True # https://stackoverflow.com/a/55767059 - if hasattr(func, "__qualname__") and hasattr(func, "__module__"): # method - cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]] - return hasattr(cls_func, attr) + if hasattr(func, "__qualname__") and hasattr(func, "__module__") \ + and func.__module__ in sys.modules: # method + cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]] + return hasattr(cls_func, attr) return False diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index db73e9d91..1008797b3 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -61,6 +61,8 @@ cdef class Vocab: lookups (Lookups): Container for large lookup tables and dictionaries. oov_prob (float): Default OOV probability. vectors_name (unicode): Optional name to identify the vectors table. + get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]): + A function that yields base noun phrases used for Doc.noun_chunks. """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} if lookups in (None, True, False): diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index d8f0ce022..1739836ed 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -19,7 +19,7 @@ spaCy's built-in architectures that are used for different NLP tasks. All trainable [built-in components](/api#architecture-pipeline) expect a `model` argument defined in the config and document their the default architecture. Custom architectures can be registered using the -[`@spacy.registry.architectures`](/api/top-level#regsitry) decorator and used as +[`@spacy.registry.architectures`](/api/top-level#registry) decorator and used as part of the [training config](/usage/training#custom-functions). Also see the usage documentation on [layers and model architectures](/usage/layers-architectures). diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index a0b4c29bb..45feb8774 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -219,7 +219,7 @@ alignment mode `"strict". | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | -## Doc.set_ents {#ents tag="method" new="3"} +## Doc.set_ents {#set_ents tag="method" new="3"} Set the named entities in the document. @@ -616,8 +616,10 @@ phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. -If the `noun_chunk` [syntax iterator](/usage/adding-languages#language-data) has -not been implemeted for the given language, a `NotImplementedError` is raised. +To customize the noun chunk iterator in a loaded pipeline, modify +[`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk` +[syntax iterator](/usage/adding-languages#language-data) has not been +implemented for the given language, a `NotImplementedError` is raised. 
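As a small illustration of the `noun_chunks` documentation change above, the sketch below checks for a syntax iterator on a loaded pipeline before relying on `Doc.noun_chunks`; Bulgarian is only used here as an example of a language that does not define one:

```python
import spacy

nlp = spacy.blank("bg")
doc = nlp("Някакъв кратък текст.")
# Vocab.get_noun_chunks holds the syntax iterator used by Doc.noun_chunks.
# It is None for languages without one, in which case doc.noun_chunks
# raises NotImplementedError.
if nlp.vocab.get_noun_chunks is None:
    print("No noun-chunk iterator available for this language")
else:
    print(list(doc.noun_chunks))
```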
> #### Example > @@ -633,12 +635,14 @@ not been implemeted for the given language, a `NotImplementedError` is raised. | ---------- | ------------------------------------- | | **YIELDS** | Noun chunks in the document. ~~Span~~ | -## Doc.sents {#sents tag="property" model="parser"} +## Doc.sents {#sents tag="property" model="sentences"} -Iterate over the sentences in the document. Sentence spans have no label. To -improve accuracy on informal texts, spaCy calculates sentence boundaries from -the syntactic dependency parse. If the parser is disabled, the `sents` iterator -will be unavailable. +Iterate over the sentences in the document. Sentence spans have no label. + +This property is only available when +[sentence boundaries](/usage/linguistic-features#sbd) have been set on the +document by the `parser`, `senter`, `sentencizer` or some custom function. It +will raise an error otherwise. > #### Example > diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 47ba80c0f..1cc864059 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -31,6 +31,7 @@ architectures and their arguments and hyperparameters. > from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL > config = { > "labels_discard": [], +> "n_sents": 0, > "incl_prior": True, > "incl_context": True, > "model": DEFAULT_NEL_MODEL, @@ -43,6 +44,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | | `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | | `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | @@ -89,6 +91,7 @@ custom knowledge base, you should either call | `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | | `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | | `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | | `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | @@ -154,7 +157,7 @@ with the current vocab. > kb.add_alias(...) > return kb > entity_linker = nlp.add_pipe("entity_linker") -> entity_linker.set_kb(lambda: [], nlp=nlp, kb_loader=create_kb) +> entity_linker.set_kb(create_kb) > ``` | Name | Description | @@ -247,14 +250,14 @@ pipe's entity linking model and context encoder. 
Delegates to > losses = entity_linker.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## EntityLinker.score {#score tag="method" new="3"} diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md index 03f12b2c9..3cbc5dbd8 100644 --- a/website/docs/api/kb.md +++ b/website/docs/api/kb.md @@ -152,7 +152,7 @@ Get a list of all aliases in the knowledge base. | ----------- | -------------------------------------------------------- | | **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ | -## KnowledgeBase.get_candidates {#get_candidates tag="method"} +## KnowledgeBase.get_alias_candidates {#get_alias_candidates tag="method"} Given a certain textual mention as input, retrieve a list of candidate entities of type [`Candidate`](/api/kb/#candidate). @@ -160,13 +160,13 @@ of type [`Candidate`](/api/kb/#candidate). > #### Example > > ```python -> candidates = kb.get_candidates("Douglas") +> candidates = kb.get_alias_candidates("Douglas") > ``` -| Name | Description | -| ----------- | ------------------------------------- | -| `alias` | The textual mention or alias. ~~str~~ | -| **RETURNS** | iterable | The list of relevant `Candidate` objects. ~~List[Candidate]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------- | +| `alias` | The textual mention or alias. ~~str~~ | +| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ | ## KnowledgeBase.get_vector {#get_vector tag="method"} @@ -246,7 +246,7 @@ certain prior probability. Construct a `Candidate` object. Usually this constructor is not called directly, but instead these objects are returned by the -[`get_candidates`](/api/kb#get_candidates) method of a `KnowledgeBase`. +`get_candidates` method of the [`entity_linker`](/api/entitylinker) pipe. > #### Example > diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 6a8744463..a90476dab 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -364,7 +364,7 @@ Evaluate a pipeline's components. 
-The `Language.update` method now takes a batch of [`Example`](/api/example) +The `Language.evaluate` method now takes a batch of [`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse` objects. diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index fb48d68cc..cf1a1ca1f 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -137,14 +137,14 @@ Returns PRF scores for labeled or unlabeled spans. > print(scores["ents_f"]) > ``` -| Name | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ | -| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~str~~ | -| **RETURNS** | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ | +| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~Optional[Callable[[Doc], bool]]~~ | +| **RETURNS** | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"} diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 37d18c62e..333344b31 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -483,13 +483,40 @@ The L2 norm of the span's vector representation. | ----------- | --------------------------------------------------- | | **RETURNS** | The L2 norm of the vector representation. ~~float~~ | +## Span.sent {#sent tag="property" model="sentences"} + +The sentence span that this span is a part of. This property is only available +when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the +document by the `parser`, `senter`, `sentencizer` or some custom function. It +will raise an error otherwise. + +If the span happens to cross sentence boundaries, only the first sentence will +be returned. 
If it is required that the sentence always includes the +full span, the result can be adjusted as such: + +```python +sent = span.sent +sent = doc[sent.start : max(sent.end, span.end)] +``` + +> #### Example +> +> ```python +> doc = nlp("Give it back! He pleaded.") +> span = doc[1:3] +> assert span.sent.text == "Give it back!" +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------- | +| **RETURNS** | The sentence span that this span is a part of. ~~Span~~ | + ## Attributes {#attributes} | Name | Description | | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | | `doc` | The parent document. ~~Doc~~ | | `tensor` 2.1.7 | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ | -| `sent` | The sentence span that this span is a part of. ~~Span~~ | | `start` | The token offset for the start of the span. ~~int~~ | | `end` | The token offset for the end of the span. ~~int~~ | | `start_char` | The character offset for the start of the span. ~~int~~ | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index a2ca63002..8fe769cdd 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -21,14 +21,14 @@ Create the vocabulary. > vocab = Vocab(strings=["hello", "world"]) > ``` -| Name | Description | -| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ | -| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | -| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | -| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | -| `vectors_name` 2.2 | A name to identify the vectors table. ~~str~~ | -| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | +| Name | Description | +| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ | +| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | +| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | +| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | +| `vectors_name` 2.2 | A name to identify the vectors table. ~~str~~ | +| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | | `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). 
~~Optional[Callable[[Union[Doc, Span]], Iterator[Span]]]~~ |
## Vocab.\_\_len\_\_ {#len tag="method"}
@@ -182,14 +182,14 @@ subword features by average over n-grams of `orth` (introduced in spaCy `v2.1`).
| Name | Description |
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
-| `minn` 2.1 | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ |
-| `maxn` 2.1 | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ |
+| `minn` 2.1 | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ |
+| `maxn` 2.1 | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ |
| **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Vocab.set_vector {#set_vector tag="method" new="2"}
-Set a vector for a word in the vocabulary. Words can be referenced by string
-or hash value.
+Set a vector for a word in the vocabulary. Words can be referenced by string or
+hash value.
> #### Example
>
@@ -300,13 +300,14 @@ Load state from a binary string.
> assert type(PERSON) == int
> ```
-| Name | Description |
-| --------------------------------------------- | ------------------------------------------------------------------------------- |
-| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
-| `vectors` 2 | A table associating word IDs to word vectors. ~~Vectors~~ |
-| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
-| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
-| `writing_system` 2.1 | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
+| Name | Description |
+| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
+| `vectors` 2 | A table associating word IDs to word vectors. ~~Vectors~~ |
+| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
+| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
+| `writing_system` 2.1 | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
+| `get_noun_chunks` 3.0 | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks).
~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index d7a7d3ce8..0bc935d51 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -15,7 +15,7 @@ next: /usage/projects > ```python > from thinc.api import Model, chain > -> @spacy.registry.architectures.register("model.v1") +> @spacy.registry.architectures("model.v1") > def build_model(width: int, classes: int) -> Model: > tok2vec = build_tok2vec(width) > output_layer = build_output_layer(width, classes) @@ -563,7 +563,7 @@ matrix** (~~Floats2d~~) of predictions: ```python ### The model architecture -@spacy.registry.architectures.register("rel_model.v1") +@spacy.registry.architectures("rel_model.v1") def create_relation_model(...) -> Model[List[Doc], Floats2d]: model = ... # 👈 model will go here return model @@ -589,7 +589,7 @@ transforms the instance tensor into a final tensor holding the predictions: ```python ### The model architecture {highlight="6"} -@spacy.registry.architectures.register("rel_model.v1") +@spacy.registry.architectures("rel_model.v1") def create_relation_model( create_instance_tensor: Model[List[Doc], Floats2d], classification_layer: Model[Floats2d, Floats2d], @@ -613,7 +613,7 @@ The `classification_layer` could be something like a ```python ### The classification layer -@spacy.registry.architectures.register("rel_classification_layer.v1") +@spacy.registry.architectures("rel_classification_layer.v1") def create_classification_layer( nO: int = None, nI: int = None ) -> Model[Floats2d, Floats2d]: @@ -650,7 +650,7 @@ that has the full implementation. ```python ### The layer that creates the instance tensor -@spacy.registry.architectures.register("rel_instance_tensor.v1") +@spacy.registry.architectures("rel_instance_tensor.v1") def create_tensors( tok2vec: Model[List[Doc], List[Floats2d]], pooling: Model[Ragged, Floats2d], @@ -731,7 +731,7 @@ are within a **maximum distance** (in number of tokens) of each other: ```python ### Candidate generation -@spacy.registry.misc.register("rel_instance_generator.v1") +@spacy.registry.misc("rel_instance_generator.v1") def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]: def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]: candidates = [] diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 80a8eab1b..fd76c6e4d 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -585,7 +585,7 @@ print(ent_francisco) # ['Francisco', 'I', 'GPE'] To ensure that the sequence of token annotations remains consistent, you have to set entity annotations **at the document level**. However, you can't write directly to the `token.ent_iob` or `token.ent_type` attributes, so the easiest -way to set entities is to assign to the [`doc.ents`](/api/doc#ents) attribute +way to set entities is to use the [`doc.set_ents`](/api/doc#set_ents) function and create the new entity as a [`Span`](/api/span). ```python diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 1758f677a..5e9d3303c 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -95,6 +95,14 @@ spaCy's binary `.spacy` format. 
You can either include the data paths in the $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy ``` +> #### Tip: Enable your GPU +> +> Use the `--gpu-id` option to select the GPU: +> +> ```cli +> $ python -m spacy train config.cfg --gpu-id 0 +> ``` + The recommended config settings generated by the quickstart widget and the diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index f8bcb39d1..5353f9ded 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -603,6 +603,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**. | `GoldParse` | [`Example`](/api/example) | | `GoldCorpus` | [`Corpus`](/api/corpus) | | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | +| `KnowledgeBase.get_candidates` | [`KnowledgeBase.get_alias_candidates`](/api/kb#get_alias_candidates) | | `Matcher.pipe`, `PhraseMatcher.pipe` | not needed | | `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) | | `spacy init-model` | [`spacy init vectors`](/api/cli#init-vectors) | diff --git a/website/meta/universe.json b/website/meta/universe.json index cae818e88..f67b7c219 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -58,7 +58,7 @@ }, "category": ["pipeline"], "tags": ["sentiment", "textblob"] - }, + }, { "id": "spacy-ray", "title": "spacy-ray", @@ -2647,14 +2647,14 @@ "github": "medspacy" } }, - { + { "id": "rita-dsl", "title": "RITA DSL", "slogan": "Domain Specific Language for creating language rules", "github": "zaibacu/rita-dsl", "description": "A Domain Specific Language (DSL) for building language patterns. 
These can be later compiled into spaCy patterns, pure regex, or any other format", "pip": "rita-dsl", - "thumb": "https://raw.githubusercontent.com/zaibacu/rita-dsl/master/docs/assets/logo-100px.png", + "thumb": "https://raw.githubusercontent.com/zaibacu/rita-dsl/master/docs/assets/logo-100px.png", "code_language": "python", "code_example": [ "import spacy", @@ -2754,14 +2754,41 @@ "{", " var lexeme = doc.Vocab[word.Text];", " Console.WriteLine($@\"{lexeme.Text} {lexeme.Orth} {lexeme.Shape} {lexeme.Prefix} {lexeme.Suffix} {lexeme.IsAlpha} {lexeme.IsDigit} {lexeme.IsTitle} {lexeme.Lang}\");", - "}" - ], + "}" + ], "code_language": "csharp", "author": "Antonio Miras", "author_links": { "github": "AMArostegui" }, "category": ["nonpython"] + }, + { + "id": "ruts", + "title": "ruTS", + "slogan": "A library for statistics extraction from texts in Russian", + "description": "The library allows extracting the following statistics from a text: basic statistics, readability metrics, lexical diversity metrics, morphological statistics", + "github": "SergeyShk/ruTS", + "pip": "ruts", + "code_example": [ + "import spacy", + "import ruts", + "", + "nlp = spacy.load('ru_core_news_sm')", + "nlp.add_pipe('basic', last=True)", + "doc = nlp('мама мыла раму')", + "doc._.basic.get_stats()" + ], + "code_language": "python", + "thumb": "https://habrastorage.org/webt/6z/le/fz/6zlefzjavzoqw_wymz7v3pwgfp4.png", + "image": "https://clipartart.com/images/free-tree-roots-clipart-black-and-white-2.png", + "author": "Sergey Shkarin", + "author_links": { + "twitter": "shk_sergey", + "github": "SergeyShk" + }, + "category": ["pipeline", "standalone"], + "tags": ["Text Analytics", "Russian"] } ],