diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml new file mode 100644 index 000000000..bb7f51d29 --- /dev/null +++ b/.github/workflows/gputests.yml @@ -0,0 +1,21 @@ +name: Weekly GPU tests + +on: + schedule: + - cron: '0 1 * * MON' + +jobs: + weekly-gputests: + strategy: + fail-fast: false + matrix: + branch: [master, v4] + runs-on: ubuntu-latest + steps: + - name: Trigger buildkite build + uses: buildkite/trigger-pipeline-action@v1.2.0 + env: + PIPELINE: explosion-ai/spacy-slow-gpu-tests + BRANCH: ${{ matrix.branch }} + MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action" + BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }} diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml new file mode 100644 index 000000000..1a99c751c --- /dev/null +++ b/.github/workflows/slowtests.yml @@ -0,0 +1,37 @@ +name: Daily slow tests + +on: + schedule: + - cron: '0 0 * * *' + +jobs: + daily-slowtests: + strategy: + fail-fast: false + matrix: + branch: [master, v4] + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v1 + with: + ref: ${{ matrix.branch }} + - name: Get commits from past 24 hours + id: check_commits + run: | + today=$(date '+%Y-%m-%d %H:%M:%S') + yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S') + if git log --after="$yesterday" --before="$today" | grep commit ; then + echo "::set-output name=run_tests::true" + else + echo "::set-output name=run_tests::false" + fi + + - name: Trigger buildkite build + if: steps.check_commits.outputs.run_tests == 'true' + uses: buildkite/trigger-pipeline-action@v1.2.0 + env: + PIPELINE: explosion-ai/spacy-slow-tests + BRANCH: ${{ matrix.branch }} + MESSAGE: ":github: Daily slow tests - triggered from a GitHub Action" + BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }} diff --git a/.gitignore b/.gitignore index 60036a475..ac72f2bbf 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,6 @@ keys/ spacy/tests/package/setup.cfg spacy/tests/package/pyproject.toml spacy/tests/package/requirements.txt -spacy/tests/universe/universe.json # Website website/.cache/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a4d321aa3..9a7d0744a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -143,15 +143,25 @@ Changes to `.py` files will be effective immediately. ### Fixing bugs When fixing a bug, first create an -[issue](https://github.com/explosion/spaCy/issues) if one does not already exist. -The description text can be very short – we don't want to make this too +[issue](https://github.com/explosion/spaCy/issues) if one does not already +exist. The description text can be very short – we don't want to make this too bureaucratic. -Next, create a test file named `test_issue[ISSUE NUMBER].py` in the -[`spacy/tests/regression`](spacy/tests/regression) folder. Test for the bug -you're fixing, and make sure the test fails. Next, add and commit your test file -referencing the issue number in the commit message. Finally, fix the bug, make -sure your test passes and reference the issue in your commit message. +Next, add a test to the relevant file in the +[`spacy/tests`](spacy/tests)folder. Then add a [pytest +mark](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers), +`@pytest.mark.issue(NUMBER)`, to reference the issue number. + +```python +# Assume you're fixing Issue #1234 +@pytest.mark.issue(1234) +def test_issue1234(): + ... +``` + +Test for the bug you're fixing, and make sure the test fails. 
Next, add and +commit your test file. Finally, fix the bug, make sure your test passes and +reference the issue number in your pull request description. 📖 **For more information on how to add tests, check out the [tests README](spacy/tests/README.md).** diff --git a/LICENSE b/LICENSE index 86f501b92..d76864579 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (C) 2016-2021 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal +Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/MANIFEST.in b/MANIFEST.in index c1524d460..b7826e456 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,11 +1,8 @@ -recursive-include include *.h recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml include LICENSE include README.md include pyproject.toml include spacy/py.typed -recursive-exclude spacy/lang *.json -recursive-include spacy/lang *.json.gz -recursive-include spacy/cli *.json *.yml +recursive-include spacy/cli *.yml recursive-include licenses * recursive-exclude spacy *.cpp diff --git a/README.md b/README.md index 57d76fb45..05c912ffa 100644 --- a/README.md +++ b/README.md @@ -32,19 +32,20 @@ open-source software, released under the MIT license. ## 📖 Documentation -| Documentation | | -| -------------------------- | -------------------------------------------------------------- | -| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! | -| 📚 **[Usage Guides]** | How to use spaCy and its features. | -| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. | -| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. | -| 🎛 **[API Reference]** | The detailed reference for spaCy's API. | -| 📦 **[Models]** | Download trained pipelines for spaCy. | -| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. | -| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. | -| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. | -| 🛠 **[Changelog]** | Changes and version history. | -| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. | +| Documentation | | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! | +| 📚 **[Usage Guides]** | How to use spaCy and its features. | +| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. | +| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. | +| 🎛 **[API Reference]** | The detailed reference for spaCy's API. | +| 📦 **[Models]** | Download trained pipelines for spaCy. | +| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. | +| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. 
| +| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. | +| 🛠 **[Changelog]** | Changes and version history. | +| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. | +| spaCy Tailored Pipelines | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** | [spacy 101]: https://spacy.io/usage/spacy-101 [new in v3.0]: https://spacy.io/usage/v3 @@ -60,9 +61,7 @@ open-source software, released under the MIT license. ## 💬 Where to ask questions -The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**, -**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**, -**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**. +The spaCy project is maintained by the [spaCy team](https://explosion.ai/about). Please understand that we won't be able to provide individual support via email. We also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4291b6e0a..4624b2eb2 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -11,19 +11,21 @@ trigger: exclude: - "website/*" - "*.md" + - ".github/workflows/*" pr: - paths: + paths: exclude: - "*.md" - "website/docs/*" - "website/src/*" + - ".github/workflows/*" jobs: # Perform basic checks for most important errors (syntax etc.) Uses the config # defined in .flake8 and overwrites the selected codes. - job: "Validate" pool: - vmImage: "ubuntu-18.04" + vmImage: "ubuntu-latest" steps: - task: UsePythonVersion@0 inputs: @@ -39,49 +41,49 @@ jobs: matrix: # We're only running one platform per Python version to speed up builds Python36Linux: - imageName: "ubuntu-18.04" + imageName: "ubuntu-latest" python.version: "3.6" # Python36Windows: - # imageName: "windows-2019" + # imageName: "windows-latest" # python.version: "3.6" # Python36Mac: - # imageName: "macos-10.14" + # imageName: "macos-latest" # python.version: "3.6" # Python37Linux: - # imageName: "ubuntu-18.04" + # imageName: "ubuntu-latest" # python.version: "3.7" Python37Windows: - imageName: "windows-2019" + imageName: "windows-latest" python.version: "3.7" # Python37Mac: - # imageName: "macos-10.14" + # imageName: "macos-latest" # python.version: "3.7" # Python38Linux: - # imageName: "ubuntu-18.04" + # imageName: "ubuntu-latest" # python.version: "3.8" # Python38Windows: - # imageName: "windows-2019" + # imageName: "windows-latest" # python.version: "3.8" Python38Mac: - imageName: "macos-10.14" + imageName: "macos-latest" python.version: "3.8" Python39Linux: - imageName: "ubuntu-18.04" + imageName: "ubuntu-latest" python.version: "3.9" # Python39Windows: - # imageName: "windows-2019" + # imageName: "windows-latest" # python.version: "3.9" # Python39Mac: - # imageName: "macos-10.14" + # imageName: "macos-latest" # python.version: "3.9" Python310Linux: - imageName: "ubuntu-20.04" + imageName: "ubuntu-latest" python.version: "3.10" Python310Windows: - imageName: "windows-2019" + imageName: "windows-latest" python.version: "3.10" Python310Mac: - imageName: "macos-10.15" + imageName: "macos-latest" python.version: "3.10" maxParallel: 4 pool: diff --git a/extra/DEVELOPER_DOCS/Code Conventions.md 
b/extra/DEVELOPER_DOCS/Code Conventions.md index 7a3f6996f..eba466c46 100644 --- a/extra/DEVELOPER_DOCS/Code Conventions.md +++ b/extra/DEVELOPER_DOCS/Code Conventions.md @@ -444,7 +444,7 @@ spaCy uses the [`pytest`](http://doc.pytest.org/) framework for testing. Tests f When adding tests, make sure to use descriptive names and only test for one behavior at a time. Tests should be grouped into modules dedicated to the same type of functionality and some test modules are organized as directories of test files related to the same larger area of the library, e.g. `matcher` or `tokenizer`. -Regression tests are tests that refer to bugs reported in specific issues. They should live in the `regression` module and are named according to the issue number (e.g. `test_issue1234.py`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression tests suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first. Every once in a while, we go through the `regression` module and group tests together into larger files by issue number, in groups of 500 to 1000 numbers. This prevents us from ending up with too many individual files over time. +Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g., `test_issue1234.py`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression test suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first. The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file.
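As a concrete sketch of the regression-test convention described above — assuming the shared `en_tokenizer` fixture from `spacy/tests/conftest.py`, an invented issue number, and an illustrative assertion — a test following the new layout looks roughly like this:

```python
# Sketch of a regression test under the convention above: it lives in the
# relevant module (e.g. spacy/tests/tokenizer/) rather than a separate
# `regression` folder, carries the issue marker, and reuses the shared
# `en_tokenizer` fixture from spacy/tests/conftest.py. The issue number and
# the asserted tokenization are illustrative only.
import pytest


@pytest.mark.issue(1234)
def test_issue1234(en_tokenizer):
    # Hypothetical bug: "don't" was not split into "do" + "n't".
    doc = en_tokenizer("I don't know.")
    assert [t.text for t in doc] == ["I", "do", "n't", "know", "."]
```

Grouping tests by the `issue` marker replaces the old one-file-per-issue layout while keeping the link back to the original report.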
diff --git a/requirements.txt b/requirements.txt index fe621c55a..f3cb67bd9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,7 +31,8 @@ pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<3.10.0 hypothesis>=3.27.0,<7.0.0 -mypy>=0.910 +mypy==0.910 types-dataclasses>=0.1.3; python_version < "3.7" types-mock>=0.1.1 types-requests +black>=22.0,<23.0 diff --git a/setup.cfg b/setup.cfg index d4b0fc54e..9a257f5f9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -77,37 +77,39 @@ transformers = ray = spacy_ray>=0.1.0,<1.0.0 cuda = - cupy>=5.0.0b4,<10.0.0 + cupy>=5.0.0b4,<11.0.0 cuda80 = - cupy-cuda80>=5.0.0b4,<10.0.0 + cupy-cuda80>=5.0.0b4,<11.0.0 cuda90 = - cupy-cuda90>=5.0.0b4,<10.0.0 + cupy-cuda90>=5.0.0b4,<11.0.0 cuda91 = - cupy-cuda91>=5.0.0b4,<10.0.0 + cupy-cuda91>=5.0.0b4,<11.0.0 cuda92 = - cupy-cuda92>=5.0.0b4,<10.0.0 + cupy-cuda92>=5.0.0b4,<11.0.0 cuda100 = - cupy-cuda100>=5.0.0b4,<10.0.0 + cupy-cuda100>=5.0.0b4,<11.0.0 cuda101 = - cupy-cuda101>=5.0.0b4,<10.0.0 + cupy-cuda101>=5.0.0b4,<11.0.0 cuda102 = - cupy-cuda102>=5.0.0b4,<10.0.0 + cupy-cuda102>=5.0.0b4,<11.0.0 cuda110 = - cupy-cuda110>=5.0.0b4,<10.0.0 + cupy-cuda110>=5.0.0b4,<11.0.0 cuda111 = - cupy-cuda111>=5.0.0b4,<10.0.0 + cupy-cuda111>=5.0.0b4,<11.0.0 cuda112 = - cupy-cuda112>=5.0.0b4,<10.0.0 + cupy-cuda112>=5.0.0b4,<11.0.0 cuda113 = - cupy-cuda113>=5.0.0b4,<10.0.0 + cupy-cuda113>=5.0.0b4,<11.0.0 cuda114 = - cupy-cuda114>=5.0.0b4,<10.0.0 + cupy-cuda114>=5.0.0b4,<11.0.0 +cuda115 = + cupy-cuda115>=5.0.0b4,<11.0.0 apple = thinc-apple-ops>=0.0.4,<1.0.0 # Language tokenizers with external dependencies ja = - sudachipy>=0.4.9 - sudachidict_core>=20200330 + sudachipy>=0.5.2,!=0.6.1 + sudachidict_core>=20211220 ko = natto-py==0.9.0 th = diff --git a/setup.py b/setup.py index 03a1e01dd..fcc124a43 100755 --- a/setup.py +++ b/setup.py @@ -81,7 +81,6 @@ COPY_FILES = { ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package", ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package", ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package", - ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe", } diff --git a/spacy/about.py b/spacy/about.py index 29f78805c..d01b278c9 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.2.0" +__version__ = "3.2.2" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 640fb2f3c..dc8eed7c3 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -1,3 +1,6 @@ +from .errors import Errors + +IOB_STRINGS = ("", "I", "O", "B") IDS = { "": NULL_ATTR, @@ -64,7 +67,6 @@ IDS = { "FLAG61": FLAG61, "FLAG62": FLAG62, "FLAG63": FLAG63, - "ID": ID, "ORTH": ORTH, "LOWER": LOWER, @@ -72,7 +74,6 @@ IDS = { "SHAPE": SHAPE, "PREFIX": PREFIX, "SUFFIX": SUFFIX, - "LENGTH": LENGTH, "LEMMA": LEMMA, "POS": POS, @@ -87,7 +88,7 @@ IDS = { "SPACY": SPACY, "LANG": LANG, "MORPH": MORPH, - "IDX": IDX + "IDX": IDX, } @@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): """ inty_attrs = {} if _do_deprecated: - if 'F' in stringy_attrs: + if "F" in stringy_attrs: stringy_attrs["ORTH"] = stringy_attrs.pop("F") - if 'L' in stringy_attrs: + if "L" in stringy_attrs: stringy_attrs["LEMMA"] = stringy_attrs.pop("L") - if 'pos' in stringy_attrs: + if "pos" in 
stringy_attrs: stringy_attrs["TAG"] = stringy_attrs.pop("pos") - if 'morph' in stringy_attrs: - morphs = stringy_attrs.pop('morph') - if 'number' in stringy_attrs: - stringy_attrs.pop('number') - if 'tenspect' in stringy_attrs: - stringy_attrs.pop('tenspect') + if "morph" in stringy_attrs: + morphs = stringy_attrs.pop("morph") + if "number" in stringy_attrs: + stringy_attrs.pop("number") + if "tenspect" in stringy_attrs: + stringy_attrs.pop("tenspect") morph_keys = [ - 'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number', - 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', - 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', - 'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr', - 'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm', - 'NumValue', 'PartType', 'Polite', 'StyleVariant', - 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', - 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case', - 'Polarity', 'PrepCase', 'Animacy' # U20 + "PunctType", + "PunctSide", + "Other", + "Degree", + "AdvType", + "Number", + "VerbForm", + "PronType", + "Aspect", + "Tense", + "PartType", + "Poss", + "Hyph", + "ConjType", + "NumType", + "Foreign", + "VerbType", + "NounType", + "Gender", + "Mood", + "Negative", + "Tense", + "Voice", + "Abbr", + "Derivation", + "Echo", + "Foreign", + "NameType", + "NounType", + "NumForm", + "NumValue", + "PartType", + "Polite", + "StyleVariant", + "PronType", + "AdjType", + "Person", + "Variant", + "AdpType", + "Reflex", + "Negative", + "Mood", + "Aspect", + "Case", + "Polarity", + "PrepCase", + "Animacy", # U20 ] for key in morph_keys: if key in stringy_attrs: @@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): for name, value in stringy_attrs.items(): int_key = intify_attr(name) if int_key is not None: + if int_key == ENT_IOB: + if value in IOB_STRINGS: + value = IOB_STRINGS.index(value) + elif isinstance(value, str): + raise ValueError(Errors.E1025.format(value=value)) if strings_map is not None and isinstance(value, str): - if hasattr(strings_map, 'add'): + if hasattr(strings_map, "add"): value = strings_map.add(value) else: value = strings_map[value] diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index 56ee12336..409fac4ed 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -25,7 +25,7 @@ def debug_config_cli( show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.") # fmt: on ): - """Debug a config.cfg file and show validation errors. The command will + """Debug a config file and show validation errors. The command will create all objects in the tree and validate them. Note that some config validation errors are blocking and will prevent the rest of the config from being resolved. 
This means that you may not see all validation errors at diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 3143e2c62..a63795148 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components from ..schemas import ConfigSchemaTraining from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals.nonproj import DELIMITER -from ..pipeline import Morphologizer +from ..pipeline import Morphologizer, SpanCategorizer from ..morphology import Morphology from ..language import Language from ..util import registry, resolve_dot_names @@ -193,6 +193,70 @@ def debug_data( else: msg.info("No word vectors present in the package") + if "spancat" in factory_names: + model_labels_spancat = _get_labels_from_spancat(nlp) + has_low_data_warning = False + has_no_neg_warning = False + + msg.divider("Span Categorization") + msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True) + + msg.text("Label counts in train data: ", show=verbose) + for spans_key, data_labels in gold_train_data["spancat"].items(): + msg.text( + f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}", + show=verbose, + ) + # Data checks: only take the spans keys in the actual spancat components + data_labels_in_component = { + spans_key: gold_train_data["spancat"][spans_key] + for spans_key in model_labels_spancat.keys() + } + for spans_key, data_labels in data_labels_in_component.items(): + for label, count in data_labels.items(): + # Check for missing labels + spans_key_in_model = spans_key in model_labels_spancat.keys() + if (spans_key_in_model) and ( + label not in model_labels_spancat[spans_key] + ): + msg.warn( + f"Label '{label}' is not present in the model labels of key '{spans_key}'. " + "Performance may degrade after training." 
+ ) + # Check for low number of examples per label + if count <= NEW_LABEL_THRESHOLD: + msg.warn( + f"Low number of examples for label '{label}' in key '{spans_key}' ({count})" + ) + has_low_data_warning = True + # Check for negative examples + with msg.loading("Analyzing label distribution..."): + neg_docs = _get_examples_without_label( + train_dataset, label, "spancat", spans_key + ) + if neg_docs == 0: + msg.warn(f"No examples for texts WITHOUT new label '{label}'") + has_no_neg_warning = True + + if has_low_data_warning: + msg.text( + f"To train a new span type, your data should include at " + f"least {NEW_LABEL_THRESHOLD} instances of the new label", + show=verbose, + ) + else: + msg.good("Good amount of examples for all labels") + + if has_no_neg_warning: + msg.text( + "Training data should always include examples of spans " + "in context, as well as examples without a given span " + "type.", + show=verbose, + ) + else: + msg.good("Examples without ocurrences available for all labels") + if "ner" in factory_names: # Get all unique NER labels present in the data labels = set( @@ -203,6 +267,7 @@ def debug_data( has_low_data_warning = False has_no_neg_warning = False has_ws_ents_error = False + has_boundary_cross_ents_warning = False msg.divider("Named Entity Recognition") msg.info(f"{len(model_labels)} label(s)") @@ -237,17 +302,25 @@ def debug_data( has_low_data_warning = True with msg.loading("Analyzing label distribution..."): - neg_docs = _get_examples_without_label(train_dataset, label) + neg_docs = _get_examples_without_label(train_dataset, label, "ner") if neg_docs == 0: msg.warn(f"No examples for texts WITHOUT new label '{label}'") has_no_neg_warning = True + if gold_train_data["boundary_cross_ents"]: + msg.warn( + f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries" + ) + has_boundary_cross_ents_warning = True + if not has_low_data_warning: msg.good("Good amount of examples for all labels") if not has_no_neg_warning: msg.good("Examples without occurrences available for all labels") if not has_ws_ents_error: msg.good("No entities consisting of or starting/ending with whitespace") + if not has_boundary_cross_ents_warning: + msg.good("No entities crossing sentence boundaries") if has_low_data_warning: msg.text( @@ -564,7 +637,9 @@ def _compile_gold( "deps": Counter(), "words": Counter(), "roots": Counter(), + "spancat": dict(), "ws_ents": 0, + "boundary_cross_ents": 0, "n_words": 0, "n_misaligned_words": 0, "words_missing_vectors": Counter(), @@ -593,6 +668,7 @@ def _compile_gold( if nlp.vocab.strings[word] not in nlp.vocab.vectors: data["words_missing_vectors"].update([word]) if "ner" in factory_names: + sent_starts = eg.get_aligned_sent_starts() for i, label in enumerate(eg.get_aligned_ner()): if label is None: continue @@ -602,8 +678,19 @@ def _compile_gold( if label.startswith(("B-", "U-")): combined_label = label.split("-")[1] data["ner"][combined_label] += 1 + if sent_starts[i] == True and label.startswith(("I-", "L-")): + data["boundary_cross_ents"] += 1 elif label == "-": data["ner"]["-"] += 1 + if "spancat" in factory_names: + for span_key in list(eg.reference.spans.keys()): + if span_key not in data["spancat"]: + data["spancat"][span_key] = Counter() + for i, span in enumerate(eg.reference.spans[span_key]): + if span.label_ is None: + continue + else: + data["spancat"][span_key][span.label_] += 1 if "textcat" in factory_names or "textcat_multilabel" in factory_names: data["cats"].update(gold.cats) if any(val not in (0, 1) for val in 
gold.cats.values()): @@ -674,21 +761,57 @@ def _format_labels( return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)]) -def _get_examples_without_label(data: Sequence[Example], label: str) -> int: +def _get_examples_without_label( + data: Sequence[Example], + label: str, + component: Literal["ner", "spancat"] = "ner", + spans_key: Optional[str] = "sc", +) -> int: count = 0 for eg in data: - labels = [ - label.split("-")[1] - for label in eg.get_aligned_ner() - if label not in ("O", "-", None) - ] + if component == "ner": + labels = [ + label.split("-")[1] + for label in eg.get_aligned_ner() + if label not in ("O", "-", None) + ] + + if component == "spancat": + labels = ( + [span.label_ for span in eg.reference.spans[spans_key]] + if spans_key in eg.reference.spans + else [] + ) + if label not in labels: count += 1 return count -def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]: - if pipe_name not in nlp.pipe_names: - return set() - pipe = nlp.get_pipe(pipe_name) - return set(pipe.labels) +def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]: + pipe_names = [ + pipe_name + for pipe_name in nlp.pipe_names + if nlp.get_pipe_meta(pipe_name).factory == factory_name + ] + labels: Set[str] = set() + for pipe_name in pipe_names: + pipe = nlp.get_pipe(pipe_name) + labels.update(pipe.labels) + return labels + + +def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]: + pipe_names = [ + pipe_name + for pipe_name in nlp.pipe_names + if nlp.get_pipe_meta(pipe_name).factory == "spancat" + ] + labels: Dict[str, Set[str]] = {} + for pipe_name in pipe_names: + pipe = nlp.get_pipe(pipe_name) + assert isinstance(pipe, SpanCategorizer) + if pipe.key not in labels: + labels[pipe.key] = set() + labels[pipe.key].update(pipe.labels) + return labels diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 530b38eb3..d4cd939c2 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -27,7 +27,7 @@ class Optimizations(str, Enum): @init_cli.command("config") def init_config_cli( # fmt: off - output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True), + output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"), pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."), @@ -37,7 +37,7 @@ def init_config_cli( # fmt: on ): """ - Generate a starter config.cfg for training. Based on your requirements + Generate a starter config file for training. Based on your requirements specified via the CLI arguments, this command generates a config with the optimal settings for your use case. This includes the choice of architecture, pretrained weights and related hyperparameters. 
@@ -66,15 +66,15 @@ def init_config_cli( @init_cli.command("fill-config") def init_fill_config_cli( # fmt: off - base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False), - output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True), + base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False), + output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True), pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"), code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), # fmt: on ): """ - Fill partial config.cfg with default values. Will add all missing settings + Fill partial config file with default values. Will add all missing settings from the default config and will create all objects, check the registered functions for their default values and update the base config. This command can be used with a config generated via the training quickstart widget: diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 76e14daf5..b8c8397b6 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -4,8 +4,10 @@ from pathlib import Path from wasabi import Printer, MarkdownRenderer, get_raw_input from thinc.api import Config from collections import defaultdict +from catalogue import RegistryError import srsly import sys +import re from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX from ..schemas import validate, ModelMetaSchema @@ -108,6 +110,24 @@ def package( ", ".join(meta["requirements"]), ) if name is not None: + if not name.isidentifier(): + msg.fail( + f"Model name ('{name}') is not a valid module name. " + "This is required so it can be imported as a module.", + "We recommend names that use ASCII A-Z, a-z, _ (underscore), " + "and 0-9. " + "For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers", + exits=1, + ) + if not _is_permitted_package_name(name): + msg.fail( + f"Model name ('{name}') is not a permitted package name. " + "This is required to correctly load the model with spacy.load.", + "We recommend names that use ASCII A-Z, a-z, _ (underscore), " + "and 0-9. " + "For specific details see: https://www.python.org/dev/peps/pep-0426/#name", + exits=1, + ) meta["name"] = name if version is not None: meta["version"] = version @@ -161,7 +181,7 @@ def package( imports="\n".join(f"from . 
import {m}" for m in imports) ) create_file(package_path / "__init__.py", init_py) - msg.good(f"Successfully created package '{model_name_v}'", main_path) + msg.good(f"Successfully created package directory '{model_name_v}'", main_path) if create_sdist: with util.working_dir(main_path): util.run_command([sys.executable, "setup.py", "sdist"], capture=False) @@ -170,8 +190,14 @@ def package( if create_wheel: with util.working_dir(main_path): util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False) - wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}" + wheel_name_squashed = re.sub("_+", "_", model_name_v) + wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}" msg.good(f"Successfully created binary wheel", wheel) + if "__" in model_name: + msg.warn( + f"Model name ('{model_name}') contains a run of underscores. " + "Runs of underscores are not significant in installed package names.", + ) def has_wheel() -> bool: @@ -212,9 +238,18 @@ def get_third_party_dependencies( if "factory" in component: funcs["factories"].add(component["factory"]) modules = set() + lang = config["nlp"]["lang"] for reg_name, func_names in funcs.items(): for func_name in func_names: - func_info = util.registry.find(reg_name, func_name) + # Try the lang-specific version and fall back + try: + func_info = util.registry.find(reg_name, lang + "." + func_name) + except RegistryError: + try: + func_info = util.registry.find(reg_name, func_name) + except RegistryError as regerr: + # lang-specific version being absent is not actually an issue + raise regerr from None module_name = func_info.get("module") # type: ignore[attr-defined] if module_name: # the code is part of a module, not a --code file modules.add(func_info["module"].split(".")[0]) # type: ignore[index] @@ -412,6 +447,14 @@ def _format_label_scheme(data: Dict[str, Any]) -> str: return md.text +def _is_permitted_package_name(package_name: str) -> bool: + # regex from: https://www.python.org/dev/peps/pep-0426/#name + permitted_match = re.search( + r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE + ) + return permitted_match is not None + + TEMPLATE_SETUP = """ #!/usr/bin/env python import io diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index b5057e401..5e0cdfdf2 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -1,6 +1,7 @@ from typing import Any, Dict, Optional from pathlib import Path from wasabi import msg +import os import re import shutil import requests @@ -129,10 +130,17 @@ def fetch_asset( the asset failed. 
""" dest_path = (project_path / dest).resolve() - if dest_path.exists() and checksum: + if dest_path.exists(): # If there's already a file, check for checksum - if checksum == get_checksum(dest_path): - msg.good(f"Skipping download with matching checksum: {dest}") + if checksum: + if checksum == get_checksum(dest_path): + msg.good(f"Skipping download with matching checksum: {dest}") + return + else: + # If there's not a checksum, make sure the file is a possibly valid size + if os.path.getsize(dest_path) == 0: + msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}") + os.remove(dest_path) # We might as well support the user here and create parent directories in # case the asset dir isn't listed as a dir to create in the project.yml if not dest_path.parent.exists(): diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index b78806fec..fb79a4f60 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -6,6 +6,11 @@ can help generate the best possible configuration, given a user's requirements. [paths] train = null dev = null +{% if use_transformer or optimize == "efficiency" or not word_vectors -%} +vectors = null +{% else -%} +vectors = "{{ word_vectors }}" +{% endif -%} [system] {% if use_transformer -%} @@ -421,8 +426,4 @@ compound = 1.001 {% endif %} [initialize] -{% if use_transformer or optimize == "efficiency" or not word_vectors -%} vectors = ${paths.vectors} -{% else -%} -vectors = "{{ word_vectors }}" -{% endif -%} diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index ceb7357fc..86a72926e 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -68,12 +68,14 @@ seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} dropout = 0.1 accumulate_gradient = 1 -# Controls early-stopping. 0 disables early stopping. +# Controls early-stopping, i.e., the number of steps to continue without +# improvement before stopping. 0 disables early stopping. patience = 1600 # Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in # memory and shuffled within the training loop. -1 means stream train corpus # rather than loading in memory with no shuffling within the training loop. max_epochs = 0 +# Maximum number of update steps to train for. 0 means an unlimited number of steps. max_steps = 20000 eval_frequency = 200 # Control how scores are printed and checkpoints are evaluated. diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index d9418f675..25d530c83 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -181,11 +181,19 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: """Generate named entities in [{start: i, end: i, label: 'label'}] format. - doc (Doc): Document do parse. + doc (Doc): Document to parse. + options (Dict[str, Any]): NER-specific visualisation options. RETURNS (dict): Generated entities keyed by text (original text) and ents. 
""" + kb_url_template = options.get("kb_url_template", None) ents = [ - {"start": ent.start_char, "end": ent.end_char, "label": ent.label_} + { + "start": ent.start_char, + "end": ent.end_char, + "label": ent.label_, + "kb_id": ent.kb_id_ if ent.kb_id_ else "", + "kb_url": kb_url_template.format(ent.kb_id_) if kb_url_template else "#", + } for ent in doc.ents ] if not ents: diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 14d741a3d..a032d843b 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = { "LOC": "#ff9561", "PERSON": "#aa9cfc", "NORP": "#c887fb", - "FACILITY": "#9cc9cc", + "FAC": "#9cc9cc", "EVENT": "#ffeb80", "LAW": "#ff8197", "LANGUAGE": "#ff8197", diff --git a/spacy/errors.py b/spacy/errors.py index c5e364013..5399e489b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -483,7 +483,7 @@ class Errors(metaclass=ErrorsWithCodes): "components, since spans are only views of the Doc. Use Doc and " "Token attributes (or custom extension attributes) only and remove " "the following: {attrs}") - E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. " + E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. " "Only Doc and Token attributes are supported.") E182 = ("Received invalid attribute declaration: {attr}\nDid you forget " "to define the attribute? For example: `{attr}.???`") @@ -566,9 +566,6 @@ class Errors(metaclass=ErrorsWithCodes): E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to " "a list of spans, with each span represented by a tuple (start_char, end_char). " "The tuple can be optionally extended with a label and a KB ID.") - E880 = ("The 'wandb' library could not be found - did you install it? " - "Alternatively, specify the 'ConsoleLogger' in the 'training.logger' " - "config section, instead of the 'WandbLogger'.") E884 = ("The pipeline could not be initialized because the vectors " "could not be found at '{vectors}'. If your pipeline was already " "initialized/trained before, call 'resume_training' instead of 'initialize', " @@ -642,7 +639,7 @@ class Errors(metaclass=ErrorsWithCodes): E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found " "for mode '{mode}'. Required tables: {tables}. Found: {found}.") E913 = ("Corpus path can't be None. Maybe you forgot to define it in your " - "config.cfg or override it on the CLI?") + ".cfg file or override it on the CLI?") E914 = ("Executing {name} callback failed. Expected the function to " "return the nlp object but got: {value}. Maybe you forgot to return " "the modified object in your function?") @@ -888,8 +885,13 @@ class Errors(metaclass=ErrorsWithCodes): E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. " "Non-UD tags should use the `tag` property.") E1022 = ("Words must be of type str or int, but input is of type '{wtype}'") - E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.") - + E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't " + "exist.") + E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler " + "patterns.") + E1025 = ("Cannot intify the value '{value}' as an IOB string. 
The only " + "supported values are: 'I', 'O', 'B' and ''") + # Deprecated model shortcuts, only used in errors and warnings OLD_MODEL_SHORTCUTS = { diff --git a/spacy/glossary.py b/spacy/glossary.py index e45704fc5..57254330f 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -310,7 +310,6 @@ GLOSSARY = { "re": "repeated element", "rs": "reported speech", "sb": "subject", - "sb": "subject", "sbp": "passivized subject (PP)", "sp": "subject or predicate", "svp": "separable verb prefix", diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 9e5441a4f..b15bb3cf3 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF" _hangul_jamo = r"\u1100-\u11FF" _hangul = _hangul_syllables + _hangul_jamo +_hiragana = r"\u3040-\u309F" +_katakana = r"\u30A0-\u30FFー" +_kana = _hiragana + _katakana + # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh _latin_u_extendedA = ( r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" @@ -244,6 +248,7 @@ _uncased = ( + _tamil + _telugu + _hangul + + _kana + _cjk ) diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py index 988dbaba1..9d1fa93b8 100644 --- a/spacy/lang/es/lex_attrs.py +++ b/spacy/lang/es/lex_attrs.py @@ -47,6 +47,41 @@ _num_words = [ ] +_ordinal_words = [ + "primero", + "segundo", + "tercero", + "cuarto", + "quinto", + "sexto", + "séptimo", + "octavo", + "noveno", + "décimo", + "undécimo", + "duodécimo", + "decimotercero", + "decimocuarto", + "decimoquinto", + "decimosexto", + "decimoséptimo", + "decimoctavo", + "decimonoveno", + "vigésimo", + "trigésimo", + "cuadragésimo", + "quincuagésimo", + "sexagésimo", + "septuagésimo", + "octogésima", + "nonagésima", + "centésima", + "milésima", + "millonésima", + "billonésima", +] + + def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] @@ -57,7 +92,11 @@ def like_num(text): num, denom = text.split("/") if num.isdigit() and denom.isdigit(): return True - if text.lower() in _num_words: + text_lower = text.lower() + if text_lower in _num_words: + return True + # Check ordinal number + if text_lower in _ordinal_words: return True return False diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 86a834170..c3a0cf451 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -2,6 +2,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language, BaseDefaults @@ -11,6 +12,7 @@ class FinnishDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS class Finnish(Language): diff --git a/spacy/lang/fi/syntax_iterators.py b/spacy/lang/fi/syntax_iterators.py new file mode 100644 index 000000000..6b481e51f --- /dev/null +++ b/spacy/lang/fi/syntax_iterators.py @@ -0,0 +1,79 @@ +from typing import Iterator, Tuple, Union +from ...tokens import Doc, Span +from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """Detect base noun phrases from a dependency parse. 
Works on both Doc and Span.""" + labels = [ + "appos", + "nsubj", + "nsubj:cop", + "obj", + "obl", + "ROOT", + ] + extend_labels = [ + "amod", + "compound", + "compound:nn", + "flat:name", + "nmod", + "nmod:gobj", + "nmod:gsubj", + "nmod:poss", + "nummod", + ] + + def potential_np_head(word): + return word.pos in (NOUN, PROPN) and ( + word.dep in np_deps or word.head.pos == PRON + ) + + doc = doclike.doc # Ensure works on both Doc and Span. + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + + np_deps = [doc.vocab.strings[label] for label in labels] + extend_deps = [doc.vocab.strings[label] for label in extend_labels] + np_label = doc.vocab.strings.add("NP") + conj_label = doc.vocab.strings.add("conj") + + rbracket = 0 + prev_end = -1 + for i, word in enumerate(doclike): + if i < rbracket: + continue + + # Is this a potential independent NP head or coordinated with + # a NOUN that is itself an independent NP head? + # + # e.g. "Terveyden ja hyvinvoinnin laitos" + if potential_np_head(word) or ( + word.dep == conj_label and potential_np_head(word.head) + ): + # Try to extend to the left to include adjective/num + # modifiers, compound words etc. + lbracket = word.i + for ldep in word.lefts: + if ldep.dep in extend_deps: + lbracket = ldep.left_edge.i + break + + # Prevent nested chunks from being produced + if lbracket <= prev_end: + continue + + rbracket = word.i + # Try to extend the span to the right to capture + # appositions and noun modifiers + for rdep in word.rights: + if rdep.dep in extend_deps: + rbracket = rdep.i + prev_end = rbracket + + yield lbracket, rbracket + 1, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index d86662693..5849c40b3 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -6,16 +6,35 @@ from ...tokens import Doc, Span def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: - """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" - # fmt: off - labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] - # fmt: on + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "obl:arg", + "obl:mod", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"] doc = doclike.doc # Ensure works on both Doc and Span. 
if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) - np_deps = [doc.vocab.strings[label] for label in labels] - conj = doc.vocab.strings.add("conj") + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_label = doc.vocab.strings.add("det") + det_pos = doc.vocab.strings.add("DET") + adp_pos = doc.vocab.strings.add("ADP") + conj_label = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): @@ -24,16 +43,43 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i - yield word.left_edge.i, word.right_edge.i + 1, np_label - elif word.dep == conj: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep == det_label and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = left_index + 1 if word.left_edge.pos == adp_pos else left_index + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj_label: head = word.head - while head.dep == conj and head.head.i < head.i: + while head.dep == conj_label and head.head.i < head.i: head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i - yield word.left_edge.i, word.right_edge.i + 1, np_label + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py index a18c2e513..ee845e8b1 100644 --- a/spacy/lang/hi/lex_attrs.py +++ b/spacy/lang/hi/lex_attrs.py @@ -90,7 +90,7 @@ _eleven_to_beyond = [ "अड़सठ", "उनहत्तर", "सत्तर", - "इकहत्तर" + "इकहत्तर", "बहत्तर", "तिहत्तर", "चौहत्तर", diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 1edebc837..ecf322bd7 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ...language import Language, BaseDefaults from .lemmatizer import ItalianLemmatizer +from .syntax_iterators import SYNTAX_ITERATORS class ItalianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS - stop_words = STOP_WORDS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS class Italian(Language): diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py index 4178ed452..42adc7904 100644 --- a/spacy/lang/it/stop_words.py +++ b/spacy/lang/it/stop_words.py @@ -10,18 +10,18 @@ avresti avrete avrà avrò avuta avute avuti avuto basta bene benissimo 
brava bravo -casa caso cento certa certe certi certo che chi chicchessia chiunque ci +casa caso cento certa certe certi certo che chi chicchessia chiunque ci c' ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto cogli coi col colei coll coloro colui come cominci comunque con concernente conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui -da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli -dei del dell della delle dello dentro detto deve di dice dietro dire +d' da dagl dagli dai dal dall dall' dalla dalle dallo dappertutto davanti degl degli +dei del dell dell' della delle dello dentro detto deve di dice dietro dire dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due dunque durante -ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era -erano eravamo eravate eri ero esempio esse essendo esser essere essi ex +e ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era +erano eravamo eravate eri ero esempio esse essendo esser essere essi ex è fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero facessi facessimo faceste facesti faceva facevamo facevano facevate facevi @@ -30,21 +30,21 @@ fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra frattempo fu fui fummo fuori furono futuro generale -gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo +gia già giacche giorni giorno gli gl' gliela gliele glieli glielo gliene governo grande grazie gruppo ha haha hai hanno ho ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io -la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo +l' la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo -ma macche magari maggior mai male malgrado malissimo mancanza marche me +m' ma macche magari maggior mai male malgrado malissimo mancanza marche me medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto -nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun -nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre +nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun nessun' +nessuna nessuno nient' niente no noi non nondimeno nonostante nonsia nostra nostre nostri nostro novanta nove nulla nuovo od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto @@ -56,12 +56,12 @@ potrebbe preferibilmente presa press prima primo principalmente probabilmente proprio puo può pure purtroppo qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante -quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest +quanti quanto quantunque quasi quattro quel quel' quella quelle quelli quello quest quest' questa queste questi questo qui quindi realmente recente recentemente registrazione relativo riecco salvo -sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste +s' sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando @@ -72,12 +72,12 @@ steste stesti stette stettero stetti stia stiamo 
stiano stiate sto su sua subito successivamente successivo sue sugl sugli sui sul sull sulla sulle sullo suo suoi -tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta +t' tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto -uguali ulteriore ultimo un una uno uomo +uguali ulteriore ultimo un un' una uno uomo -va vale vari varia varie vario verso vi via vicino visto vita voi volta volte +v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte vostra vostre vostri vostro """.split() ) diff --git a/spacy/lang/it/syntax_iterators.py b/spacy/lang/it/syntax_iterators.py new file mode 100644 index 000000000..f63df3fad --- /dev/null +++ b/spacy/lang/it/syntax_iterators.py @@ -0,0 +1,86 @@ +from typing import Union, Iterator, Tuple + +from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors +from ...tokens import Doc, Span + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "fixed", "compound"] + dets = ["det", "det:poss"] + doc = doclike.doc # Ensure works on both Doc and Span. + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} + np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_labels = {doc.vocab.strings.add(det) for det in dets} + det_pos = doc.vocab.strings.add("DET") + adp_label = doc.vocab.strings.add("ADP") + conj = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue + if word.dep in np_deps: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep in det_labels and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_label else left_index + ) + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj: + head = word.head + while head.dep == conj and head.head.i < head.i: + head = head.head + # If the head is an NP, and we're coordinated to it, we're an NP + if head.dep in np_deps: + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 05fc67e79..63bc06665 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -1,5 +1,6 @@ from typing import 
Iterator, Any, Dict +from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from .lex_attrs import LEX_ATTRS @@ -31,15 +32,24 @@ def create_tokenizer(): class KoreanTokenizer(DummyTokenizer): def __init__(self, vocab: Vocab): self.vocab = vocab - MeCab = try_mecab_import() # type: ignore[func-returns-value] - self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") + self._mecab = try_mecab_import() # type: ignore[func-returns-value] + self._mecab_tokenizer = None + + @property + def mecab_tokenizer(self): + # This is a property so that initializing a pipeline with blank:ko is + # possible without actually requiring mecab-ko, e.g. to run + # `spacy init vectors ko` for a pipeline that will have a different + # tokenizer in the end. The languages need to match for the vectors + # to be imported and there's no way to pass a custom config to + # `init vectors`. + if self._mecab_tokenizer is None: + self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]") + return self._mecab_tokenizer def __reduce__(self): return KoreanTokenizer, (self.vocab,) - def __del__(self): - self.mecab_tokenizer.__del__() - def __call__(self, text: str) -> Doc: dtokens = list(self.detailed_tokens(text)) surfaces = [dt["surface"] for dt in dtokens] @@ -76,6 +86,7 @@ class KoreanDefaults(BaseDefaults): lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + infixes = TOKENIZER_INFIXES class Korean(Language): @@ -90,7 +101,8 @@ def try_mecab_import() -> None: return MeCab except ImportError: raise ImportError( - "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " + 'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires ' + "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " "and [natto-py](https://github.com/buruzaemon/natto-py)" ) from None diff --git a/spacy/lang/ko/punctuation.py b/spacy/lang/ko/punctuation.py new file mode 100644 index 000000000..7f7b40c5b --- /dev/null +++ b/spacy/lang/ko/punctuation.py @@ -0,0 +1,12 @@ +from ..char_classes import LIST_QUOTES +from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES + + +_infixes = ( + ["·", "ㆍ", "\(", "\)"] + + [r"(?<=[0-9])~(?=[0-9-])"] + + LIST_QUOTES + + BASE_TOKENIZER_INFIXES +) + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/nb/stop_words.py b/spacy/lang/nb/stop_words.py index fd65dd788..d9ed414ef 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av bak bare bedre beste blant ble bli blir blitt bris by både -da dag de del dem den denne der dermed det dette disse drept du +da dag de del dem den denne der dermed det dette disse du eller en enn er et ett etter -fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag +fem fikk fire fjor flere folk for fortsatt fra fram funnet få får fått før først første gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går -ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan -hvorfor +ha hadde ham han hans har hele helt henne hennes her hun i ifølge igjen ikke ingen inn ja jeg kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld -kvinner -la laget land landet langt leder ligger like litt løpet lørdag +la laget land landet langt leder ligger like litt løpet -man mandag mange mannen mars med meg 
mellom men mener menn mennesker mens mer -millioner minutter mot msci mye må mål måtte +man mange med meg mellom men mener mennesker mens mer mot mye må mål måtte -ned neste noe noen nok norge norsk norske ntb ny nye nå når +ned neste noe noen nok ny nye nå når -og også om onsdag opp opplyser oslo oss over +og også om opp opplyser oss over -personer plass poeng politidistrikt politiet president prosent på +personer plass poeng på -regjeringen runde rundt russland +runde rundt -sa saken samme sammen samtidig satt se seg seks selv senere september ser sett +sa saken samme sammen samtidig satt se seg seks selv senere ser sett siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor -store står sverige svært så søndag +store står svært så -ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror -tyskland +ta tatt tid tidligere til tilbake tillegg tok tror -under usa ut uten utenfor +under ut uten utenfor vant var ved veldig vi videre viktig vil ville viser vår være vært diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py index 7979c7ea6..90802cb9b 100644 --- a/spacy/lang/ru/lex_attrs.py +++ b/spacy/lang/ru/lex_attrs.py @@ -1,56 +1,119 @@ from ...attrs import LIKE_NUM -_num_words = [ - "ноль", - "один", - "два", - "три", - "четыре", - "пять", - "шесть", - "семь", - "восемь", - "девять", - "десять", - "одиннадцать", - "двенадцать", - "тринадцать", - "четырнадцать", - "пятнадцать", - "шестнадцать", - "семнадцать", - "восемнадцать", - "девятнадцать", - "двадцать", - "тридцать", - "сорок", - "пятьдесят", - "шестьдесят", - "семьдесят", - "восемьдесят", - "девяносто", - "сто", - "двести", - "триста", - "четыреста", - "пятьсот", - "шестьсот", - "семьсот", - "восемьсот", - "девятьсот", - "тысяча", - "миллион", - "миллиард", - "триллион", - "квадриллион", - "квинтиллион", -] +_num_words = list( + set( + """ +ноль ноля нолю нолём ноле нулевой нулевого нулевому нулевым нулевом нулевая нулевую нулевое нулевые нулевых нулевыми + +один первого первому единица одного одному первой первом первый первым одним одном во-первых + +два второго второму второй втором вторым двойка двумя двум двух во-вторых двое две двоих оба обе обеим обеими +обеих обоим обоими обоих + +полтора полторы полутора + +три третьего третьему третьем третьим третий тройка трешка трёшка трояк трёха треха тремя трем трех трое троих трёх + +четыре четвертого четвертому четвертом четвертый четвертым четверка четырьмя четырем четырех четверо четырёх четверым +четверых + +пять пятерочка пятерка пятого пятому пятом пятый пятым пятью пяти пятеро пятерых пятерыми + +шесть шестерка шестого шестому шестой шестом шестым шестью шести шестеро шестерых + +семь семерка седьмого седьмому седьмой седьмом седьмым семью семи семеро + +восемь восьмерка восьмого восьмому восемью восьмой восьмом восьмым восеми восьмером восьми восьмью + +девять девятого девятому девятка девятом девятый девятым девятью девяти девятером вдевятером девятерых + +десять десятого десятому десятка десятом десятый десятым десятью десяти десятером вдесятером + +одиннадцать одиннадцатого одиннадцатому одиннадцатом одиннадцатый одиннадцатым одиннадцатью одиннадцати + +двенадцать двенадцатого двенадцатому двенадцатом двенадцатый двенадцатым двенадцатью двенадцати + +тринадцать тринадцатого тринадцатому тринадцатом тринадцатый тринадцатым тринадцатью тринадцати + +четырнадцать четырнадцатого четырнадцатому четырнадцатом четырнадцатый четырнадцатым четырнадцатью четырнадцати + +пятнадцать пятнадцатого пятнадцатому пятнадцатом пятнадцатый 
пятнадцатым пятнадцатью пятнадцати + +шестнадцать шестнадцатого шестнадцатому шестнадцатом шестнадцатый шестнадцатым шестнадцатью шестнадцати + +семнадцать семнадцатого семнадцатому семнадцатом семнадцатый семнадцатым семнадцатью семнадцати + +восемнадцать восемнадцатого восемнадцатому восемнадцатом восемнадцатый восемнадцатым восемнадцатью восемнадцати + +девятнадцать девятнадцатого девятнадцатому девятнадцатом девятнадцатый девятнадцатым девятнадцатью девятнадцати + +двадцать двадцатого двадцатому двадцатом двадцатый двадцатым двадцатью двадцати + +тридцать тридцатого тридцатому тридцатом тридцатый тридцатым тридцатью тридцати + +тридевять + +сорок сорокового сороковому сороковом сороковым сороковой + +пятьдесят пятьдесятого пятьдесятому пятьюдесятью пятьдесятом пятьдесятый пятьдесятым пятидесяти полтинник + +шестьдесят шестьдесятого шестьдесятому шестьюдесятью шестьдесятом шестьдесятый шестьдесятым шестидесятые шестидесяти + +семьдесят семьдесятого семьдесятому семьюдесятью семьдесятом семьдесятый семьдесятым семидесяти + +восемьдесят восемьдесятого восемьдесятому восемьюдесятью восемьдесятом восемьдесятый восемьдесятым восемидесяти +восьмидесяти + +девяносто девяностого девяностому девяностом девяностый девяностым девяноста + +сто сотого сотому сотка сотня сотом сотен сотый сотым ста + +двести двумястами двухсотого двухсотому двухсотом двухсотый двухсотым двумстам двухстах двухсот + +триста тремястами трехсотого трехсотому трехсотом трехсотый трехсотым тремстам трехстах трехсот + +четыреста четырехсотого четырехсотому четырьмястами четырехсотом четырехсотый четырехсотым четыремстам четырехстах +четырехсот + +пятьсот пятисотого пятисотому пятьюстами пятисотом пятисотый пятисотым пятистам пятистах пятисот + +шестьсот шестисотого шестисотому шестьюстами шестисотом шестисотый шестисотым шестистам шестистах шестисот + +семьсот семисотого семисотому семьюстами семисотом семисотый семисотым семистам семистах семисот + +восемьсот восемисотого восемисотому восемисотом восемисотый восемисотым восьмистами восьмистам восьмистах восьмисот + +девятьсот девятисотого девятисотому девятьюстами девятисотом девятисотый девятисотым девятистам девятистах девятисот + +тысяча тысячного тысячному тысячном тысячный тысячным тысячам тысячах тысячей тысяч тысячи тыс + +миллион миллионного миллионов миллионному миллионном миллионный миллионным миллионом миллиона миллионе миллиону +миллионов лям млн + +миллиард миллиардного миллиардному миллиардном миллиардный миллиардным миллиардом миллиарда миллиарде миллиарду +миллиардов лярд млрд + +триллион триллионного триллионному триллионном триллионный триллионным триллионом триллиона триллионе триллиону +триллионов трлн + +квадриллион квадриллионного квадриллионному квадриллионный квадриллионным квадриллионом квадриллиона квадриллионе +квадриллиону квадриллионов квадрлн + +квинтиллион квинтиллионного квинтиллионному квинтиллионный квинтиллионным квинтиллионом квинтиллиона квинтиллионе +квинтиллиону квинтиллионов квинтлн + +i ii iii iv vi vii viii ix xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxvii xxix +""".split() + ) +) def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] + if text.endswith("%"): + text = text[:-1] text = text.replace(",", "").replace(".", "") if text.isdigit(): return True diff --git a/spacy/lang/ru/stop_words.py b/spacy/lang/ru/stop_words.py index 16cb55ef9..d6ea6b42a 100644 --- a/spacy/lang/ru/stop_words.py +++ b/spacy/lang/ru/stop_words.py @@ -1,52 +1,111 @@ STOP_WORDS = set( """ -а +а авось 
ага агу аж ай али алло ау ах ая -будем будет будете будешь буду будут будучи будь будьте бы был была были было -быть +б будем будет будете будешь буду будут будучи будь будьте бы был была были было +быть бац без безусловно бишь благо благодаря ближайшие близко более больше +будто бывает бывала бывали бываю бывают бытует в вам вами вас весь во вот все всё всего всей всем всём всеми всему всех всею -всея всю вся вы +всея всю вся вы ваш ваша ваше ваши вдали вдобавок вдруг ведь везде вернее +взаимно взаправду видно вишь включая вместо внакладе вначале вне вниз внизу +вновь вовсе возможно воистину вокруг вон вообще вопреки вперекор вплоть +вполне вправду вправе впрочем впрямь вресноту вроде вряд всегда всюду +всякий всякого всякой всячески вчеред -да для до +г го где гораздо гав -его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею +д да для до дабы давайте давно давным даже далее далеко дальше данная +данного данное данной данном данному данные данный данных дану данунах +даром де действительно довольно доколе доколь долго должен должна +должно должны должный дополнительно другая другие другим другими +других другое другой -же +е его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею едва +ежели еле -за +ж же -и из или им ими имъ их +з за затем зато зачем здесь значит зря + +и из или им ими имъ их ибо иль имеет имел имела имело именно иметь иначе +иногда иным иными итак ишь + +й к как кем ко когда кого ком кому комья которая которого которое которой котором -которому которою которую которые который которым которыми которых кто +которому которою которую которые который которым которыми которых кто ка кабы +каждая каждое каждые каждый кажется казалась казались казалось казался казаться +какая какие каким какими каков какого какой какому какою касательно кой коли +коль конечно короче кроме кстати ку куда -меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего +л ли либо лишь любая любого любое любой любом любую любыми любых + +м меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего моей моем моём моему моею можем может можете можешь мои мой моим моими моих -мочь мою моя мы +мочь мою моя мы мало меж между менее меньше мимо многие много многого многое +многом многому можно мол му -на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим +н на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим нашими наших нашу не него нее неё ней нем нём нему нет нею ним ними них но +наверняка наверху навряд навыворот над надо назад наиболее наизворот +наизнанку наипаче накануне наконец наоборот наперед наперекор наподобие +например напротив напрямую насилу настоящая настоящее настоящие настоящий +насчет нате находиться начала начале неважно негде недавно недалеко незачем +некем некогда некому некоторая некоторые некоторый некоторых некто некуда +нельзя немногие немногим немного необходимо необходимости необходимые +необходимым неоткуда непрерывно нередко несколько нету неужели нечего +нечем нечему нечто нешто нибудь нигде ниже низко никак никакой никем +никогда никого никому никто никуда ниоткуда нипочем ничего ничем ничему +ничто ну нужная нужно нужного нужные нужный нужных ныне нынешнее нынешней +нынешних нынче о об один одна одни одним одними одних одно одного одной одном одному одною -одну он она оне они оно от +одну он она оне они оно от оба общую обычно ого однажды однако ой около оный +оп опять особенно особо особую особые откуда отнелижа отнелиже отовсюду +отсюда оттого оттот оттуда 
отчего отчему ох очевидно очень ом -по при +п по при паче перед под подавно поди подобная подобно подобного подобные +подобный подобным подобных поелику пожалуй пожалуйста позже поистине +пока покамест поколе поколь покуда покудова помимо понеже поприще пор +пора посему поскольку после посреди посредством потом потому потомушта +похожем почему почти поэтому прежде притом причем про просто прочего +прочее прочему прочими проще прям пусть + +р ради разве ранее рано раньше рядом с сам сама сами самим самими самих само самого самом самому саму свое своё своего своей своем своём своему своею свои свой своим своими своих свою своя -себе себя собой собою +себе себя собой собою самая самое самой самый самых сверх свыше се сего сей +сейчас сие сих сквозь сколько скорее скоро следует слишком смогут сможет +сначала снова со собственно совсем сперва спокону спустя сразу среди сродни +стал стала стали стало стать суть сызнова -та так такая такие таким такими таких такого такое такой таком такому такою -такую те тебе тебя тем теми тех то тобой тобою того той только том томах тому -тот тою ту ты +та то ту ты ти так такая такие таким такими таких такого такое такой таком такому такою +такую те тебе тебя тем теми тех тобой тобою того той только том томах тому +тот тою также таки таков такова там твои твоим твоих твой твоя твоё +теперь тогда тоже тотчас точно туда тут тьфу тая -у уже +у уже увы уж ура ух ую -чего чем чём чему что чтобы +ф фу -эта эти этим этими этих это этого этой этом этому этот этою эту +х ха хе хорошо хотел хотела хотелось хотеть хоть хотя хочешь хочу хуже -я +ч чего чем чём чему что чтобы часто чаще чей через чтоб чуть чхать чьим +чьих чьё чё + +ш ша + +щ ща щас + +ы ых ые ый + +э эта эти этим этими этих это этого этой этом этому этот этою эту эдак эдакий +эй эка экий этак этакий эх + +ю + +я явно явных яко якобы якоже """.split() ) diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py index 1dc363fae..f3756e26c 100644 --- a/spacy/lang/ru/tokenizer_exceptions.py +++ b/spacy/lang/ru/tokenizer_exceptions.py @@ -2,7 +2,6 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, NORM from ...util import update_exc - _exc = {} _abbrev_exc = [ @@ -42,7 +41,6 @@ _abbrev_exc = [ {ORTH: "дек", NORM: "декабрь"}, ] - for abbrev_desc in _abbrev_exc: abbrev = abbrev_desc[ORTH] for orth in (abbrev, abbrev.capitalize(), abbrev.upper()): @@ -50,17 +48,354 @@ for abbrev_desc in _abbrev_exc: _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}] -_slang_exc = [ +for abbr in [ + # Year slang abbreviations {ORTH: "2к15", NORM: "2015"}, {ORTH: "2к16", NORM: "2016"}, {ORTH: "2к17", NORM: "2017"}, {ORTH: "2к18", NORM: "2018"}, {ORTH: "2к19", NORM: "2019"}, {ORTH: "2к20", NORM: "2020"}, -] + {ORTH: "2к21", NORM: "2021"}, + {ORTH: "2к22", NORM: "2022"}, + {ORTH: "2к23", NORM: "2023"}, + {ORTH: "2к24", NORM: "2024"}, + {ORTH: "2к25", NORM: "2025"}, +]: + _exc[abbr[ORTH]] = [abbr] -for slang_desc in _slang_exc: - _exc[slang_desc[ORTH]] = [slang_desc] +for abbr in [ + # Profession and academic titles abbreviations + {ORTH: "ак.", NORM: "академик"}, + {ORTH: "акад.", NORM: "академик"}, + {ORTH: "д-р архитектуры", NORM: "доктор архитектуры"}, + {ORTH: "д-р биол. наук", NORM: "доктор биологических наук"}, + {ORTH: "д-р ветеринар. наук", NORM: "доктор ветеринарных наук"}, + {ORTH: "д-р воен. наук", NORM: "доктор военных наук"}, + {ORTH: "д-р геогр. наук", NORM: "доктор географических наук"}, + {ORTH: "д-р геол.-минерал. 
наук", NORM: "доктор геолого-минералогических наук"}, + {ORTH: "д-р искусствоведения", NORM: "доктор искусствоведения"}, + {ORTH: "д-р ист. наук", NORM: "доктор исторических наук"}, + {ORTH: "д-р культурологии", NORM: "доктор культурологии"}, + {ORTH: "д-р мед. наук", NORM: "доктор медицинских наук"}, + {ORTH: "д-р пед. наук", NORM: "доктор педагогических наук"}, + {ORTH: "д-р полит. наук", NORM: "доктор политических наук"}, + {ORTH: "д-р психол. наук", NORM: "доктор психологических наук"}, + {ORTH: "д-р с.-х. наук", NORM: "доктор сельскохозяйственных наук"}, + {ORTH: "д-р социол. наук", NORM: "доктор социологических наук"}, + {ORTH: "д-р техн. наук", NORM: "доктор технических наук"}, + {ORTH: "д-р фармацевт. наук", NORM: "доктор фармацевтических наук"}, + {ORTH: "д-р физ.-мат. наук", NORM: "доктор физико-математических наук"}, + {ORTH: "д-р филол. наук", NORM: "доктор филологических наук"}, + {ORTH: "д-р филос. наук", NORM: "доктор философских наук"}, + {ORTH: "д-р хим. наук", NORM: "доктор химических наук"}, + {ORTH: "д-р экон. наук", NORM: "доктор экономических наук"}, + {ORTH: "д-р юрид. наук", NORM: "доктор юридических наук"}, + {ORTH: "д-р", NORM: "доктор"}, + {ORTH: "д.б.н.", NORM: "доктор биологических наук"}, + {ORTH: "д.г.-м.н.", NORM: "доктор геолого-минералогических наук"}, + {ORTH: "д.г.н.", NORM: "доктор географических наук"}, + {ORTH: "д.и.н.", NORM: "доктор исторических наук"}, + {ORTH: "д.иск.", NORM: "доктор искусствоведения"}, + {ORTH: "д.м.н.", NORM: "доктор медицинских наук"}, + {ORTH: "д.п.н.", NORM: "доктор психологических наук"}, + {ORTH: "д.пед.н.", NORM: "доктор педагогических наук"}, + {ORTH: "д.полит.н.", NORM: "доктор политических наук"}, + {ORTH: "д.с.-х.н.", NORM: "доктор сельскохозяйственных наук"}, + {ORTH: "д.социол.н.", NORM: "доктор социологических наук"}, + {ORTH: "д.т.н.", NORM: "доктор технических наук"}, + {ORTH: "д.т.н", NORM: "доктор технических наук"}, + {ORTH: "д.ф.-м.н.", NORM: "доктор физико-математических наук"}, + {ORTH: "д.ф.н.", NORM: "доктор филологических наук"}, + {ORTH: "д.филос.н.", NORM: "доктор философских наук"}, + {ORTH: "д.фил.н.", NORM: "доктор филологических наук"}, + {ORTH: "д.х.н.", NORM: "доктор химических наук"}, + {ORTH: "д.э.н.", NORM: "доктор экономических наук"}, + {ORTH: "д.э.н", NORM: "доктор экономических наук"}, + {ORTH: "д.ю.н.", NORM: "доктор юридических наук"}, + {ORTH: "доц.", NORM: "доцент"}, + {ORTH: "и.о.", NORM: "исполняющий обязанности"}, + {ORTH: "к.б.н.", NORM: "кандидат биологических наук"}, + {ORTH: "к.воен.н.", NORM: "кандидат военных наук"}, + {ORTH: "к.г.-м.н.", NORM: "кандидат геолого-минералогических наук"}, + {ORTH: "к.г.н.", NORM: "кандидат географических наук"}, + {ORTH: "к.геогр.н", NORM: "кандидат географических наук"}, + {ORTH: "к.геогр.наук", NORM: "кандидат географических наук"}, + {ORTH: "к.и.н.", NORM: "кандидат исторических наук"}, + {ORTH: "к.иск.", NORM: "кандидат искусствоведения"}, + {ORTH: "к.м.н.", NORM: "кандидат медицинских наук"}, + {ORTH: "к.п.н.", NORM: "кандидат психологических наук"}, + {ORTH: "к.псх.н.", NORM: "кандидат психологических наук"}, + {ORTH: "к.пед.н.", NORM: "кандидат педагогических наук"}, + {ORTH: "канд.пед.наук", NORM: "кандидат педагогических наук"}, + {ORTH: "к.полит.н.", NORM: "кандидат политических наук"}, + {ORTH: "к.с.-х.н.", NORM: "кандидат сельскохозяйственных наук"}, + {ORTH: "к.социол.н.", NORM: "кандидат социологических наук"}, + {ORTH: "к.с.н.", NORM: "кандидат социологических наук"}, + {ORTH: "к.т.н.", NORM: "кандидат технических наук"}, + {ORTH: 
"к.ф.-м.н.", NORM: "кандидат физико-математических наук"}, + {ORTH: "к.ф.н.", NORM: "кандидат филологических наук"}, + {ORTH: "к.фил.н.", NORM: "кандидат филологических наук"}, + {ORTH: "к.филол.н", NORM: "кандидат филологических наук"}, + {ORTH: "к.фарм.наук", NORM: "кандидат фармакологических наук"}, + {ORTH: "к.фарм.н.", NORM: "кандидат фармакологических наук"}, + {ORTH: "к.фарм.н", NORM: "кандидат фармакологических наук"}, + {ORTH: "к.филос.наук", NORM: "кандидат философских наук"}, + {ORTH: "к.филос.н.", NORM: "кандидат философских наук"}, + {ORTH: "к.филос.н", NORM: "кандидат философских наук"}, + {ORTH: "к.х.н.", NORM: "кандидат химических наук"}, + {ORTH: "к.х.н", NORM: "кандидат химических наук"}, + {ORTH: "к.э.н.", NORM: "кандидат экономических наук"}, + {ORTH: "к.э.н", NORM: "кандидат экономических наук"}, + {ORTH: "к.ю.н.", NORM: "кандидат юридических наук"}, + {ORTH: "к.ю.н", NORM: "кандидат юридических наук"}, + {ORTH: "канд. архитектуры", NORM: "кандидат архитектуры"}, + {ORTH: "канд. биол. наук", NORM: "кандидат биологических наук"}, + {ORTH: "канд. ветеринар. наук", NORM: "кандидат ветеринарных наук"}, + {ORTH: "канд. воен. наук", NORM: "кандидат военных наук"}, + {ORTH: "канд. геогр. наук", NORM: "кандидат географических наук"}, + {ORTH: "канд. геол.-минерал. наук", NORM: "кандидат геолого-минералогических наук"}, + {ORTH: "канд. искусствоведения", NORM: "кандидат искусствоведения"}, + {ORTH: "канд. ист. наук", NORM: "кандидат исторических наук"}, + {ORTH: "к.ист.н.", NORM: "кандидат исторических наук"}, + {ORTH: "канд. культурологии", NORM: "кандидат культурологии"}, + {ORTH: "канд. мед. наук", NORM: "кандидат медицинских наук"}, + {ORTH: "канд. пед. наук", NORM: "кандидат педагогических наук"}, + {ORTH: "канд. полит. наук", NORM: "кандидат политических наук"}, + {ORTH: "канд. психол. наук", NORM: "кандидат психологических наук"}, + {ORTH: "канд. с.-х. наук", NORM: "кандидат сельскохозяйственных наук"}, + {ORTH: "канд. социол. наук", NORM: "кандидат социологических наук"}, + {ORTH: "к.соц.наук", NORM: "кандидат социологических наук"}, + {ORTH: "к.соц.н.", NORM: "кандидат социологических наук"}, + {ORTH: "к.соц.н", NORM: "кандидат социологических наук"}, + {ORTH: "канд. техн. наук", NORM: "кандидат технических наук"}, + {ORTH: "канд. фармацевт. наук", NORM: "кандидат фармацевтических наук"}, + {ORTH: "канд. физ.-мат. наук", NORM: "кандидат физико-математических наук"}, + {ORTH: "канд. филол. наук", NORM: "кандидат филологических наук"}, + {ORTH: "канд. филос. наук", NORM: "кандидат философских наук"}, + {ORTH: "канд. хим. наук", NORM: "кандидат химических наук"}, + {ORTH: "канд. экон. наук", NORM: "кандидат экономических наук"}, + {ORTH: "канд. юрид. наук", NORM: "кандидат юридических наук"}, + {ORTH: "в.н.с.", NORM: "ведущий научный сотрудник"}, + {ORTH: "мл. науч. сотр.", NORM: "младший научный сотрудник"}, + {ORTH: "м.н.с.", NORM: "младший научный сотрудник"}, + {ORTH: "проф.", NORM: "профессор"}, + {ORTH: "профессор.кафедры", NORM: "профессор кафедры"}, + {ORTH: "ст. науч. сотр.", NORM: "старший научный сотрудник"}, + {ORTH: "чл.-к.", NORM: "член корреспондент"}, + {ORTH: "чл.-корр.", NORM: "член-корреспондент"}, + {ORTH: "чл.-кор.", NORM: "член-корреспондент"}, + {ORTH: "дир.", NORM: "директор"}, + {ORTH: "зам. дир.", NORM: "заместитель директора"}, + {ORTH: "зав. каф.", NORM: "заведующий кафедрой"}, + {ORTH: "зав.кафедрой", NORM: "заведующий кафедрой"}, + {ORTH: "зав. кафедрой", NORM: "заведующий кафедрой"}, + {ORTH: "асп.", NORM: "аспирант"}, + {ORTH: "гл. науч. 
сотр.", NORM: "главный научный сотрудник"}, + {ORTH: "вед. науч. сотр.", NORM: "ведущий научный сотрудник"}, + {ORTH: "науч. сотр.", NORM: "научный сотрудник"}, + {ORTH: "к.м.с.", NORM: "кандидат в мастера спорта"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Literary phrases abbreviations + {ORTH: "и т.д.", NORM: "и так далее"}, + {ORTH: "и т.п.", NORM: "и тому подобное"}, + {ORTH: "т.д.", NORM: "так далее"}, + {ORTH: "т.п.", NORM: "тому подобное"}, + {ORTH: "т.е.", NORM: "то есть"}, + {ORTH: "т.к.", NORM: "так как"}, + {ORTH: "в т.ч.", NORM: "в том числе"}, + {ORTH: "и пр.", NORM: "и прочие"}, + {ORTH: "и др.", NORM: "и другие"}, + {ORTH: "т.н.", NORM: "так называемый"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Appeal to a person abbreviations + {ORTH: "г-н", NORM: "господин"}, + {ORTH: "г-да", NORM: "господа"}, + {ORTH: "г-жа", NORM: "госпожа"}, + {ORTH: "тов.", NORM: "товарищ"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Time periods abbreviations + {ORTH: "до н.э.", NORM: "до нашей эры"}, + {ORTH: "по н.в.", NORM: "по настоящее время"}, + {ORTH: "в н.в.", NORM: "в настоящее время"}, + {ORTH: "наст.", NORM: "настоящий"}, + {ORTH: "наст. время", NORM: "настоящее время"}, + {ORTH: "г.г.", NORM: "годы"}, + {ORTH: "гг.", NORM: "годы"}, + {ORTH: "т.г.", NORM: "текущий год"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Address forming elements abbreviations + {ORTH: "респ.", NORM: "республика"}, + {ORTH: "обл.", NORM: "область"}, + {ORTH: "г.ф.з.", NORM: "город федерального значения"}, + {ORTH: "а.обл.", NORM: "автономная область"}, + {ORTH: "а.окр.", NORM: "автономный округ"}, + {ORTH: "м.р-н", NORM: "муниципальный район"}, + {ORTH: "г.о.", NORM: "городской округ"}, + {ORTH: "г.п.", NORM: "городское поселение"}, + {ORTH: "с.п.", NORM: "сельское поселение"}, + {ORTH: "вн.р-н", NORM: "внутригородской район"}, + {ORTH: "вн.тер.г.", NORM: "внутригородская территория города"}, + {ORTH: "пос.", NORM: "поселение"}, + {ORTH: "р-н", NORM: "район"}, + {ORTH: "с/с", NORM: "сельсовет"}, + {ORTH: "г.", NORM: "город"}, + {ORTH: "п.г.т.", NORM: "поселок городского типа"}, + {ORTH: "пгт.", NORM: "поселок городского типа"}, + {ORTH: "р.п.", NORM: "рабочий поселок"}, + {ORTH: "рп.", NORM: "рабочий поселок"}, + {ORTH: "кп.", NORM: "курортный поселок"}, + {ORTH: "гп.", NORM: "городской поселок"}, + {ORTH: "п.", NORM: "поселок"}, + {ORTH: "в-ки", NORM: "выселки"}, + {ORTH: "г-к", NORM: "городок"}, + {ORTH: "з-ка", NORM: "заимка"}, + {ORTH: "п-к", NORM: "починок"}, + {ORTH: "киш.", NORM: "кишлак"}, + {ORTH: "п. ст. ", NORM: "поселок станция"}, + {ORTH: "п. ж/д ст. ", NORM: "поселок при железнодорожной станции"}, + {ORTH: "ж/д бл-ст", NORM: "железнодорожный блокпост"}, + {ORTH: "ж/д б-ка", NORM: "железнодорожная будка"}, + {ORTH: "ж/д в-ка", NORM: "железнодорожная ветка"}, + {ORTH: "ж/д к-ма", NORM: "железнодорожная казарма"}, + {ORTH: "ж/д к-т", NORM: "железнодорожный комбинат"}, + {ORTH: "ж/д пл-ма", NORM: "железнодорожная платформа"}, + {ORTH: "ж/д пл-ка", NORM: "железнодорожная площадка"}, + {ORTH: "ж/д п.п.", NORM: "железнодорожный путевой пост"}, + {ORTH: "ж/д о.п.", NORM: "железнодорожный остановочный пункт"}, + {ORTH: "ж/д рзд.", NORM: "железнодорожный разъезд"}, + {ORTH: "ж/д ст. ", NORM: "железнодорожная станция"}, + {ORTH: "м-ко", NORM: "местечко"}, + {ORTH: "д.", NORM: "деревня"}, + {ORTH: "с.", NORM: "село"}, + {ORTH: "сл.", NORM: "слобода"}, + {ORTH: "ст. 
", NORM: "станция"}, + {ORTH: "ст-ца", NORM: "станица"}, + {ORTH: "у.", NORM: "улус"}, + {ORTH: "х.", NORM: "хутор"}, + {ORTH: "рзд.", NORM: "разъезд"}, + {ORTH: "зим.", NORM: "зимовье"}, + {ORTH: "б-г", NORM: "берег"}, + {ORTH: "ж/р", NORM: "жилой район"}, + {ORTH: "кв-л", NORM: "квартал"}, + {ORTH: "мкр.", NORM: "микрорайон"}, + {ORTH: "ост-в", NORM: "остров"}, + {ORTH: "платф.", NORM: "платформа"}, + {ORTH: "п/р", NORM: "промышленный район"}, + {ORTH: "р-н", NORM: "район"}, + {ORTH: "тер.", NORM: "территория"}, + { + ORTH: "тер. СНО", + NORM: "территория садоводческих некоммерческих объединений граждан", + }, + { + ORTH: "тер. ОНО", + NORM: "территория огороднических некоммерческих объединений граждан", + }, + {ORTH: "тер. ДНО", NORM: "территория дачных некоммерческих объединений граждан"}, + {ORTH: "тер. СНТ", NORM: "территория садоводческих некоммерческих товариществ"}, + {ORTH: "тер. ОНТ", NORM: "территория огороднических некоммерческих товариществ"}, + {ORTH: "тер. ДНТ", NORM: "территория дачных некоммерческих товариществ"}, + {ORTH: "тер. СПК", NORM: "территория садоводческих потребительских кооперативов"}, + {ORTH: "тер. ОПК", NORM: "территория огороднических потребительских кооперативов"}, + {ORTH: "тер. ДПК", NORM: "территория дачных потребительских кооперативов"}, + {ORTH: "тер. СНП", NORM: "территория садоводческих некоммерческих партнерств"}, + {ORTH: "тер. ОНП", NORM: "территория огороднических некоммерческих партнерств"}, + {ORTH: "тер. ДНП", NORM: "территория дачных некоммерческих партнерств"}, + {ORTH: "тер. ТСН", NORM: "территория товарищества собственников недвижимости"}, + {ORTH: "тер. ГСК", NORM: "территория гаражно-строительного кооператива"}, + {ORTH: "ус.", NORM: "усадьба"}, + {ORTH: "тер.ф.х.", NORM: "территория фермерского хозяйства"}, + {ORTH: "ю.", NORM: "юрты"}, + {ORTH: "ал.", NORM: "аллея"}, + {ORTH: "б-р", NORM: "бульвар"}, + {ORTH: "взв.", NORM: "взвоз"}, + {ORTH: "взд.", NORM: "въезд"}, + {ORTH: "дор.", NORM: "дорога"}, + {ORTH: "ззд.", NORM: "заезд"}, + {ORTH: "км", NORM: "километр"}, + {ORTH: "к-цо", NORM: "кольцо"}, + {ORTH: "лн.", NORM: "линия"}, + {ORTH: "мгстр.", NORM: "магистраль"}, + {ORTH: "наб.", NORM: "набережная"}, + {ORTH: "пер-д", NORM: "переезд"}, + {ORTH: "пер.", NORM: "переулок"}, + {ORTH: "пл-ка", NORM: "площадка"}, + {ORTH: "пл.", NORM: "площадь"}, + {ORTH: "пр-д", NORM: "проезд"}, + {ORTH: "пр-к", NORM: "просек"}, + {ORTH: "пр-ка", NORM: "просека"}, + {ORTH: "пр-лок", NORM: "проселок"}, + {ORTH: "пр-кт", NORM: "проспект"}, + {ORTH: "проул.", NORM: "проулок"}, + {ORTH: "рзд.", NORM: "разъезд"}, + {ORTH: "ряд", NORM: "ряд(ы)"}, + {ORTH: "с-р", NORM: "сквер"}, + {ORTH: "с-к", NORM: "спуск"}, + {ORTH: "сзд.", NORM: "съезд"}, + {ORTH: "туп.", NORM: "тупик"}, + {ORTH: "ул.", NORM: "улица"}, + {ORTH: "ш.", NORM: "шоссе"}, + {ORTH: "влд.", NORM: "владение"}, + {ORTH: "г-ж", NORM: "гараж"}, + {ORTH: "д.", NORM: "дом"}, + {ORTH: "двлд.", NORM: "домовладение"}, + {ORTH: "зд.", NORM: "здание"}, + {ORTH: "з/у", NORM: "земельный участок"}, + {ORTH: "кв.", NORM: "квартира"}, + {ORTH: "ком.", NORM: "комната"}, + {ORTH: "подв.", NORM: "подвал"}, + {ORTH: "кот.", NORM: "котельная"}, + {ORTH: "п-б", NORM: "погреб"}, + {ORTH: "к.", NORM: "корпус"}, + {ORTH: "ОНС", NORM: "объект незавершенного строительства"}, + {ORTH: "оф.", NORM: "офис"}, + {ORTH: "пав.", NORM: "павильон"}, + {ORTH: "помещ.", NORM: "помещение"}, + {ORTH: "раб.уч.", NORM: "рабочий участок"}, + {ORTH: "скл.", NORM: "склад"}, + {ORTH: "coop.", NORM: "сооружение"}, + {ORTH: "стр.", NORM: 
"строение"}, + {ORTH: "торг.зал", NORM: "торговый зал"}, + {ORTH: "а/п", NORM: "аэропорт"}, + {ORTH: "им.", NORM: "имени"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Others abbreviations + {ORTH: "тыс.руб.", NORM: "тысяч рублей"}, + {ORTH: "тыс.", NORM: "тысяч"}, + {ORTH: "руб.", NORM: "рубль"}, + {ORTH: "долл.", NORM: "доллар"}, + {ORTH: "прим.", NORM: "примечание"}, + {ORTH: "прим.ред.", NORM: "примечание редакции"}, + {ORTH: "см. также", NORM: "смотри также"}, + {ORTH: "кв.м.", NORM: "квадрантный метр"}, + {ORTH: "м2", NORM: "квадрантный метр"}, + {ORTH: "б/у", NORM: "бывший в употреблении"}, + {ORTH: "сокр.", NORM: "сокращение"}, + {ORTH: "чел.", NORM: "человек"}, + {ORTH: "б.п.", NORM: "базисный пункт"}, +]: + _exc[abbr[ORTH]] = [abbr] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index 6fb01a183..c9004ed5d 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,13 +1,10 @@ # Source: https://github.com/stopwords-iso/stopwords-sl -# TODO: probably needs to be tidied up – the list seems to have month names in -# it, which shouldn't be considered stop words. +# Removed various words that are not normally considered stop words, such as months. STOP_WORDS = set( """ a ali -april -avgust b bi bil @@ -19,7 +16,6 @@ biti blizu bo bodo -bojo bolj bom bomo @@ -37,16 +33,6 @@ da daleč dan danes -datum -december -deset -deseta -deseti -deseto -devet -deveta -deveti -deveto do dober dobra @@ -54,16 +40,7 @@ dobri dobro dokler dol -dolg -dolga -dolgi dovolj -drug -druga -drugi -drugo -dva -dve e eden en @@ -74,7 +51,6 @@ enkrat eno etc. f -februar g g. ga @@ -93,16 +69,12 @@ iv ix iz j -januar jaz je ji jih jim jo -julij -junij -jutri k kadarkoli kaj @@ -123,41 +95,23 @@ kje kjer kjerkoli ko -koder koderkoli koga komu kot -kratek -kratka -kratke -kratki l -lahka -lahke -lahki -lahko le lep lepa lepe lepi lepo -leto m -maj -majhen -majhna -majhni -malce -malo manj -marec me med medtem mene -mesec mi midva midve @@ -183,7 +137,6 @@ najmanj naju največ nam -narobe nas nato nazaj @@ -192,7 +145,6 @@ naša naše ne nedavno -nedelja nek neka nekaj @@ -236,7 +188,6 @@ njuna njuno no nocoj -november npr. o ob @@ -244,51 +195,23 @@ oba obe oboje od -odprt -odprta -odprti okoli -oktober on onadva one oni onidve -osem -osma -osmi -osmo oz. p pa -pet -peta -petek -peti -peto po pod pogosto poleg -poln -polna -polni -polno ponavadi -ponedeljek ponovno potem povsod -pozdravljen -pozdravljeni -prav -prava -prave -pravi -pravo -prazen -prazna -prazno prbl. precej pred @@ -297,19 +220,10 @@ preko pri pribl. približno -primer -pripravljen -pripravljena -pripravljeni proti -prva -prvi -prvo r -ravno redko res -reč s saj sam @@ -321,29 +235,17 @@ se sebe sebi sedaj -sedem -sedma -sedmi -sedmo sem -september seveda si sicer skoraj skozi -slab smo so -sobota spet -sreda -srednja -srednji sta ste -stran -stvar sva t ta @@ -358,10 +260,6 @@ te tebe tebi tega -težak -težka -težki -težko ti tista tiste @@ -371,11 +269,6 @@ tj. 
tja to toda -torek -tretja -tretje -tretji -tri tu tudi tukaj @@ -392,10 +285,6 @@ vaša vaše ve vedno -velik -velika -veliki -veliko vendar ves več @@ -403,10 +292,6 @@ vi vidva vii viii -visok -visoka -visoke -visoki vsa vsaj vsak @@ -420,34 +305,21 @@ vsega vsi vso včasih -včeraj x z za zadaj zadnji zakaj -zaprta -zaprti -zaprto zdaj zelo zunaj č če često -četrta -četrtek -četrti -četrto čez čigav š -šest -šesta -šesti -šesto -štiri ž že """.split() diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py index 94016fd52..7e168a27c 100644 --- a/spacy/lang/uk/tokenizer_exceptions.py +++ b/spacy/lang/uk/tokenizer_exceptions.py @@ -6,19 +6,30 @@ from ...util import update_exc _exc = {} for exc_data in [ + {ORTH: "обл.", NORM: "область"}, + {ORTH: "р-н.", NORM: "район"}, + {ORTH: "р-н", NORM: "район"}, + {ORTH: "м.", NORM: "місто"}, {ORTH: "вул.", NORM: "вулиця"}, - {ORTH: "ім.", NORM: "імені"}, {ORTH: "просп.", NORM: "проспект"}, + {ORTH: "пр-кт", NORM: "проспект"}, {ORTH: "бул.", NORM: "бульвар"}, {ORTH: "пров.", NORM: "провулок"}, {ORTH: "пл.", NORM: "площа"}, + {ORTH: "майд.", NORM: "майдан"}, + {ORTH: "мкр.", NORM: "мікрорайон"}, + {ORTH: "ст.", NORM: "станція"}, + {ORTH: "ж/м", NORM: "житловий масив"}, + {ORTH: "наб.", NORM: "набережна"}, + {ORTH: "в/ч", NORM: "військова частина"}, + {ORTH: "в/м", NORM: "військове містечко"}, + {ORTH: "оз.", NORM: "озеро"}, + {ORTH: "ім.", NORM: "імені"}, {ORTH: "г.", NORM: "гора"}, {ORTH: "п.", NORM: "пан"}, - {ORTH: "м.", NORM: "місто"}, {ORTH: "проф.", NORM: "професор"}, {ORTH: "акад.", NORM: "академік"}, {ORTH: "доц.", NORM: "доцент"}, - {ORTH: "оз.", NORM: "озеро"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py index 8d63c3c20..34570d747 100644 --- a/spacy/lang/xx/examples.py +++ b/spacy/lang/xx/examples.py @@ -59,7 +59,7 @@ sentences = [ "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?", "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.", "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.", - "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.." + "Carros autônomos empurram a responsabilidade do seguro para os fabricantes..", "São Francisco considera banir os robôs de entrega que andam pelas calçadas.", "Londres é a maior cidade do Reino Unido.", # Translations from English: diff --git a/spacy/language.py b/spacy/language.py index 204b24ecb..bab403f0e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -131,7 +131,7 @@ class Language: self, vocab: Union[Vocab, bool] = True, *, - max_length: int = 10 ** 6, + max_length: int = 10**6, meta: Dict[str, Any] = {}, create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, batch_size: int = 1000, @@ -354,12 +354,15 @@ class Language: @property def pipe_labels(self) -> Dict[str, List[str]]: """Get the labels set by the pipeline components, if available (if - the component exposes a labels property). + the component exposes a labels property and the labels are not + hidden). RETURNS (Dict[str, List[str]]): Labels keyed by component name. 
""" labels = {} for name, pipe in self._components: + if hasattr(pipe, "hide_labels") and pipe.hide_labels is True: + continue if hasattr(pipe, "labels"): labels[name] = list(pipe.labels) return SimpleFrozenDict(labels) @@ -522,7 +525,7 @@ class Language: requires: Iterable[str] = SimpleFrozenList(), retokenizes: bool = False, func: Optional["Pipe"] = None, - ) -> Callable: + ) -> Callable[..., Any]: """Register a new pipeline component. Can be used for stateless function components that don't require a separate factory. Can be used as a decorator on a function or classmethod, or called as a function with the @@ -1219,8 +1222,9 @@ class Language: component_cfg = {} grads = {} - def get_grads(W, dW, key=None): + def get_grads(key, W, dW): grads[key] = (W, dW) + return W, dW get_grads.learn_rate = sgd.learn_rate # type: ignore[attr-defined, union-attr] get_grads.b1 = sgd.b1 # type: ignore[attr-defined, union-attr] @@ -1233,7 +1237,7 @@ class Language: examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {}) ) for key, (W, dW) in grads.items(): - sgd(W, dW, key=key) # type: ignore[call-arg, misc] + sgd(key, W, dW) # type: ignore[call-arg, misc] return losses def begin_training( @@ -1285,9 +1289,9 @@ class Language: ) except IOError: raise IOError(Errors.E884.format(vectors=I["vectors"])) - if self.vocab.vectors.data.shape[1] >= 1: + if self.vocab.vectors.shape[1] >= 1: ops = get_current_ops() - self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) + self.vocab.vectors.to_ops(ops) if hasattr(self.tokenizer, "initialize"): tok_settings = validate_init_settings( self.tokenizer.initialize, # type: ignore[union-attr] @@ -1332,8 +1336,8 @@ class Language: DOCS: https://spacy.io/api/language#resume_training """ ops = get_current_ops() - if self.vocab.vectors.data.shape[1] >= 1: - self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) + if self.vocab.vectors.shape[1] >= 1: + self.vocab.vectors.to_ops(ops) for name, proc in self.pipeline: if hasattr(proc, "_rehearsal_model"): proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined] @@ -1404,20 +1408,13 @@ class Language: for eg in examples: self.make_doc(eg.reference.text) # apply all pipeline components - for name, pipe in self.pipeline: - kwargs = component_cfg.get(name, {}) - kwargs.setdefault("batch_size", batch_size) - for doc, eg in zip( - _pipe( - (eg.predicted for eg in examples), - proc=pipe, - name=name, - default_error_handler=self.default_error_handler, - kwargs=kwargs, - ), - examples, - ): - eg.predicted = doc + docs = self.pipe( + (eg.predicted for eg in examples), + batch_size=batch_size, + component_cfg=component_cfg, + ) + for eg, doc in zip(examples, docs): + eg.predicted = doc end_time = timer() results = scorer.score(examples) n_words = sum(len(eg.predicted) for eg in examples) diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi index 4eae6be43..4fcaa82cf 100644 --- a/spacy/lexeme.pyi +++ b/spacy/lexeme.pyi @@ -19,7 +19,7 @@ class Lexeme: @property def vector_norm(self) -> float: ... vector: Floats1d - rank: str + rank: int sentiment: float @property def orth_(self) -> str: ... 
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 792e405dd..6c66effde 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -130,8 +130,10 @@ cdef class Lexeme: return 0.0 vector = self.vector xp = get_array_module(vector) - return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) - + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() + @property def has_vector(self): """RETURNS (bool): Whether a word vector is associated with the object. diff --git a/spacy/matcher/dependencymatcher.pyi b/spacy/matcher/dependencymatcher.pyi new file mode 100644 index 000000000..c19d3a71c --- /dev/null +++ b/spacy/matcher/dependencymatcher.pyi @@ -0,0 +1,66 @@ +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from .matcher import Matcher +from ..vocab import Vocab +from ..tokens.doc import Doc +from ..tokens.span import Span + +class DependencyMatcher: + """Match dependency parse tree based on pattern rules.""" + + _patterns: Dict[str, List[Any]] + _raw_patterns: Dict[str, List[Any]] + _tokens_to_key: Dict[str, List[Any]] + _root: Dict[str, List[Any]] + _tree: Dict[str, List[Any]] + _callbacks: Dict[ + Any, Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] + ] + _ops: Dict[str, Any] + vocab: Vocab + _matcher: Matcher + def __init__(self, vocab: Vocab, *, validate: bool = ...) -> None: ... + def __reduce__( + self, + ) -> Tuple[ + Callable[ + [Vocab, Dict[str, Any], Dict[str, Callable[..., Any]]], DependencyMatcher + ], + Tuple[ + Vocab, + Dict[str, List[Any]], + Dict[ + str, + Callable[ + [DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any + ], + ], + ], + None, + None, + ]: ... + def __len__(self) -> int: ... + def __contains__(self, key: Union[str, int]) -> bool: ... + def add( + self, + key: Union[str, int], + patterns: List[List[Dict[str, Any]]], + *, + on_match: Optional[ + Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] + ] = ... + ) -> None: ... + def has_key(self, key: Union[str, int]) -> bool: ... + def get( + self, key: Union[str, int], default: Optional[Any] = ... + ) -> Tuple[ + Optional[ + Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] + ], + List[List[Dict[str, Any]]], + ]: ... + def remove(self, key: Union[str, int]) -> None: ... + def __call__(self, doclike: Union[Doc, Span]) -> List[Tuple[int, List[int]]]: ... + +def unpickle_matcher( + vocab: Vocab, patterns: Dict[str, Any], callbacks: Dict[str, Callable[..., Any]] +) -> DependencyMatcher: ... diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index ec4a88eaf..390629ff8 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -1,4 +1,6 @@ -from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Iterator, Iterable +from typing import Any, List, Dict, Tuple, Optional, Callable, Union +from typing import Iterator, Iterable, overload +from ..compat import Literal from ..vocab import Vocab from ..tokens import Doc, Span @@ -31,12 +33,22 @@ class Matcher: ) -> Union[ Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc] ]: ... + @overload def __call__( self, doclike: Union[Doc, Span], *, - as_spans: bool = ..., + as_spans: Literal[False] = ..., allow_missing: bool = ..., with_alignments: bool = ... - ) -> Union[List[Tuple[int, int, int]], List[Span]]: ... + ) -> List[Tuple[int, int, int]]: ... 
+ @overload + def __call__( + self, + doclike: Union[Doc, Span], + *, + as_spans: Literal[True], + allow_missing: bool = ..., + with_alignments: bool = ... + ) -> List[Span]: ... def _normalize_key(self, key: Any) -> Any: ... diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 745d7cf43..6aa58f0e3 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.span cimport Span from ..tokens.token cimport Token from ..tokens.morphanalysis cimport MorphAnalysis -from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH +from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings @@ -798,7 +798,10 @@ def _get_attr_values(spec, string_store): attr = "SENT_START" attr = IDS.get(attr) if isinstance(value, str): - value = string_store.add(value) + if attr == ENT_IOB and value in Token.iob_strings(): + value = Token.iob_strings().index(value) + else: + value = string_store.add(value) elif isinstance(value, bool): value = int(value) elif isinstance(value, int): diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi index d73633ec0..68e3386e4 100644 --- a/spacy/matcher/phrasematcher.pyi +++ b/spacy/matcher/phrasematcher.pyi @@ -1,6 +1,6 @@ -from typing import List, Tuple, Union, Optional, Callable, Any, Dict - -from . import Matcher +from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload +from ..compat import Literal +from .matcher import Matcher from ..vocab import Vocab from ..tokens import Doc, Span @@ -8,18 +8,30 @@ class PhraseMatcher: def __init__( self, vocab: Vocab, attr: Optional[Union[int, str]], validate: bool = ... ) -> None: ... - def __call__( - self, - doclike: Union[Doc, Span], - *, - as_spans: bool = ..., - ) -> Union[List[Tuple[int, int, int]], List[Span]]: ... + def __reduce__(self) -> Any: ... + def __len__(self) -> int: ... + def __contains__(self, key: str) -> bool: ... def add( self, key: str, - docs: List[List[Dict[str, Any]]], + docs: List[Doc], *, on_match: Optional[ Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any] ] = ..., ) -> None: ... + def remove(self, key: str) -> None: ... + @overload + def __call__( + self, + doclike: Union[Doc, Span], + *, + as_spans: Literal[False] = ..., + ) -> List[Tuple[int, int, int]]: ... + @overload + def __call__( + self, + doclike: Union[Doc, Span], + *, + as_spans: Literal[True], + ) -> List[Span]: ... 
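The `Literal`-typed overloads added to `Matcher.__call__` and `PhraseMatcher.__call__` above let a static type checker pick the return type from the `as_spans` argument. An illustrative sketch, not part of the patch; the "HELLO" pattern name is made up, and the inferred types assume a checker such as mypy is run over the code with these stubs:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO", [[{"LOWER": "hello"}]])
doc = nlp("hello world")

matches = matcher(doc)               # checked as List[Tuple[int, int, int]]
spans = matcher(doc, as_spans=True)  # checked as List[Span]
for span in spans:
    print(span.text, span.label_)
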
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 37473b7f4..a7d67c6dd 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -23,7 +23,7 @@ def create_pretrain_vectors( maxout_pieces: int, hidden_size: int, loss: str ) -> Callable[["Vocab", Model], Model]: def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model: - if vocab.vectors.data.shape[1] == 0: + if vocab.vectors.shape[1] == 0: raise ValueError(Errors.E875) model = build_cloze_multi_task_model( vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces @@ -85,7 +85,7 @@ def get_characters_loss(ops, docs, prediction, nr_char): target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f") target = target.reshape((-1, 256 * nr_char)) diff = prediction - target - loss = (diff ** 2).sum() + loss = (diff**2).sum() d_target = diff / float(prediction.shape[0]) return loss, d_target @@ -116,7 +116,7 @@ def build_multi_task_model( def build_cloze_multi_task_model( vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int ) -> Model: - nO = vocab.vectors.data.shape[1] + nO = vocab.vectors.shape[1] output_layer = chain( cast(Model[List["Floats2d"], Floats2d], list2array()), Maxout( diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 44ab50e85..ecdf6be27 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -123,7 +123,7 @@ def MultiHashEmbed( attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into account some subword information, without constructing a fully character-based representation. If pretrained vectors are available, they can be included in - the representation as well, with the vectors table will be kept static + the representation as well, with the vectors table kept static (i.e. it's not updated). 
The `width` parameter specifies the output width of the layer and the widths diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 8dd65833b..8d9b1af9b 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -94,7 +94,7 @@ def init( nM = model.get_dim("nM") if model.has_dim("nM") else None nO = model.get_dim("nO") if model.has_dim("nO") else None if X is not None and len(X): - nM = X[0].vocab.vectors.data.shape[1] + nM = X[0].vocab.vectors.shape[1] if Y is not None: nO = Y.data.shape[1] diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 161f3ca48..a1262bb61 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -1,7 +1,9 @@ +from cython.operator cimport dereference as deref, preincrement as incr from libc.string cimport memcpy, memset from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t cimport libcpp +from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from libcpp.set cimport set from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno @@ -29,8 +31,8 @@ cdef cppclass StateC: vector[int] _stack vector[int] _rebuffer vector[SpanC] _ents - vector[ArcC] _left_arcs - vector[ArcC] _right_arcs + unordered_map[int, vector[ArcC]] _left_arcs + unordered_map[int, vector[ArcC]] _right_arcs vector[libcpp.bool] _unshiftable set[int] _sent_starts TokenC _empty_token @@ -159,15 +161,22 @@ cdef cppclass StateC: else: return &this._sent[i] - void get_arcs(vector[ArcC]* arcs) nogil const: - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) - if arc.head != -1 and arc.child != -1: - arcs.push_back(arc) - for i in range(this._right_arcs.size()): - arc = this._right_arcs.at(i) - if arc.head != -1 and arc.child != -1: - arcs.push_back(arc) + void map_get_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, vector[ArcC]* out) nogil const: + cdef const vector[ArcC]* arcs + head_arcs_it = heads_arcs.const_begin() + while head_arcs_it != heads_arcs.const_end(): + arcs = &deref(head_arcs_it).second + arcs_it = arcs.const_begin() + while arcs_it != arcs.const_end(): + arc = deref(arcs_it) + if arc.head != -1 and arc.child != -1: + out.push_back(arc) + incr(arcs_it) + incr(head_arcs_it) + + void get_arcs(vector[ArcC]* out) nogil const: + this.map_get_arcs(this._left_arcs, out) + this.map_get_arcs(this._right_arcs, out) int H(int child) nogil const: if child >= this.length or child < 0: @@ -181,33 +190,35 @@ cdef cppclass StateC: else: return this._ents.back().start + int nth_child(const unordered_map[int, vector[ArcC]]& heads_arcs, int head, int idx) nogil const: + if idx < 1: + return -1 + + head_arcs_it = heads_arcs.const_find(head) + if head_arcs_it == heads_arcs.const_end(): + return -1 + + cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second + + # Work backwards through arcs to find the arc at the + # requested index more quickly. 
+ cdef size_t child_index = 0 + arcs_it = arcs.const_rbegin() + while arcs_it != arcs.const_rend() and child_index != idx: + arc = deref(arcs_it) + if arc.child != -1: + child_index += 1 + if child_index == idx: + return arc.child + incr(arcs_it) + + return -1 + int L(int head, int idx) nogil const: - if idx < 1 or this._left_arcs.size() == 0: - return -1 - cdef vector[int] lefts - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) - if arc.head == head and arc.child != -1 and arc.child < head: - lefts.push_back(arc.child) - idx = (lefts.size()) - idx - if idx < 0: - return -1 - else: - return lefts.at(idx) + return this.nth_child(this._left_arcs, head, idx) int R(int head, int idx) nogil const: - if idx < 1 or this._right_arcs.size() == 0: - return -1 - cdef vector[int] rights - for i in range(this._right_arcs.size()): - arc = this._right_arcs.at(i) - if arc.head == head and arc.child != -1 and arc.child > head: - rights.push_back(arc.child) - idx = (rights.size()) - idx - if idx < 0: - return -1 - else: - return rights.at(idx) + return this.nth_child(this._right_arcs, head, idx) bint empty() nogil const: return this._stack.size() == 0 @@ -248,22 +259,29 @@ cdef cppclass StateC: int r_edge(int word) nogil const: return word - - int n_L(int head) nogil const: + + int n_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, int head) nogil const: cdef int n = 0 - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) - if arc.head == head and arc.child != -1 and arc.child < arc.head: + head_arcs_it = heads_arcs.const_find(head) + if head_arcs_it == heads_arcs.const_end(): + return n + + cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second + arcs_it = arcs.const_begin() + while arcs_it != arcs.end(): + arc = deref(arcs_it) + if arc.child != -1: n += 1 + incr(arcs_it) + return n + + int n_L(int head) nogil const: + return n_arcs(this._left_arcs, head) + int n_R(int head) nogil const: - cdef int n = 0 - for i in range(this._right_arcs.size()): - arc = this._right_arcs.at(i) - if arc.head == head and arc.child != -1 and arc.child > arc.head: - n += 1 - return n + return n_arcs(this._right_arcs, head) bint stack_is_connected() nogil const: return False @@ -323,19 +341,20 @@ cdef cppclass StateC: arc.child = child arc.label = label if head > child: - this._left_arcs.push_back(arc) + this._left_arcs[arc.head].push_back(arc) else: - this._right_arcs.push_back(arc) + this._right_arcs[arc.head].push_back(arc) this._heads[child] = head - void del_arc(int h_i, int c_i) nogil: - cdef vector[ArcC]* arcs - if h_i > c_i: - arcs = &this._left_arcs - else: - arcs = &this._right_arcs + void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil: + arcs_it = heads_arcs.find(h_i) + if arcs_it == heads_arcs.end(): + return + + arcs = &deref(arcs_it).second if arcs.size() == 0: return + arc = arcs.back() if arc.head == h_i and arc.child == c_i: arcs.pop_back() @@ -348,6 +367,12 @@ cdef cppclass StateC: arc.label = 0 break + void del_arc(int h_i, int c_i) nogil: + if h_i > c_i: + this.map_del_arc(&this._left_arcs, h_i, c_i) + else: + this.map_del_arc(&this._right_arcs, h_i, c_i) + SpanC get_ent() nogil const: cdef SpanC ent if this._ents.size() == 0: diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index ddcc911c8..029e2e29e 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -604,7 +604,7 @@ cdef class 
ArcEager(TransitionSystem): actions[SHIFT][''] += 1 if min_freq is not None: for action, label_freqs in actions.items(): - for label, freq in list(label_freqs.items()): + for label, freq in label_freqs.copy().items(): if freq < min_freq: label_freqs.pop(label) # Ensure these actions are present diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx index 82070cd27..36163fcc3 100644 --- a/spacy/pipeline/_parser_internals/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -4,6 +4,10 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. """ from copy import copy +from libc.limits cimport INT_MAX +from libc.stdlib cimport abs +from libcpp cimport bool +from libcpp.vector cimport vector from ...tokens.doc cimport Doc, set_children_from_heads @@ -41,13 +45,18 @@ def contains_cycle(heads): def is_nonproj_arc(tokenid, heads): + cdef vector[int] c_heads = _heads_to_c(heads) + return _is_nonproj_arc(tokenid, c_heads) + + +cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil: # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective # if there is a token k, h < k < d such that h is not # an ancestor of k. Same for h -> d, h > d head = heads[tokenid] if head == tokenid: # root arcs cannot be non-projective return False - elif head is None: # unattached tokens cannot be non-projective + elif head < 0: # unattached tokens cannot be non-projective return False cdef int start, end @@ -56,19 +65,29 @@ def is_nonproj_arc(tokenid, heads): else: start, end = (tokenid+1, head) for k in range(start, end): - for ancestor in ancestors(k, heads): - if ancestor is None: # for unattached tokens/subtrees - break - elif ancestor == head: # normal case: k dominated by h - break + if _has_head_as_ancestor(k, head, heads): + continue else: # head not in ancestors: d -> h is non-projective return True return False +cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil: + ancestor = tokenid + cnt = 0 + while cnt < heads.size(): + if heads[ancestor] == head or heads[ancestor] < 0: + return True + ancestor = heads[ancestor] + cnt += 1 + + return False + + def is_nonproj_tree(heads): + cdef vector[int] c_heads = _heads_to_c(heads) # a tree is non-projective if at least one arc is non-projective - return any(is_nonproj_arc(word, heads) for word in range(len(heads))) + return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads))) def decompose(label): @@ -98,16 +117,31 @@ def projectivize(heads, labels): # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels) # which encode a projective and decorated tree. 
proj_heads = copy(heads) - smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) - if smallest_np_arc is None: # this sentence is already projective + + cdef int new_head + cdef vector[int] c_proj_heads = _heads_to_c(proj_heads) + cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads) + if smallest_np_arc == -1: # this sentence is already projective return proj_heads, copy(labels) - while smallest_np_arc is not None: - _lift(smallest_np_arc, proj_heads) - smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) + while smallest_np_arc != -1: + new_head = _lift(smallest_np_arc, proj_heads) + c_proj_heads[smallest_np_arc] = new_head + smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads) deco_labels = _decorate(heads, proj_heads, labels) return proj_heads, deco_labels +cdef vector[int] _heads_to_c(heads): + cdef vector[int] c_heads; + for head in heads: + if head == None: + c_heads.push_back(-1) + else: + assert head < len(heads) + c_heads.push_back(head) + return c_heads + + cpdef deprojectivize(Doc doc): # Reattach arcs with decorated labels (following HEAD scheme). For each # decorated arc X||Y, search top-down, left-to-right, breadth-first until @@ -137,27 +171,38 @@ def _decorate(heads, proj_heads, labels): deco_labels.append(labels[tokenid]) return deco_labels +def get_smallest_nonproj_arc_slow(heads): + cdef vector[int] c_heads = _heads_to_c(heads) + return _get_smallest_nonproj_arc(c_heads) -def _get_smallest_nonproj_arc(heads): + +cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil: # return the smallest non-proj arc or None # where size is defined as the distance between dep and head # and ties are broken left to right - smallest_size = float('inf') - smallest_np_arc = None - for tokenid, head in enumerate(heads): + cdef int smallest_size = INT_MAX + cdef int smallest_np_arc = -1 + cdef int size + cdef int tokenid + cdef int head + + for tokenid in range(heads.size()): + head = heads[tokenid] size = abs(tokenid-head) - if size < smallest_size and is_nonproj_arc(tokenid, heads): + if size < smallest_size and _is_nonproj_arc(tokenid, heads): smallest_size = size smallest_np_arc = tokenid return smallest_np_arc -def _lift(tokenid, heads): +cpdef int _lift(tokenid, heads): # reattaches a word to it's grandfather head = heads[tokenid] ghead = heads[head] + cdef int new_head = ghead if head != ghead else tokenid # attach to ghead if head isn't attached to root else attach to root - heads[tokenid] = ghead if head != ghead else tokenid + heads[tokenid] = new_head + return new_head def _find_new_head(token, headlabel): diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 78d7a0be2..614d71f41 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -348,6 +348,46 @@ class EntityRuler(Pipe): self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate ) + def remove(self, ent_id: str) -> None: + """Remove a pattern by its ent_id if a pattern with this ent_id was added before + + ent_id (str): id of the pattern to be removed + RETURNS: None + DOCS: https://spacy.io/api/entityruler#remove + """ + label_id_pairs = [ + (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id + ] + if not label_id_pairs: + raise ValueError(Errors.E1024.format(ent_id=ent_id)) + created_labels = [ + self._create_label(label, eid) for (label, eid) in label_id_pairs + ] + # remove the patterns from self.phrase_patterns + self.phrase_patterns = defaultdict( + list, + { + label: val + for (label, val) in 
self.phrase_patterns.items() + if label not in created_labels + }, + ) + # remove the patterns from self.token_pattern + self.token_patterns = defaultdict( + list, + { + label: val + for (label, val) in self.token_patterns.items() + if label not in created_labels + }, + ) + # remove the patterns from self.token_pattern + for label in created_labels: + if label in self.phrase_matcher: + self.phrase_matcher.remove(label) + else: + self.matcher.remove(label) + def _require_patterns(self) -> None: """Raise a warning if this component has no patterns defined.""" if len(self) == 0: diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index db425b69a..73d3799b1 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -231,12 +231,13 @@ class Morphologizer(Tagger): cdef Vocab vocab = self.vocab cdef bint overwrite = self.cfg["overwrite"] cdef bint extend = self.cfg["extend"] + labels = self.labels for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): - morph = self.labels[tag_id] + morph = labels[tag_id] # set morph if doc.c[j].morph == 0 or overwrite or extend: if overwrite and extend: diff --git a/spacy/pipeline/pipe.pyi b/spacy/pipeline/pipe.pyi index c7c0568f9..9dd6a9d50 100644 --- a/spacy/pipeline/pipe.pyi +++ b/spacy/pipeline/pipe.pyi @@ -26,6 +26,8 @@ class Pipe: @property def labels(self) -> Tuple[str, ...]: ... @property + def hide_labels(self) -> bool: ... + @property def label_data(self) -> Any: ... def _require_labels(self) -> None: ... def set_error_handler( diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 9eddc1e3f..d24e4d574 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -102,6 +102,10 @@ cdef class Pipe: def labels(self) -> Tuple[str, ...]: return tuple() + @property + def hide_labels(self) -> bool: + return False + @property def label_data(self): """Optional JSON-serializable data that would be sufficient to recreate diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 54ce021af..6d00e829d 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,6 +1,6 @@ # cython: infer_types=True, profile=True, binding=True -from itertools import islice from typing import Optional, Callable +from itertools import islice import srsly from thinc.api import Model, SequenceCategoricalCrossentropy, Config @@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger): # are 0 return tuple(["I", "S"]) + @property + def hide_labels(self): + return True + @property def label_data(self): return None diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 829def1eb..3759466d1 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,9 +1,10 @@ -import numpy from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops from thinc.api import Optimizer from thinc.types import Ragged, Ints2d, Floats2d, Ints1d +import numpy + from ..compat import Protocol, runtime_checkable from ..scorer import Scorer from ..language import Language @@ -377,7 +378,7 @@ class SpanCategorizer(TrainablePipe): # If the prediction is 0.9 and it's false, the gradient will be # 0.9 (0.9 - 0.0) d_scores = scores - target - loss = float((d_scores ** 2).sum()) + loss = float((d_scores**2).sum()) return loss, d_scores def initialize( @@ -412,7 +413,7 @@ class 
SpanCategorizer(TrainablePipe): self._require_labels() if subbatch: docs = [eg.x for eg in subbatch] - spans = self.suggester(docs) + spans = build_ngram_suggester(sizes=[1])(docs) Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels)) self.model.initialize(X=(docs, spans), Y=Y) else: diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index a9cbac37a..e21a9096e 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -45,7 +45,7 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}}, + default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!"}, default_score_weights={"tag_acc": 1.0}, ) def make_tagger( @@ -54,6 +54,7 @@ def make_tagger( model: Model, overwrite: bool, scorer: Optional[Callable], + neg_prefix: str, ): """Construct a part-of-speech tagger component. @@ -62,7 +63,7 @@ def make_tagger( in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to 1). """ - return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer) + return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix) def tagger_score(examples, **kwargs): @@ -87,6 +88,7 @@ class Tagger(TrainablePipe): *, overwrite=BACKWARD_OVERWRITE, scorer=tagger_score, + neg_prefix="!", ): """Initialize a part-of-speech tagger. @@ -103,7 +105,7 @@ class Tagger(TrainablePipe): self.model = model self.name = name self._rehearsal_model = None - cfg = {"labels": [], "overwrite": overwrite} + cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix} self.cfg = dict(sorted(cfg.items())) self.scorer = scorer @@ -166,13 +168,14 @@ class Tagger(TrainablePipe): cdef Doc doc cdef Vocab vocab = self.vocab cdef bint overwrite = self.cfg["overwrite"] + labels = self.labels for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): if doc.c[j].tag == 0 or overwrite: - doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] + doc.c[j].tag = self.vocab.strings[labels[tag_id]] def update(self, examples, *, drop=0., sgd=None, losses=None): """Learn from a batch of documents and gold-standard information, @@ -222,6 +225,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#rehearse """ + loss_func = SequenceCategoricalCrossentropy() if losses is None: losses = {} losses.setdefault(self.name, 0.0) @@ -233,12 +237,12 @@ class Tagger(TrainablePipe): # Handle cases where there are no tokens in any docs. 
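# (With no tokens there is nothing to rehearse on, so the current losses
# dict is returned as-is.)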
return losses set_dropout_rate(self.model, drop) - guesses, backprop = self.model.begin_update(docs) - target = self._rehearsal_model(examples) - gradient = guesses - target - backprop(gradient) + tag_scores, bp_tag_scores = self.model.begin_update(docs) + tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs) + grads, loss = loss_func(tag_scores, tutor_tag_scores) + bp_tag_scores(grads) self.finish_update(sgd) - losses[self.name] += (gradient**2).sum() + losses[self.name] += loss return losses def get_loss(self, examples, scores): @@ -252,7 +256,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#get_loss """ validate_examples(examples, "Tagger.get_loss") - loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix="!") + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"]) # Convert empty tag "" to missing value None so that both misaligned # tokens and tokens with missing annotation have the default missing # value None. diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 30a65ec52..bc3f127fc 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,8 +1,8 @@ -from itertools import islice from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config from thinc.types import Floats2d import numpy +from itertools import islice from .trainable_pipe import TrainablePipe from ..language import Language @@ -158,6 +158,13 @@ class TextCategorizer(TrainablePipe): self.cfg = dict(cfg) self.scorer = scorer + @property + def support_missing_values(self): + # There are no missing values as the textcat should always + # predict exactly one label. All other labels are 0.0 + # Subclasses may override this property to change internal behaviour. + return False + @property def labels(self) -> Tuple[str]: """RETURNS (Tuple[str]): The labels currently added to the component. 
@@ -276,12 +283,12 @@ class TextCategorizer(TrainablePipe): return losses set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update(docs) - target = self._rehearsal_model(examples) + target, _ = self._rehearsal_model.begin_update(docs) gradient = scores - target bp_scores(gradient) if sgd is not None: self.finish_update(sgd) - losses[self.name] += (gradient ** 2).sum() + losses[self.name] += (gradient**2).sum() return losses def _examples_to_truth( @@ -294,7 +301,7 @@ class TextCategorizer(TrainablePipe): for j, label in enumerate(self.labels): if label in eg.reference.cats: truths[i, j] = eg.reference.cats[label] - else: + elif self.support_missing_values: not_missing[i, j] = 0.0 truths = self.model.ops.asarray(truths) # type: ignore return truths, not_missing # type: ignore @@ -313,9 +320,9 @@ class TextCategorizer(TrainablePipe): self._validate_categories(examples) truths, not_missing = self._examples_to_truth(examples) not_missing = self.model.ops.asarray(not_missing) # type: ignore - d_scores = (scores - truths) / scores.shape[0] + d_scores = scores - truths d_scores *= not_missing - mean_square_error = (d_scores ** 2).sum(axis=1).mean() + mean_square_error = (d_scores**2).mean() return float(mean_square_error), d_scores def add_label(self, label: str) -> int: diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index a7bfacca7..e33a885f8 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -1,8 +1,8 @@ -from itertools import islice from typing import Iterable, Optional, Dict, List, Callable, Any - -from thinc.api import Model, Config from thinc.types import Floats2d +from thinc.api import Model, Config + +from itertools import islice from ..language import Language from ..training import Example, validate_get_examples @@ -158,6 +158,10 @@ class MultiLabel_TextCategorizer(TextCategorizer): self.cfg = dict(cfg) self.scorer = scorer + @property + def support_missing_values(self): + return True + def initialize( # type: ignore[override] self, get_examples: Callable[[], Iterable[Example]], diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index cb601e5dc..2e3dde3cb 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe): DOCS: https://spacy.io/api/tok2vec#predict """ + if not any(len(doc) for doc in docs): + # Handle cases where there are no tokens in any docs. 
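# (i.e. return one empty array of shape (0, width) per doc instead of
# running the model on empty input)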
+ width = self.model.get_dim("nO") + return [self.model.ops.alloc((0, width)) for doc in docs] tokvecs = self.model.predict(docs) batch_id = Tok2VecListener.get_batch_id(docs) for listener in self.listeners: diff --git a/spacy/schemas.py b/spacy/schemas.py index cf58688ef..1dfd8ee85 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,5 +1,6 @@ from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple from typing import Iterable, TypeVar, TYPE_CHECKING +from .compat import Literal from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator, create_model from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool @@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] UnderscoreValue = Union[ TokenPatternString, TokenPatternNumber, str, int, float, list, bool ] +IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3] class TokenPattern(BaseModel): @@ -222,6 +224,7 @@ class TokenPattern(BaseModel): lemma: Optional[StringValue] = None shape: Optional[StringValue] = None ent_type: Optional[StringValue] = None + ent_iob: Optional[IobValue] = None ent_id: Optional[StringValue] = None ent_kb_id: Optional[StringValue] = None norm: Optional[StringValue] = None diff --git a/spacy/scorer.py b/spacy/scorer.py index 4d596b5e1..ae9338bd5 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -445,7 +445,8 @@ class Scorer: getter(doc, attr) should return the values for the individual doc. labels (Iterable[str]): The set of possible labels. Defaults to []. multi_label (bool): Whether the attribute allows multiple labels. - Defaults to True. + Defaults to True. When set to False (exclusive labels), missing + gold labels are interpreted as 0.0. positive_label (str): The positive label for a binary task with exclusive classes. Defaults to None. threshold (float): Cutoff to consider a prediction "positive". Defaults @@ -484,13 +485,15 @@ class Scorer: for label in labels: pred_score = pred_cats.get(label, 0.0) - gold_score = gold_cats.get(label, 0.0) + gold_score = gold_cats.get(label) + if not gold_score and not multi_label: + gold_score = 0.0 if gold_score is not None: auc_per_type[label].score_set(pred_score, gold_score) if multi_label: for label in labels: pred_score = pred_cats.get(label, 0.0) - gold_score = gold_cats.get(label, 0.0) + gold_score = gold_cats.get(label) if gold_score is not None: if pred_score >= threshold and gold_score > 0: f_per_type[label].tp += 1 @@ -502,16 +505,15 @@ class Scorer: # Get the highest-scoring for each. 
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1]) - if gold_score is not None: - if pred_label == gold_label and pred_score >= threshold: - f_per_type[pred_label].tp += 1 - else: - f_per_type[gold_label].fn += 1 - if pred_score >= threshold: - f_per_type[pred_label].fp += 1 + if pred_label == gold_label and pred_score >= threshold: + f_per_type[pred_label].tp += 1 + else: + f_per_type[gold_label].fn += 1 + if pred_score >= threshold: + f_per_type[pred_label].fp += 1 elif gold_cats: gold_label, gold_score = max(gold_cats, key=lambda it: it[1]) - if gold_score is not None and gold_score > 0: + if gold_score > 0: f_per_type[gold_label].fn += 1 elif pred_cats: pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 0bc2604bb..dda7ccbcb 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -51,6 +51,11 @@ def tokenizer(): return get_lang_class("xx")().tokenizer +@pytest.fixture(scope="session") +def af_tokenizer(): + return get_lang_class("af")().tokenizer + + @pytest.fixture(scope="session") def am_tokenizer(): return get_lang_class("am")().tokenizer @@ -127,6 +132,11 @@ def es_vocab(): return get_lang_class("es")().vocab +@pytest.fixture(scope="session") +def et_tokenizer(): + return get_lang_class("et")().tokenizer + + @pytest.fixture(scope="session") def eu_tokenizer(): return get_lang_class("eu")().tokenizer @@ -147,6 +157,11 @@ def fr_tokenizer(): return get_lang_class("fr")().tokenizer +@pytest.fixture(scope="session") +def fr_vocab(): + return get_lang_class("fr")().vocab + + @pytest.fixture(scope="session") def ga_tokenizer(): return get_lang_class("ga")().tokenizer @@ -187,11 +202,21 @@ def id_tokenizer(): return get_lang_class("id")().tokenizer +@pytest.fixture(scope="session") +def is_tokenizer(): + return get_lang_class("is")().tokenizer + + @pytest.fixture(scope="session") def it_tokenizer(): return get_lang_class("it")().tokenizer +@pytest.fixture(scope="session") +def it_vocab(): + return get_lang_class("it")().vocab + + @pytest.fixture(scope="session") def ja_tokenizer(): pytest.importorskip("sudachipy") @@ -204,6 +229,19 @@ def ko_tokenizer(): return get_lang_class("ko")().tokenizer +@pytest.fixture(scope="session") +def ko_tokenizer_tokenizer(): + config = { + "nlp": { + "tokenizer": { + "@tokenizers": "spacy.Tokenizer.v1", + } + } + } + nlp = get_lang_class("ko").from_config(config) + return nlp.tokenizer + + @pytest.fixture(scope="session") def lb_tokenizer(): return get_lang_class("lb")().tokenizer @@ -214,6 +252,11 @@ def lt_tokenizer(): return get_lang_class("lt")().tokenizer +@pytest.fixture(scope="session") +def lv_tokenizer(): + return get_lang_class("lv")().tokenizer + + @pytest.fixture(scope="session") def mk_tokenizer(): return get_lang_class("mk")().tokenizer @@ -281,11 +324,26 @@ def sa_tokenizer(): return get_lang_class("sa")().tokenizer +@pytest.fixture(scope="session") +def sk_tokenizer(): + return get_lang_class("sk")().tokenizer + + +@pytest.fixture(scope="session") +def sl_tokenizer(): + return get_lang_class("sl")().tokenizer + + @pytest.fixture(scope="session") def sr_tokenizer(): return get_lang_class("sr")().tokenizer +@pytest.fixture(scope="session") +def sq_tokenizer(): + return get_lang_class("sq")().tokenizer + + @pytest.fixture(scope="session") def sv_tokenizer(): return get_lang_class("sv")().tokenizer @@ -346,6 +404,11 @@ def vi_tokenizer(): return 
get_lang_class("vi")().tokenizer +@pytest.fixture(scope="session") +def xx_tokenizer(): + return get_lang_class("xx")().tokenizer + + @pytest.fixture(scope="session") def yo_tokenizer(): return get_lang_class("yo")().tokenizer diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index ef54c581c..c334cc6eb 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -1,8 +1,31 @@ +import numpy import pytest + from spacy.tokens import Doc from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH +@pytest.mark.issue(2203) +def test_issue2203(en_vocab): + """Test that lemmas are set correctly in doc.from_array.""" + words = ["I", "'ll", "survive"] + tags = ["PRP", "MD", "VB"] + lemmas = ["-PRON-", "will", "survive"] + tag_ids = [en_vocab.strings.add(tag) for tag in tags] + lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] + doc = Doc(en_vocab, words=words) + # Work around lemma corruption problem and set lemmas after tags + doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) + doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) + assert [t.tag_ for t in doc] == tags + assert [t.lemma_ for t in doc] == lemmas + # We need to serialize both tag and lemma, since this is what causes the bug + doc_array = doc.to_array(["TAG", "LEMMA"]) + new_doc = Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array) + assert [t.tag_ for t in new_doc] == tags + assert [t.lemma_ for t in new_doc] == lemmas + + def test_doc_array_attr_of_token(en_vocab): doc = Doc(en_vocab, words=["An", "example", "sentence"]) example = doc.vocab["example"] diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 57df87642..858c7cbb6 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1,14 +1,17 @@ import weakref -import pytest import numpy +import pytest +from thinc.api import NumpyOps, get_current_ops +from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS +from spacy.attrs import SENT_START, TAG +from spacy.lang.en import English from spacy.lang.xx import MultiLanguage +from spacy.language import Language +from spacy.lexeme import Lexeme from spacy.tokens import Doc, Span, Token from spacy.vocab import Vocab -from spacy.lexeme import Lexeme -from spacy.lang.en import English -from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH from .test_underscore import clean_underscore # noqa: F401 @@ -30,6 +33,220 @@ def test_doc_api_init(en_vocab): assert [t.is_sent_start for t in doc] == [True, False, True, False] +@pytest.mark.issue(1547) +def test_issue1547(): + """Test that entity labels still match after merging tokens.""" + words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] + doc = Doc(Vocab(), words=words) + doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])] + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[5:7]) + assert [ent.text for ent in doc.ents] + + +@pytest.mark.issue(1757) +def test_issue1757(): + """Test comparison against None doesn't cause segfault.""" + doc = Doc(Vocab(), words=["a", "b", "c"]) + assert not doc[0] < None + assert not doc[0] is None + assert doc[0] >= None + assert not doc[:2] < None + assert not doc[:2] is None + assert doc[:2] >= None + assert not doc.vocab["a"] is None + assert not doc.vocab["a"] < None + + +@pytest.mark.issue(2396) +def test_issue2396(en_vocab): + words = ["She", "created", "a", "test", "for", "spacy"] + heads = [1, 1, 3, 1, 3, 4] + deps = ["dep"] * 
len(heads) + matrix = numpy.array( + [ + [0, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 2, 3, 3, 3], + [1, 1, 3, 3, 3, 3], + [1, 1, 3, 3, 4, 4], + [1, 1, 3, 3, 4, 5], + ], + dtype=numpy.int32, + ) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + span = doc[:] + assert (doc.get_lca_matrix() == matrix).all() + assert (span.get_lca_matrix() == matrix).all() + + +@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) +@pytest.mark.parametrize("lang_cls", [English, MultiLanguage]) +@pytest.mark.issue(2782) +def test_issue2782(text, lang_cls): + """Check that like_num handles + and - before number.""" + nlp = lang_cls() + doc = nlp(text) + assert len(doc) == 1 + assert doc[0].like_num + + +@pytest.mark.parametrize( + "sentence", + [ + "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", + "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", + "It was a missed assignment, but it shouldn't have resulted in a turnover ...", + ], +) +@pytest.mark.issue(3869) +def test_issue3869(sentence): + """Test that the Doc's count_by function works consistently""" + nlp = English() + doc = nlp(sentence) + count = 0 + for token in doc: + count += token.is_alpha + assert count == doc.count_by(IS_ALPHA).get(1, 0) + + +@pytest.mark.issue(3962) +def test_issue3962(en_vocab): + """Ensure that as_doc does not result in out-of-bound access of tokens. + This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] + heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7] + deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = doc[1:5] # "jests at scars ," + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "dep" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" # head set to the new artificial root + assert doc2[3].dep_ == "dep" + # We should still have 1 sentence + assert len(list(doc2.sents)) == 1 + span3 = doc[6:9] # "never felt a" + doc3 = span3.as_doc() + doc3_json = doc3.to_json() + assert doc3_json + assert doc3[0].head.text == "felt" + assert doc3[0].dep_ == "neg" + assert doc3[1].head.text == "felt" + assert doc3[1].dep_ == "ROOT" + assert doc3[2].head.text == "felt" # head set to ancestor + assert doc3[2].dep_ == "dep" + # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" + assert len(list(doc3.sents)) == 1 + + +@pytest.mark.issue(3962) +def test_issue3962_long(en_vocab): + """Ensure that as_doc does not result in out-of-bound access of tokens. 
+ This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] + heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7] + deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = two_sent_doc[1:7] # "jests at scars. They never" + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root (in sentence 1) + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "ROOT" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" + assert doc2[3].dep_ == "punct" + # head set to itself, being the new artificial root (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # head set to the new artificial head (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # We should still have 2 sentences + sents = list(doc2.sents) + assert len(sents) == 2 + assert sents[0].text == "jests at scars ." + assert sents[1].text == "They never" + + +@Language.factory("my_pipe") +class CustomPipe: + def __init__(self, nlp, name="my_pipe"): + self.name = name + Span.set_extension("my_ext", getter=self._get_my_ext) + Doc.set_extension("my_ext", default=None) + + def __call__(self, doc): + gathered_ext = [] + for sent in doc.sents: + sent_ext = self._get_my_ext(sent) + sent._.set("my_ext", sent_ext) + gathered_ext.append(sent_ext) + + doc._.set("my_ext", "\n".join(gathered_ext)) + return doc + + @staticmethod + def _get_my_ext(span): + return str(span.end) + + +@pytest.mark.issue(4903) +def test_issue4903(): + """Ensure that this runs correctly and doesn't hang or crash on Windows / + macOS.""" + nlp = English() + nlp.add_pipe("sentencizer") + nlp.add_pipe("my_pipe", after="sentencizer") + text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] + if isinstance(get_current_ops(), NumpyOps): + docs = list(nlp.pipe(text, n_process=2)) + assert docs[0].text == "I like bananas." + assert docs[1].text == "Do you like them?" + assert docs[2].text == "No, I prefer wasabi." + + +@pytest.mark.issue(5048) +def test_issue5048(en_vocab): + words = ["This", "is", "a", "sentence"] + pos_s = ["DET", "VERB", "DET", "NOUN"] + spaces = [" ", " ", " ", ""] + deps_s = ["dep", "adj", "nn", "atm"] + tags_s = ["DT", "VBZ", "DT", "NN"] + strings = en_vocab.strings + for w in words: + strings.add(w) + deps = [strings.add(d) for d in deps_s] + pos = [strings.add(p) for p in pos_s] + tags = [strings.add(t) for t in tags_s] + attrs = [POS, DEP, TAG] + array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") + doc = Doc(en_vocab, words=words, spaces=spaces) + doc.from_array(attrs, array) + v1 = [(token.text, token.pos_, token.tag_) for token in doc] + doc2 = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) + v2 = [(token.text, token.pos_, token.tag_) for token in doc2] + assert v1 == v2 + + @pytest.mark.parametrize("text", [["one", "two", "three"]]) def test_doc_api_compare_by_string_position(en_vocab, text): doc = Doc(en_vocab, words=text) @@ -350,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): "Merging the docs is fun.", "", "They don't think alike. 
", + "", "Another doc.", ] en_texts_without_empty = [t for t in en_texts if len(t)] @@ -357,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): en_docs = [en_tokenizer(text) for text in en_texts] en_docs[0].spans["group"] = [en_docs[0][1:4]] en_docs[2].spans["group"] = [en_docs[2][1:4]] - en_docs[3].spans["group"] = [en_docs[3][0:1]] + en_docs[4].spans["group"] = [en_docs[4][0:1]] span_group_texts = sorted( - [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text] + [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text] ) de_doc = de_tokenizer(de_text) Token.set_extension("is_ambiguous", default=False) @@ -466,6 +684,7 @@ def test_has_annotation(en_vocab): attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE") for attr in attrs: assert not doc.has_annotation(attr) + assert not doc.has_annotation(attr, require_complete=True) doc[0].tag_ = "A" doc[0].pos_ = "X" @@ -491,6 +710,27 @@ def test_has_annotation(en_vocab): assert doc.has_annotation(attr, require_complete=True) +def test_has_annotation_sents(en_vocab): + doc = Doc(en_vocab, words=["Hello", "beautiful", "world"]) + attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END") + for attr in attrs: + assert not doc.has_annotation(attr) + assert not doc.has_annotation(attr, require_complete=True) + + # The first token (index 0) is always assumed to be a sentence start, + # and ignored by the check in doc.has_annotation + + doc[1].is_sent_start = False + for attr in attrs: + assert doc.has_annotation(attr) + assert not doc.has_annotation(attr, require_complete=True) + + doc[2].is_sent_start = False + for attr in attrs: + assert doc.has_annotation(attr) + assert doc.has_annotation(attr, require_complete=True) + + def test_is_flags_deprecated(en_tokenizer): doc = en_tokenizer("test") with pytest.deprecated_call(): diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index 16df1713d..ec4deb033 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -1,8 +1,50 @@ +import numpy import pytest + from spacy.vocab import Vocab from spacy.tokens import Doc, Token +@pytest.mark.issue(3540) +def test_issue3540(en_vocab): + words = ["I", "live", "in", "NewYork", "right", "now"] + tensor = numpy.asarray( + [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], + dtype="f", + ) + doc = Doc(en_vocab, words=words) + doc.tensor = tensor + gold_text = ["I", "live", "in", "NewYork", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] + for i, lemma in enumerate(gold_lemma): + doc[i].lemma_ = lemma + assert [token.lemma_ for token in doc] == gold_lemma + vectors_1 = [token.vector for token in doc] + assert len(vectors_1) == len(doc) + + with doc.retokenize() as retokenizer: + heads = [(doc[3], 1), doc[2]] + attrs = { + "POS": ["PROPN", "PROPN"], + "LEMMA": ["New", "York"], + "DEP": ["pobj", "compound"], + } + retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) + + gold_text = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.lemma_ for token in doc] == gold_lemma + vectors_2 = [token.vector for token in doc] + assert len(vectors_2) == len(doc) + assert vectors_1[0].tolist() == vectors_2[0].tolist() + assert vectors_1[1].tolist() == vectors_2[1].tolist() + assert 
vectors_1[2].tolist() == vectors_2[2].tolist() + assert vectors_1[4].tolist() == vectors_2[5].tolist() + assert vectors_1[5].tolist() == vectors_2[6].tolist() + + def test_doc_retokenize_split(en_vocab): words = ["LosAngeles", "start", "."] heads = [1, 2, 2] diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 2503ad94c..c0496cabf 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -1,7 +1,9 @@ import pytest import numpy from numpy.testing import assert_array_equal + from spacy.attrs import ORTH, LENGTH +from spacy.lang.en import English from spacy.tokens import Doc, Span, Token from spacy.vocab import Vocab from spacy.util import filter_spans @@ -43,6 +45,106 @@ def doc_not_parsed(en_tokenizer): return doc +@pytest.mark.issue(1537) +def test_issue1537(): + """Test that Span.as_doc() doesn't segfault.""" + string = "The sky is blue . The man is pink . The dog is purple ." + doc = Doc(Vocab(), words=string.split()) + doc[0].sent_start = True + for word in doc[1:]: + if word.nbor(-1).text == ".": + word.sent_start = True + else: + word.sent_start = False + sents = list(doc.sents) + sent0 = sents[0].as_doc() + sent1 = sents[1].as_doc() + assert isinstance(sent0, Doc) + assert isinstance(sent1, Doc) + + +@pytest.mark.issue(1612) +def test_issue1612(en_tokenizer): + """Test that span.orth_ is identical to span.text""" + doc = en_tokenizer("The black cat purrs.") + span = doc[1:3] + assert span.orth_ == span.text + + +@pytest.mark.issue(3199) +def test_issue3199(): + """Test that Span.noun_chunks works correctly if no noun chunks iterator + is available. To make this test future-proof, we're constructing a Doc + with a new Vocab here and a parse tree to make sure the noun chunks run. + """ + words = ["This", "is", "a", "sentence"] + doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words)) + with pytest.raises(NotImplementedError): + list(doc[0:3].noun_chunks) + + +@pytest.mark.issue(5152) +def test_issue5152(): + # Test that the comparison between a Span and a Token, goes well + # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) 
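# (Concretely: the span "Talk about being" below has 3 tokens, and the token
# "Let" has 3 characters, which is exactly the equal-count case described
# above.)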
+ nlp = English() + text = nlp("Talk about being boring!") + text_var = nlp("Talk of being boring!") + y = nlp("Let") + span = text[0:3] # Talk about being + span_2 = text[0:3] # Talk about being + span_3 = text_var[0:3] # Talk of being + token = y[0] # Let + with pytest.warns(UserWarning): + assert span.similarity(token) == 0.0 + assert span.similarity(span_2) == 1.0 + with pytest.warns(UserWarning): + assert span_2.similarity(span_3) < 1.0 + + +@pytest.mark.issue(6755) +def test_issue6755(en_tokenizer): + doc = en_tokenizer("This is a magnificent sentence.") + span = doc[:0] + assert span.text_with_ws == "" + assert span.text == "" + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,label", + [("Welcome to Mumbai, my friend", 11, 17, "GPE")], +) +@pytest.mark.issue(6815) +def test_issue6815_1(sentence, start_idx, end_idx, label): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, label=label) + assert span.label_ == label + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)] +) +@pytest.mark.issue(6815) +def test_issue6815_2(sentence, start_idx, end_idx, kb_id): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id) + assert span.kb_id == kb_id + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,vector", + [("Welcome to Mumbai, my friend", 11, 17, numpy.array([0.1, 0.2, 0.3]))], +) +@pytest.mark.issue(6815) +def test_issue6815_3(sentence, start_idx, end_idx, vector): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, vector=vector) + assert (span.vector == vector).all() + + @pytest.mark.parametrize( "i_sent,i,j,text", [ @@ -98,6 +200,12 @@ def test_spans_span_sent(doc, doc_not_parsed): assert doc[:2].sent.root.text == "is" assert doc[:2].sent.text == "This is a sentence." assert doc[6:7].sent.root.left_edge.text == "This" + assert doc[0 : len(doc)].sent == list(doc.sents)[0] + assert list(doc[0 : len(doc)].sents) == list(doc.sents) + + with pytest.raises(ValueError): + doc_not_parsed[:2].sent + # test on manual sbd doc_not_parsed[0].is_sent_start = True doc_not_parsed[5].is_sent_start = True @@ -105,6 +213,35 @@ def test_spans_span_sent(doc, doc_not_parsed): assert doc_not_parsed[10:14].sent == doc_not_parsed[5:] +@pytest.mark.parametrize( + "start,end,expected_sentence", + [ + (0, 14, "This is"), # Entire doc + (1, 4, "This is"), # Overlapping with 2 sentences + (0, 2, "This is"), # Beginning of the Doc. Full sentence + (0, 1, "This is"), # Beginning of the Doc. Part of a sentence + (10, 14, "And a"), # End of the Doc. Overlapping with 2 senteces + (12, 14, "third."), # End of the Doc. 
Full sentence + (1, 1, "This is"), # Empty Span + ], +) +def test_spans_span_sent_user_hooks(doc, start, end, expected_sentence): + + # Doc-level sents hook + def user_hook(doc): + return [doc[ii : ii + 2] for ii in range(0, len(doc), 2)] + + doc.user_hooks["sents"] = user_hook + + # Make sure doc-level sents hook works + assert doc[start:end].sent.text == expected_sentence + + # Span-level sent hook + doc.user_span_hooks["sent"] = lambda x: x + # Now, span=level sent hook overrides the doc-level sents hook + assert doc[start:end].sent == doc[start:end] + + def test_spans_lca_matrix(en_tokenizer): """Test span's lca matrix generation""" tokens = en_tokenizer("the lazy dog slept") @@ -434,3 +571,100 @@ def test_span_with_vectors(doc): # single-token span with vector assert_array_equal(ops.to_numpy(doc[10:11].vector), [-1, -1, -1]) doc.vocab.vectors = prev_vectors + + +# fmt: off +def test_span_comparison(doc): + + # Identical start, end, only differ in label and kb_id + assert Span(doc, 0, 3) == Span(doc, 0, 3) + assert Span(doc, 0, 3, "LABEL") == Span(doc, 0, 3, "LABEL") + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") == Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL") + assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 0, 3, "LABEL") != Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3) <= Span(doc, 0, 3) and Span(doc, 0, 3) >= Span(doc, 0, 3) + assert Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL") and Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "LABEL") + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + assert (Span(doc, 0, 3) < Span(doc, 0, 3, "", kb_id="KB_ID") < Span(doc, 0, 3, "LABEL") < Span(doc, 0, 3, "LABEL", kb_id="KB_ID")) + assert (Span(doc, 0, 3) <= Span(doc, 0, 3, "", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")) + + assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") > Span(doc, 0, 3, "LABEL") > Span(doc, 0, 3, "", kb_id="KB_ID") > Span(doc, 0, 3)) + assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "", kb_id="KB_ID") >= Span(doc, 0, 3)) + + # Different end + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4) + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 4) + assert Span(doc, 0, 4) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 0, 4) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + # Different start + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3) + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3) + assert Span(doc, 1, 3) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 1, 3) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + # Different start & different end + assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3) + assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3) + assert Span(doc, 1, 3) > Span(doc, 0, 4, "LABEL", kb_id="KB_ID") + assert Span(doc, 1, 3) >= Span(doc, 0, 4, "LABEL", kb_id="KB_ID") +# fmt: on + + +@pytest.mark.parametrize( + 
"start,end,expected_sentences,expected_sentences_with_hook", + [ + (0, 14, 3, 7), # Entire doc + (3, 6, 2, 2), # Overlapping with 2 sentences + (0, 4, 1, 2), # Beginning of the Doc. Full sentence + (0, 3, 1, 2), # Beginning of the Doc. Part of a sentence + (9, 14, 2, 3), # End of the Doc. Overlapping with 2 senteces + (10, 14, 1, 2), # End of the Doc. Full sentence + (11, 14, 1, 2), # End of the Doc. Partial sentence + (0, 0, 1, 1), # Empty Span + ], +) +def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with_hook): + + assert len(list(doc[start:end].sents)) == expected_sentences + + def user_hook(doc): + return [doc[ii : ii + 2] for ii in range(0, len(doc), 2)] + + doc.user_hooks["sents"] = user_hook + + assert len(list(doc[start:end].sents)) == expected_sentences_with_hook + + doc.user_span_hooks["sents"] = lambda x: [x] + + assert list(doc[start:end].sents)[0] == doc[start:end] + assert len(list(doc[start:end].sents)) == 1 + + +def test_span_sents_not_parsed(doc_not_parsed): + with pytest.raises(ValueError): + list(Span(doc_not_parsed, 0, 3).sents) + + +def test_span_group_copy(doc): + doc.spans["test"] = [doc[0:1], doc[2:4]] + assert len(doc.spans["test"]) == 2 + doc_copy = doc.copy() + # check that the spans were indeed copied + assert len(doc_copy.spans["test"]) == 2 + # add a new span to the original doc + doc.spans["test"].append(doc[3:4]) + assert len(doc.spans["test"]) == 3 + # check that the copy spans were not modified and this is an isolated doc + assert len(doc_copy.spans["test"]) == 2 diff --git a/spacy/tests/regression/__init__.py b/spacy/tests/lang/af/__init__.py similarity index 100% rename from spacy/tests/regression/__init__.py rename to spacy/tests/lang/af/__init__.py diff --git a/spacy/tests/lang/af/test_text.py b/spacy/tests/lang/af/test_text.py new file mode 100644 index 000000000..99c2a9f4c --- /dev/null +++ b/spacy/tests/lang/af/test_text.py @@ -0,0 +1,22 @@ +import pytest + + +def test_long_text(af_tokenizer): + # Excerpt: Universal Declaration of Human Rights; “'n” changed to “die” in first sentence + text = """ +Hierdie Universele Verklaring van Menseregte as die algemene standaard vir die verwesenliking deur alle mense en nasies, +om te verseker dat elke individu en elke deel van die gemeenskap hierdie Verklaring in ag sal neem en deur opvoeding, +respek vir hierdie regte en vryhede te bevorder, op nasionale en internasionale vlak, daarna sal strewe om die universele +en effektiewe erkenning en agting van hierdie regte te verseker, nie net vir die mense van die Lidstate nie, maar ook vir +die mense in die gebiede onder hul jurisdiksie. 
+ +""" + tokens = af_tokenizer(text) + assert len(tokens) == 100 + + +@pytest.mark.xfail +def test_indefinite_article(af_tokenizer): + text = "as 'n algemene standaard" + tokens = af_tokenizer(text) + assert len(tokens) == 4 diff --git a/spacy/tests/lang/af/test_tokenizer.py b/spacy/tests/lang/af/test_tokenizer.py new file mode 100644 index 000000000..db52db5e3 --- /dev/null +++ b/spacy/tests/lang/af/test_tokenizer.py @@ -0,0 +1,29 @@ +import pytest + +AF_BASIC_TOKENIZATION_TESTS = [ + ( + "Elkeen het die reg tot lewe, vryheid en sekuriteit van persoon.", + [ + "Elkeen", + "het", + "die", + "reg", + "tot", + "lewe", + ",", + "vryheid", + "en", + "sekuriteit", + "van", + "persoon", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", AF_BASIC_TOKENIZATION_TESTS) +def test_af_tokenizer_basic(af_tokenizer, text, expected_tokens): + tokens = af_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py index 39d8d3b59..d30c72750 100644 --- a/spacy/tests/lang/en/test_sbd.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -4,6 +4,15 @@ from spacy.tokens import Doc from ...util import apply_transition_sequence +@pytest.mark.issue(309) +def test_issue309(en_vocab): + """Test Issue #309: SBD fails on empty string""" + doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"]) + assert len(doc) == 1 + sents = list(doc.sents) + assert len(sents) == 1 + + @pytest.mark.parametrize("words", [["A", "test", "sentence"]]) @pytest.mark.parametrize("punct", [".", "!", "?", ""]) def test_en_sbd_single_punct(en_vocab, words, punct): diff --git a/spacy/tests/lang/en/test_tokenizer.py b/spacy/tests/lang/en/test_tokenizer.py new file mode 100644 index 000000000..e6d1d7d85 --- /dev/null +++ b/spacy/tests/lang/en/test_tokenizer.py @@ -0,0 +1,169 @@ +import pytest + + +@pytest.mark.issue(351) +def test_issue351(en_tokenizer): + doc = en_tokenizer(" This is a cat.") + assert doc[0].idx == 0 + assert len(doc[0]) == 3 + assert doc[1].idx == 3 + + +@pytest.mark.issue(360) +def test_issue360(en_tokenizer): + """Test tokenization of big ellipsis""" + tokens = en_tokenizer("$45...............Asking") + assert len(tokens) > 2 + + +@pytest.mark.issue(736) +@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")]) +def test_issue736(en_tokenizer, text, number): + """Test that times like "7am" are tokenized correctly and that numbers are + converted to string.""" + tokens = en_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == number + + +@pytest.mark.issue(740) +@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"]) +def test_issue740(en_tokenizer, text): + """Test that dates are not split and kept as one token. This behaviour is + currently inconsistent, since dates separated by hyphens are still split. 
+ This will be hard to prevent without causing clashes with numeric ranges.""" + tokens = en_tokenizer(text) + assert len(tokens) == 1 + + +@pytest.mark.issue(744) +@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"]) +def test_issue744(en_tokenizer, text): + """Test that 'were' and 'Were' are excluded from the contractions + generated by the English tokenizer exceptions.""" + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text.lower() == "were" + + +@pytest.mark.issue(759) +@pytest.mark.parametrize( + "text,is_num", [("one", True), ("ten", True), ("teneleven", False)] +) +def test_issue759(en_tokenizer, text, is_num): + tokens = en_tokenizer(text) + assert tokens[0].like_num == is_num + + +@pytest.mark.issue(775) +@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"]) +def test_issue775(en_tokenizer, text): + """Test that 'Shell' and 'shell' are excluded from the contractions + generated by the English tokenizer exceptions.""" + tokens = en_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].text == text + + +@pytest.mark.issue(792) +@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"]) +def test_issue792(en_tokenizer, text): + """Test for Issue #792: Trailing whitespace is removed after tokenization.""" + doc = en_tokenizer(text) + assert "".join([token.text_with_ws for token in doc]) == text + + +@pytest.mark.issue(792) +@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"]) +def test_control_issue792(en_tokenizer, text): + """Test base case for Issue #792: Non-trailing whitespace""" + doc = en_tokenizer(text) + assert "".join([token.text_with_ws for token in doc]) == text + + +@pytest.mark.issue(859) +@pytest.mark.parametrize( + "text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"] +) +def test_issue859(en_tokenizer, text): + """Test that no extra space is added in doc.text method.""" + doc = en_tokenizer(text) + assert doc.text == text + + +@pytest.mark.issue(886) +@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"]) +def test_issue886(en_tokenizer, text): + """Test that token.idx matches the original text index for texts with newlines.""" + doc = en_tokenizer(text) + for token in doc: + assert len(token.text) == len(token.text_with_ws) + assert text[token.idx] == token.text[0] + + +@pytest.mark.issue(891) +@pytest.mark.parametrize("text", ["want/need"]) +def test_issue891(en_tokenizer, text): + """Test that / infixes are split correctly.""" + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text == "/" + + +@pytest.mark.issue(957) +@pytest.mark.slow +def test_issue957(en_tokenizer): + """Test that spaCy doesn't hang on many punctuation characters. 
+ If this test hangs, check (new) regular expressions for conflicting greedy operators + """ + # Skip test if pytest-timeout is not installed + pytest.importorskip("pytest_timeout") + for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]: + string = "0" + for i in range(1, 100): + string += punct + str(i) + doc = en_tokenizer(string) + assert doc + + +@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"]) +@pytest.mark.issue(1698) +def test_issue1698(en_tokenizer, text): + """Test that doc doesn't identify email-addresses as URLs""" + doc = en_tokenizer(text) + assert len(doc) == 1 + assert not doc[0].like_url + + +@pytest.mark.issue(1758) +def test_issue1758(en_tokenizer): + """Test that "would've" is handled by the English tokenizer exceptions.""" + tokens = en_tokenizer("would've") + assert len(tokens) == 2 + + +@pytest.mark.issue(1773) +def test_issue1773(en_tokenizer): + """Test that spaces don't receive a POS but no TAG. This is the root cause + of the serialization issue reported in #1773.""" + doc = en_tokenizer("\n") + if doc[0].pos_ == "SPACE": + assert doc[0].tag_ != "" + + +@pytest.mark.issue(3277) +def test_issue3277(es_tokenizer): + """Test that hyphens are split correctly as prefixes.""" + doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.") + assert len(doc) == 14 + assert doc[0].text == "\u2014" + assert doc[5].text == "\u2013" + assert doc[9].text == "\u2013" + + +@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) +@pytest.mark.issue(3521) +def test_issue3521(en_tokenizer, word): + tok = en_tokenizer(word)[1] + # 'not' and 'would' should be stopwords, also in their abbreviated forms + assert tok.is_stop diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py index 96f6bcab5..d95f6d26b 100644 --- a/spacy/tests/lang/es/test_text.py +++ b/spacy/tests/lang/es/test_text.py @@ -1,5 +1,16 @@ import pytest from spacy.lang.es.lex_attrs import like_num +from spacy.lang.es import Spanish + + +@pytest.mark.issue(3803) +def test_issue3803(): + """Test that spanish num-like tokens have True for like_num attribute.""" + nlp = Spanish() + text = "2 dos 1000 mil 12 doce" + doc = nlp(text) + + assert [t.like_num for t in doc] == [True, True, True, True, True, True] def test_es_tokenizer_handles_long_text(es_tokenizer): diff --git a/spacy/tests/lang/et/__init__.py b/spacy/tests/lang/et/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/et/test_text.py b/spacy/tests/lang/et/test_text.py new file mode 100644 index 000000000..9515a7cc1 --- /dev/null +++ b/spacy/tests/lang/et/test_text.py @@ -0,0 +1,26 @@ +import pytest + + +def test_long_text(et_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +arvestades, et nimetatud deklaratsiooni eesmärk on tagada selles +kuulutatud õiguste üldine ja tõhus tunnustamine ning järgimine; +arvestades, et Euroopa Nõukogu eesmärk on saavutada tema +liikmete suurem ühtsus ning et üheks selle eesmärgi saavutamise +vahendiks on inimõiguste ja põhivabaduste järgimine ning +elluviimine; +taaskinnitades oma sügavat usku neisse põhivabadustesse, mis +on õigluse ja rahu aluseks maailmas ning mida kõige paremini +tagab ühelt poolt tõhus poliitiline demokraatia ning teiselt poolt +inimõiguste, millest nad sõltuvad, üldine mõistmine ja järgimine; +""" + tokens = et_tokenizer(text) + assert len(tokens) == 94 + + +@pytest.mark.xfail +def test_ordinal_number(et_tokenizer): + text = "10. 
detsembril 1948" + tokens = et_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/et/test_tokenizer.py b/spacy/tests/lang/et/test_tokenizer.py new file mode 100644 index 000000000..f0f8079ca --- /dev/null +++ b/spacy/tests/lang/et/test_tokenizer.py @@ -0,0 +1,29 @@ +import pytest + +ET_BASIC_TOKENIZATION_TESTS = [ + ( + "Kedagi ei või piinata ega ebainimlikult või alandavalt kohelda " + "ega karistada.", + [ + "Kedagi", + "ei", + "või", + "piinata", + "ega", + "ebainimlikult", + "või", + "alandavalt", + "kohelda", + "ega", + "karistada", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", ET_BASIC_TOKENIZATION_TESTS) +def test_et_tokenizer_basic(et_tokenizer, text, expected_tokens): + tokens = et_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/fi/test_noun_chunks.py b/spacy/tests/lang/fi/test_noun_chunks.py new file mode 100644 index 000000000..cab84b311 --- /dev/null +++ b/spacy/tests/lang/fi/test_noun_chunks.py @@ -0,0 +1,189 @@ +import pytest +from spacy.tokens import Doc + + +FI_NP_TEST_EXAMPLES = [ + ( + "Kaksi tyttöä potkii punaista palloa", + ["NUM", "NOUN", "VERB", "ADJ", "NOUN"], + ["nummod", "nsubj", "ROOT", "amod", "obj"], + [1, 1, 0, 1, -2], + ["Kaksi tyttöä", "punaista palloa"], + ), + ( + "Erittäin vaarallinen leijona karkasi kiertävän sirkuksen eläintenkesyttäjältä", + ["ADV", "ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"], + ["advmod", "amod", "nsubj", "ROOT", "amod", "nmod:poss", "obl"], + [1, 1, 1, 0, 1, 1, -3], + ["Erittäin vaarallinen leijona", "kiertävän sirkuksen eläintenkesyttäjältä"], + ), + ( + "Leijona raidallisine tassuineen piileksii Porin kaupungin lähellä", + ["NOUN", "ADJ", "NOUN", "VERB", "PROPN", "NOUN", "ADP"], + ["nsubj", "amod", "nmod", "ROOT", "nmod:poss", "obl", "case"], + [3, 1, -2, 0, 1, -2, -1], + ["Leijona raidallisine tassuineen", "Porin kaupungin"], + ), + ( + "Lounaalla nautittiin salaattia, maukasta kanaa ja raikasta vettä", + ["NOUN", "VERB", "NOUN", "PUNCT", "ADJ", "NOUN", "CCONJ", "ADJ", "NOUN"], + ["obl", "ROOT", "obj", "punct", "amod", "conj", "cc", "amod", "conj"], + [1, 0, -1, 2, 1, -3, 2, 1, -6], + ["Lounaalla", "salaattia", "maukasta kanaa", "raikasta vettä"], + ), + ( + "Minua houkuttaa maalle muuttaminen talven jälkeen", + ["PRON", "VERB", "NOUN", "NOUN", "NOUN", "ADP"], + ["obj", "ROOT", "nmod", "nsubj", "obl", "case"], + [1, 0, 1, -2, -3, -1], + ["maalle muuttaminen", "talven"], + ), + ( + "Päivän kohokohta oli vierailu museossa kummilasten kanssa", + ["NOUN", "NOUN", "AUX", "NOUN", "NOUN", "NOUN", "ADP"], + ["nmod:poss", "nsubj:cop", "cop", "ROOT", "nmod", "obl", "case"], + [1, 2, 1, 0, -1, -2, -1], + ["Päivän kohokohta", "vierailu museossa", "kummilasten"], + ), + ( + "Yrittäjät maksoivat tuomioistuimen määräämät korvaukset", + ["NOUN", "VERB", "NOUN", "VERB", "NOUN"], + ["nsubj", "ROOT", "nsubj", "acl", "obj"], + [1, 0, 1, 1, -3], + ["Yrittäjät", "tuomioistuimen", "korvaukset"], + ), + ( + "Julkisoikeudelliset tai niihin rinnastettavat saatavat ovat suoraan ulosottokelpoisia", + ["ADJ", "CCONJ", "PRON", "VERB", "NOUN", "AUX", "ADV", "NOUN"], + ["amod", "cc", "obl", "acl", "nsubj:cop", "cop", "advmod", "ROOT"], + [4, 3, 1, 1, 3, 2, 1, 0], + ["Julkisoikeudelliset tai niihin rinnastettavat saatavat", "ulosottokelpoisia"], + ), + ( + "Se oli ala-arvoista käytöstä kaikilta oppilailta, myös valvojaoppilailta", + ["PRON", "AUX", "ADJ", "NOUN", "PRON", "NOUN", "PUNCT", "ADV", "NOUN"], + 
["nsubj:cop", "cop", "amod", "ROOT", "det", "nmod", "punct", "advmod", "appos"], + [3, 2, 1, 0, 1, -2, 2, 1, -3], + ["ala-arvoista käytöstä kaikilta oppilailta", "valvojaoppilailta"], + ), + ( + "Isä souti veneellä, jonka hän oli vuokrannut", + ["NOUN", "VERB", "NOUN", "PUNCT", "PRON", "PRON", "AUX", "VERB"], + ["nsubj", "ROOT", "obl", "punct", "obj", "nsubj", "aux", "acl:relcl"], + [1, 0, -1, 4, 3, 2, 1, -5], + ["Isä", "veneellä"], + ), + ( + "Kirja, jonka poimin hyllystä, kertoo norsuista", + ["NOUN", "PUNCT", "PRON", "VERB", "NOUN", "PUNCT", "VERB", "NOUN"], + ["nsubj", "punct", "obj", "acl:relcl", "obl", "punct", "ROOT", "obl"], + [6, 2, 1, -3, -1, 1, 0, -1], + ["Kirja", "hyllystä", "norsuista"], + ), + ( + "Huomenna on päivä, jota olemme odottaneet", + ["NOUN", "AUX", "NOUN", "PUNCT", "PRON", "AUX", "VERB"], + ["ROOT", "cop", "nsubj:cop", "punct", "obj", "aux", "acl:relcl"], + [0, -1, -2, 3, 2, 1, -4], + ["Huomenna", "päivä"], + ), + ( + "Liikkuvuuden lisääminen on yksi korkeakoulutuksen keskeisistä kehittämiskohteista", + ["NOUN", "NOUN", "AUX", "PRON", "NOUN", "ADJ", "NOUN"], + ["nmod:gobj", "nsubj:cop", "cop", "ROOT", "nmod:poss", "amod", "nmod"], + [1, 2, 1, 0, 2, 1, -3], + [ + "Liikkuvuuden lisääminen", + "korkeakoulutuksen keskeisistä kehittämiskohteista", + ], + ), + ( + "Kaupalliset palvelut jätetään yksityisten palveluntarjoajien tarjottavaksi", + ["ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"], + ["amod", "obj", "ROOT", "amod", "nmod:gsubj", "obl"], + [1, 1, 0, 1, 1, -3], + ["Kaupalliset palvelut", "yksityisten palveluntarjoajien tarjottavaksi"], + ), + ( + "New York tunnetaan kaupunkina, joka ei koskaan nuku", + ["PROPN", "PROPN", "VERB", "NOUN", "PUNCT", "PRON", "AUX", "ADV", "VERB"], + [ + "obj", + "flat:name", + "ROOT", + "obl", + "punct", + "nsubj", + "aux", + "advmod", + "acl:relcl", + ], + [2, -1, 0, -1, 4, 3, 2, 1, -5], + ["New York", "kaupunkina"], + ), + ( + "Loput vihjeet saat herra Möttöseltä", + ["NOUN", "NOUN", "VERB", "NOUN", "PROPN"], + ["compound:nn", "obj", "ROOT", "compound:nn", "obj"], + [1, 1, 0, 1, -2], + ["Loput vihjeet", "herra Möttöseltä"], + ), + ( + "mahdollisuus tukea muita päivystysyksiköitä", + ["NOUN", "VERB", "PRON", "NOUN"], + ["ROOT", "acl", "det", "obj"], + [0, -1, 1, -2], + ["mahdollisuus", "päivystysyksiköitä"], + ), + ( + "sairaanhoitopiirit harjoittavat leikkaustoimintaa alueellaan useammassa sairaalassa", + ["NOUN", "VERB", "NOUN", "NOUN", "ADJ", "NOUN"], + ["nsubj", "ROOT", "obj", "obl", "amod", "obl"], + [1, 0, -1, -1, 1, -3], + [ + "sairaanhoitopiirit", + "leikkaustoimintaa", + "alueellaan", + "useammassa sairaalassa", + ], + ), + ( + "Lain mukaan varhaiskasvatus on suunnitelmallista toimintaa", + ["NOUN", "ADP", "NOUN", "AUX", "ADJ", "NOUN"], + ["obl", "case", "nsubj:cop", "cop", "amod", "ROOT"], + [5, -1, 3, 2, 1, 0], + ["Lain", "varhaiskasvatus", "suunnitelmallista toimintaa"], + ), +] + + +def test_noun_chunks_is_parsed(fi_tokenizer): + """Test that noun_chunks raises Value Error for 'fi' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. 
+ """ + doc = fi_tokenizer("Tämä on testi") + with pytest.raises(ValueError): + list(doc.noun_chunks) + + +@pytest.mark.parametrize( + "text,pos,deps,heads,expected_noun_chunks", FI_NP_TEST_EXAMPLES +) +def test_fi_noun_chunks(fi_tokenizer, text, pos, deps, heads, expected_noun_chunks): + tokens = fi_tokenizer(text) + + assert len(heads) == len(pos) + doc = Doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=[head + i for i, head in enumerate(heads)], + deps=deps, + pos=pos, + ) + + noun_chunks = list(doc.noun_chunks) + assert len(noun_chunks) == len(expected_noun_chunks) + for i, np in enumerate(noun_chunks): + assert np.text == expected_noun_chunks[i] diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 48ac88ead..25b95f566 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -1,8 +1,230 @@ +from spacy.tokens import Doc import pytest +# fmt: off +@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # un nom -> un nom + ( + ["un", "nom"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # determiner + noun starting with vowel + # l'heure -> l'heure + ( + ["l'", "heure"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # determiner + plural noun + # les romans -> les romans + ( + ["les", "romans"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # det + adj + noun + # Le vieux Londres -> Le vieux Londres + ( + ['Les', 'vieux', 'Londres'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # det + noun + adj + # le nom propre -> le nom propre a proper noun + ( + ["le", "nom", "propre"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # det + noun + adj plural + # Les chiens bruns -> les chiens bruns + ( + ["Les", "chiens", "bruns"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # multiple adjectives: one adj before the noun, one adj after the noun + # un nouveau film intéressant -> un nouveau film intéressant + ( + ["un", "nouveau", "film", "intéressant"], + [2, 2, 2, 2], + ["det", "amod", "ROOT", "amod"], + ["DET", "ADJ", "NOUN", "ADJ"], + [(0,4)] + ), + # multiple adjectives, both adjs after the noun + # une personne intelligente et drôle -> une personne intelligente et drôle + ( + ["une", "personne", "intelligente", "et", "drôle"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # relative pronoun + # un bus qui va au ville -> un bus, qui, ville + ( + ['un', 'bus', 'qui', 'va', 'au', 'ville'], + [1, 1, 3, 1, 5, 3], + ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'], + ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'], + [(0,2), (2,3), (5,6)] + ), + # relative subclause + # Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy. 
+ ( + ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'], + [0, 2, 0, 5, 5, 2, 5], + ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'], + ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'], + [(1,3), (4,5)] + ), + # Person name and title by flat + # Louis XIV -> Louis XIV + ( + ["Louis", "XIV"], + [0, 0], + ["ROOT", "flat:name"], + ["PROPN", "PROPN"], + [(0,2)] + ), + # Organization name by flat + # Nations Unies -> Nations Unies + ( + ["Nations", "Unies"], + [0, 0], + ["ROOT", "flat:name"], + ["PROPN", "PROPN"], + [(0,2)] + ), + # Noun compound, person name created by two flats + # Louise de Bratagne -> Louise de Bratagne + ( + ["Louise", "de", "Bratagne"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound, person name created by two flats + # Louis François Joseph -> Louis François Joseph + ( + ["Louis", "François", "Joseph"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # quelques agriculteurs très riches -> quelques agriculteurs très riches + ( + ["quelques", "agriculteurs", "très", "riches"], + [1, 1, 3, 1], + ['det', 'ROOT', 'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Il a un chien et un chat -> Il, un chien, un chat + ( + ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'], + [1, 1, 3, 1, 6, 6, 3], + ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(0,1), (2,4), (5,7)] + + ), + # Two NPs together + # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado + ( + ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'appos', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # nmod relation between NPs + # la destruction de la ville -> la destruction, la ville + ( + ['la', 'destruction', 'de', 'la', 'ville'], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'case', 'det', 'nmod'], + ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'], + [(0,2), (3,5)] + ), + # nmod relation between NPs + # Archiduchesse d’Autriche -> Archiduchesse, Autriche + ( + ['Archiduchesse', 'd’', 'Autriche'], + [0, 2, 0], + ['ROOT', 'case', 'nmod'], + ['NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3)] + ), + # Compounding by nmod, several NPs chained together + # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement + ( + ["la", "première", "usine", "de", "drogue", "du", "gouvernement"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Traduction du rapport de Susana -> Traduction, rapport, Susana + ( + ['Traduction', 'du', 'raport', 'de', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie + ( + ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'], + [2, 2, 2, 4, 2, 7, 7, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'], + [(0,3), (4,5), (6,8)] + ), + # Passive subject + # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le 
grand compte bancaire, Clinton + ( + ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'], + [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'], + [(0, 3), (6, 10), (11, 12)] + ) + ], +) +# fmt: on +def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + def test_noun_chunks_is_parsed_fr(fr_tokenizer): """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.""" - doc = fr_tokenizer("trouver des travaux antérieurs") + doc = fr_tokenizer("Je suis allé à l'école") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/hi/test_text.py b/spacy/tests/lang/hi/test_text.py new file mode 100644 index 000000000..791cc3822 --- /dev/null +++ b/spacy/tests/lang/hi/test_text.py @@ -0,0 +1,11 @@ +import pytest +from spacy.lang.hi import Hindi + + +@pytest.mark.issue(3625) +def test_issue3625(): + """Test that default punctuation rules applies to hindi unicode characters""" + nlp = Hindi() + doc = nlp("hi. how हुए. होटल, होटल") + expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] + assert [token.text for token in doc] == expected diff --git a/spacy/tests/lang/hr/__init__.py b/spacy/tests/lang/hr/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/hr/test_text.py b/spacy/tests/lang/hr/test_text.py new file mode 100644 index 000000000..82e65afe7 --- /dev/null +++ b/spacy/tests/lang/hr/test_text.py @@ -0,0 +1,26 @@ +import pytest + + +def test_long_text(hr_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +uzimajući u obzir da ta deklaracija nastoji osigurati opće i djelotvorno +priznanje i poštovanje u njoj proglašenih prava; +uzimajući u obzir da je cilj Vijeća Europe postizanje većeg jedinstva +njegovih članica, i da je jedan od načina postizanja toga cilja +očuvanje i daljnje ostvarivanje ljudskih prava i temeljnih sloboda; +potvrđujući svoju duboku privrženost tim temeljnim slobodama +koje su osnova pravde i mira u svijetu i koje su najbolje zaštićene +istinskom političkom demokracijom s jedne strane te zajedničkim +razumijevanjem i poštovanjem ljudskih prava o kojima te slobode +ovise s druge strane; +""" + tokens = hr_tokenizer(text) + assert len(tokens) == 105 + + +@pytest.mark.xfail +def test_ordinal_number(hr_tokenizer): + text = "10. 
prosinca 1948" + tokens = hr_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/hr/test_tokenizer.py b/spacy/tests/lang/hr/test_tokenizer.py new file mode 100644 index 000000000..dace33b2d --- /dev/null +++ b/spacy/tests/lang/hr/test_tokenizer.py @@ -0,0 +1,31 @@ +import pytest + +HR_BASIC_TOKENIZATION_TESTS = [ + ( + "Nitko se ne smije podvrgnuti mučenju ni nečovječnom ili " + "ponižavajućem postupanju ili kazni.", + [ + "Nitko", + "se", + "ne", + "smije", + "podvrgnuti", + "mučenju", + "ni", + "nečovječnom", + "ili", + "ponižavajućem", + "postupanju", + "ili", + "kazni", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", HR_BASIC_TOKENIZATION_TESTS) +def test_hr_tokenizer_basic(hr_tokenizer, text, expected_tokens): + tokens = hr_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/is/__init__.py b/spacy/tests/lang/is/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/is/test_text.py b/spacy/tests/lang/is/test_text.py new file mode 100644 index 000000000..6e3654a6e --- /dev/null +++ b/spacy/tests/lang/is/test_text.py @@ -0,0 +1,26 @@ +import pytest + + +def test_long_text(is_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +hafa í huga, að yfirlýsing þessi hefur það markmið að tryggja +almenna og raunhæfa viðurkenningu og vernd þeirra réttinda, +sem þar er lýst; +hafa í huga, að markmið Evrópuráðs er að koma á nánari einingu +aðildarríkjanna og að ein af leiðunum að því marki er sú, að +mannréttindi og mannfrelsi séu í heiðri höfð og efld; +lýsa á ný eindreginni trú sinni á það mannfrelsi, sem er undirstaða +réttlætis og friðar í heiminum og best er tryggt, annars vegar með +virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi +og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins; +""" + tokens = is_tokenizer(text) + assert len(tokens) == 120 + + +@pytest.mark.xfail +def test_ordinal_number(is_tokenizer): + text = "10. desember 1948" + tokens = is_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/is/test_tokenizer.py b/spacy/tests/lang/is/test_tokenizer.py new file mode 100644 index 000000000..0c05a6050 --- /dev/null +++ b/spacy/tests/lang/is/test_tokenizer.py @@ -0,0 +1,30 @@ +import pytest + +IS_BASIC_TOKENIZATION_TESTS = [ + ( + "Enginn maður skal sæta pyndingum eða ómannlegri eða " + "vanvirðandi meðferð eða refsingu. 
", + [ + "Enginn", + "maður", + "skal", + "sæta", + "pyndingum", + "eða", + "ómannlegri", + "eða", + "vanvirðandi", + "meðferð", + "eða", + "refsingu", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS) +def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens): + tokens = is_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/it/test_noun_chunks.py b/spacy/tests/lang/it/test_noun_chunks.py new file mode 100644 index 000000000..0a8c10e79 --- /dev/null +++ b/spacy/tests/lang/it/test_noun_chunks.py @@ -0,0 +1,221 @@ +from spacy.tokens import Doc +import pytest + + +# fmt: off +@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # un pollo -> un pollo + ( + ["un", "pollo"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0,2)], + ), + # two determiners + noun + # il mio cane -> il mio cane + ( + ["il", "mio", "cane"], + [2, 2, 2], + ["det", "det:poss", "ROOT"], + ["DET", "DET", "NOUN"], + [(0,3)], + ), + # two determiners, one is after noun. rare usage but still testing + # il cane mio-> il cane mio + ( + ["il", "cane", "mio"], + [1, 1, 1], + ["det", "ROOT", "det:poss"], + ["DET", "NOUN", "DET"], + [(0,3)], + ), + # relative pronoun + # È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty. + ( + ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"], + [2, 2, 2, 4, 2, 7, 7, 4], + ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'], + ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'], + [(3,5), (5,6)] + ), + # relative subclause + # il computer che hai comprato -> il computer, che the computer that you bought + ( + ['il', 'computer', 'che', 'hai', 'comprato'], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'], + ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'], + [(0,2), (2,3)] + ), + # det + noun + adj + # Una macchina grande -> Una macchina grande + ( + ["Una", "macchina", "grande"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0,3)], + ), + # noun + adj plural + # mucche bianche + ( + ["mucche", "bianche"], + [0, 0], + ["ROOT", "amod"], + ["NOUN", "ADJ"], + [(0,2)], + ), + # det + adj + noun + # Una grande macchina -> Una grande macchina + ( + ['Una', 'grande', 'macchina'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # det + adj + noun, det with apostrophe + # un'importante associazione -> un'importante associazione + ( + ["Un'", 'importante', 'associazione'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # multiple adjectives + # Un cane piccolo e marrone -> Un cane piccolo e marrone + ( + ["Un", "cane", "piccolo", "e", "marrone"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # determiner, adjective, compound created by flat + # le Nazioni Unite -> le Nazioni Unite + ( + ["le", "Nazioni", "Unite"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers + ( + ['alcuni', 'contadini', 'molto', 'ricchi'], + [1, 1, 3, 1], + ['det', 'ROOT', 'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Ho 
un cane e un gatto -> un cane, un gatto + ( + ['Ho', 'un', 'cane', 'e', 'un', 'gatto'], + [0, 2, 0, 5, 5, 0], + ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(1,3), (4,6)] + + ), + # Two NPs together + # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado + ( + ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'nmod', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # Noun compound, person name and titles + # Dom Pedro II -> Dom Pedro II + ( + ["Dom", "Pedro", "II"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound created by flat + # gli Stati Uniti + ( + ["gli", "Stati", "Uniti"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # nmod relation between NPs + # la distruzione della città -> la distruzione, città + ( + ['la', 'distruzione', 'della', 'città'], + [1, 1, 3, 1], + ['det', 'ROOT', 'case', 'nmod'], + ['DET', 'NOUN', 'ADP', 'NOUN'], + [(0,2), (3,4)] + ), + # Compounding by nmod, several NPs chained together + # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo + ( + ["la", "prima", "fabbrica", "di", "droga", "del", "governo"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana + ( + ['Traduzione', 'del', 'rapporto', 'di', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica + ( + ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'], + [1, 1, 1, 4, 1, 8, 8, 8, 1], + ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'], + ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'], + [(0,3), (4,5), (6,9)] + ), + # Passive subject + # La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton + ( + ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'], + [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0, 3), (6, 8), (9, 10), (11,12)] + ), + # Misc + # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti + ( + ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'], + [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17], + ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'], + ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 
'NOUN'], + [(2,4), (9,12), (13,14), (17,18), (19,20)] + ) + ], +) +# fmt: on +def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + +def test_noun_chunks_is_parsed_it(it_tokenizer): + """Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed.""" + doc = it_tokenizer("Sei andato a Oxford") + with pytest.raises(ValueError): + list(doc.noun_chunks) diff --git a/spacy/tests/lang/it/test_stopwords.py b/spacy/tests/lang/it/test_stopwords.py new file mode 100644 index 000000000..954913164 --- /dev/null +++ b/spacy/tests/lang/it/test_stopwords.py @@ -0,0 +1,17 @@ +import pytest + + +@pytest.mark.parametrize( + "word", ["un", "lo", "dell", "dall", "si", "ti", "mi", "quest", "quel", "quello"] +) +def test_stopwords_basic(it_tokenizer, word): + tok = it_tokenizer(word)[0] + assert tok.is_stop + + +@pytest.mark.parametrize( + "word", ["quest'uomo", "l'ho", "un'amica", "dell'olio", "s'arrende", "m'ascolti"] +) +def test_stopwords_elided(it_tokenizer, word): + tok = it_tokenizer(word)[0] + assert tok.is_stop diff --git a/spacy/tests/lang/it/test_text.py b/spacy/tests/lang/it/test_text.py new file mode 100644 index 000000000..6023a20b1 --- /dev/null +++ b/spacy/tests/lang/it/test_text.py @@ -0,0 +1,14 @@ +import pytest + + +@pytest.mark.issue(2822) +def test_issue2822(it_tokenizer): + """Test that the abbreviation of poco is kept as one word.""" + doc = it_tokenizer("Vuoi un po' di zucchero?") + assert len(doc) == 6 + assert doc[0].text == "Vuoi" + assert doc[1].text == "un" + assert doc[2].text == "po'" + assert doc[3].text == "di" + assert doc[4].text == "zucchero" + assert doc[5].text == "?" 
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 3437ea283..ef7bed06d 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -54,6 +54,18 @@ SUB_TOKEN_TESTS = [ # fmt: on +@pytest.mark.issue(2901) +def test_issue2901(): + """Test that `nlp` doesn't fail.""" + try: + nlp = Japanese() + except ImportError: + pytest.skip() + + doc = nlp("pythonが大好きです") + assert doc + + @pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) def test_ja_tokenizer(ja_tokenizer, text, expected_tokens): tokens = [token.text for token in ja_tokenizer(text)] diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py index eac309857..e6b65dee9 100644 --- a/spacy/tests/lang/ko/test_tokenizer.py +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -47,3 +47,23 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos): def test_ko_empty_doc(ko_tokenizer): tokens = ko_tokenizer("") assert len(tokens) == 0 + + +# fmt: off +SPACY_TOKENIZER_TESTS = [ + ("있다.", "있다 ."), + ("'예'는", "' 예 ' 는"), + ("부 (富) 는", "부 ( 富 ) 는"), + ("부(富)는", "부 ( 富 ) 는"), + ("1982~1983.", "1982 ~ 1983 ."), + ("사과·배·복숭아·수박은 모두 과일이다.", "사과 · 배 · 복숭아 · 수박은 모두 과일이다 ."), + ("그렇구나~", "그렇구나~"), + ("『9시 반의 당구』,", "『 9시 반의 당구 』 ,"), +] +# fmt: on + + +@pytest.mark.parametrize("text,expected_tokens", SPACY_TOKENIZER_TESTS) +def test_ko_spacy_tokenizer(ko_tokenizer_tokenizer, text, expected_tokens): + tokens = [token.text for token in ko_tokenizer_tokenizer(text)] + assert tokens == expected_tokens.split() diff --git a/spacy/tests/lang/lv/__init__.py b/spacy/tests/lang/lv/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/lv/test_text.py b/spacy/tests/lang/lv/test_text.py new file mode 100644 index 000000000..5ca5fd0a7 --- /dev/null +++ b/spacy/tests/lang/lv/test_text.py @@ -0,0 +1,27 @@ +import pytest + + +def test_long_text(lv_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +Ievērodamas, ka šī deklarācija paredz nodrošināt vispārēju un +efektīvu tajā pasludināto tiesību atzīšanu un ievērošanu; +Ievērodamas, ka Eiropas Padomes mērķis ir panākt lielāku vienotību +tās dalībvalstu starpā un ka viens no līdzekļiem, kā šo mērķi +sasniegt, ir cilvēka tiesību un pamatbrīvību ievērošana un turpmāka +īstenošana; +No jauna apliecinādamas patiesu pārliecību, ka šīs pamatbrīvības +ir taisnīguma un miera pamats visā pasaulē un ka tās vislabāk var +nodrošināt patiess demokrātisks politisks režīms no vienas puses un +vispārējo cilvēktiesību, uz kurām tās pamatojas, kopīga izpratne un +ievērošana no otras puses; +""" + tokens = lv_tokenizer(text) + assert len(tokens) == 109 + + +@pytest.mark.xfail +def test_ordinal_number(lv_tokenizer): + text = "10. 
decembrī" + tokens = lv_tokenizer(text) + assert len(tokens) == 2 diff --git a/spacy/tests/lang/lv/test_tokenizer.py b/spacy/tests/lang/lv/test_tokenizer.py new file mode 100644 index 000000000..3ce7ad5fa --- /dev/null +++ b/spacy/tests/lang/lv/test_tokenizer.py @@ -0,0 +1,30 @@ +import pytest + +LV_BASIC_TOKENIZATION_TESTS = [ + ( + "Nevienu nedrīkst spīdzināt vai cietsirdīgi vai pazemojoši ar viņu " + "apieties vai sodīt.", + [ + "Nevienu", + "nedrīkst", + "spīdzināt", + "vai", + "cietsirdīgi", + "vai", + "pazemojoši", + "ar", + "viņu", + "apieties", + "vai", + "sodīt", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", LV_BASIC_TOKENIZATION_TESTS) +def test_lv_tokenizer_basic(lv_tokenizer, text, expected_tokens): + tokens = lv_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/sk/__init__.py b/spacy/tests/lang/sk/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/sk/test_text.py b/spacy/tests/lang/sk/test_text.py new file mode 100644 index 000000000..62ea2a783 --- /dev/null +++ b/spacy/tests/lang/sk/test_text.py @@ -0,0 +1,48 @@ +import pytest + + +def test_long_text(sk_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +majúc na zreteli, že cieľom tejto deklarácie je zabezpečiť všeobecné +a účinné uznávanie a dodržiavanie práv v nej vyhlásených; +majúc na zreteli, že cieľom Rady Európy je dosiahnutie väčšej +jednoty medzi jej členmi, a že jedným zo spôsobov, ktorým sa +má tento cieľ napĺňať, je ochrana a ďalší rozvoj ľudských práv +a základných slobôd; +znovu potvrdzujúc svoju hlbokú vieru v tie základné slobody, ktoré +sú základom spravodlivosti a mieru vo svete, a ktoré sú najlepšie +zachovávané na jednej strane účinnou politickou demokraciou +a na strane druhej spoločným poňatím a dodržiavaním ľudských +práv, od ktorých závisia; + """ + tokens = sk_tokenizer(text) + assert len(tokens) == 118 + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("štyri", True), + ("devätnásť", True), + ("milión", True), + ("pes", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(sk_tokenizer, text, match): + tokens = sk_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + + +@pytest.mark.xfail +def test_ordinal_number(sk_tokenizer): + text = "10. 
decembra 1948" + tokens = sk_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/sk/test_tokenizer.py b/spacy/tests/lang/sk/test_tokenizer.py new file mode 100644 index 000000000..247847284 --- /dev/null +++ b/spacy/tests/lang/sk/test_tokenizer.py @@ -0,0 +1,15 @@ +import pytest + +SK_BASIC_TOKENIZATION_TESTS = [ + ( + "Kedy sa narodil Andrej Kiska?", + ["Kedy", "sa", "narodil", "Andrej", "Kiska", "?"], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", SK_BASIC_TOKENIZATION_TESTS) +def test_sk_tokenizer_basic(sk_tokenizer, text, expected_tokens): + tokens = sk_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/sl/__init__.py b/spacy/tests/lang/sl/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/sl/test_text.py b/spacy/tests/lang/sl/test_text.py new file mode 100644 index 000000000..ddc5b6b5d --- /dev/null +++ b/spacy/tests/lang/sl/test_text.py @@ -0,0 +1,27 @@ +import pytest + + +def test_long_text(sl_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +upoštevajoč, da si ta deklaracija prizadeva zagotoviti splošno in +učinkovito priznavanje in spoštovanje v njej razglašenih pravic, +upoštevajoč, da je cilj Sveta Evrope doseči večjo enotnost med +njegovimi članicami, in da je eden izmed načinov za zagotavljanje +tega cilja varstvo in nadaljnji razvoj človekovih pravic in temeljnih +svoboščin, +ponovno potrjujoč svojo globoko vero v temeljne svoboščine, na +katerih temeljita pravičnost in mir v svetu, in ki jih je mogoče najbolje +zavarovati na eni strani z dejansko politično demokracijo in na drugi +strani s skupnim razumevanjem in spoštovanjem človekovih pravic, +od katerih so te svoboščine odvisne, +""" + tokens = sl_tokenizer(text) + assert len(tokens) == 116 + + +@pytest.mark.xfail +def test_ordinal_number(sl_tokenizer): + text = "10. 
decembra 1948" + tokens = sl_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/sl/test_tokenizer.py b/spacy/tests/lang/sl/test_tokenizer.py new file mode 100644 index 000000000..f2b15b0ff --- /dev/null +++ b/spacy/tests/lang/sl/test_tokenizer.py @@ -0,0 +1,32 @@ +import pytest + +SL_BASIC_TOKENIZATION_TESTS = [ + ( + "Vsakdo ima pravico do spoštovanja njegovega zasebnega in " + "družinskega življenja, doma in dopisovanja.", + [ + "Vsakdo", + "ima", + "pravico", + "do", + "spoštovanja", + "njegovega", + "zasebnega", + "in", + "družinskega", + "življenja", + ",", + "doma", + "in", + "dopisovanja", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", SL_BASIC_TOKENIZATION_TESTS) +def test_sl_tokenizer_basic(sl_tokenizer, text, expected_tokens): + tokens = sl_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/sq/__init__.py b/spacy/tests/lang/sq/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/sq/test_text.py b/spacy/tests/lang/sq/test_text.py new file mode 100644 index 000000000..44eedaa54 --- /dev/null +++ b/spacy/tests/lang/sq/test_text.py @@ -0,0 +1,25 @@ +import pytest + + +def test_long_text(sq_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +Qeveritë nënshkruese, anëtare të Këshillit të Evropës, +Duke pasur parasysh Deklaratën Universale të të Drejtave të +Njeriut, të shpallur nga Asambleja e Përgjithshme e Kombeve të +Bashkuara më 10 dhjetor 1948; +Duke pasur parasysh, se kjo Deklaratë ka për qëllim të sigurojë +njohjen dhe zbatimin universal dhe efektiv të të drejtave të +shpallura në të; +Duke pasur parasysh se qëllimi i Këshillit të Evropës është që të +realizojë një bashkim më të ngushtë midis anëtarëve të tij dhe +se një nga mjetet për të arritur këtë qëllim është mbrojtja dhe +zhvillimi i të drejtave të njeriut dhe i lirive themelore; +Duke ripohuar besimin e tyre të thellë në këto liri themelore që +përbëjnë themelet e drejtësisë dhe të paqes në botë, ruajtja e të +cilave mbështetet kryesisht mbi një regjim politik demokratik nga +njëra anë, dhe nga ana tjetër mbi një kuptim dhe respektim të +përbashkët të të drejtave të njeriut nga të cilat varen; +""" + tokens = sq_tokenizer(text) + assert len(tokens) == 182 diff --git a/spacy/tests/lang/sq/test_tokenizer.py b/spacy/tests/lang/sq/test_tokenizer.py new file mode 100644 index 000000000..8fd25f588 --- /dev/null +++ b/spacy/tests/lang/sq/test_tokenizer.py @@ -0,0 +1,31 @@ +import pytest + +SQ_BASIC_TOKENIZATION_TESTS = [ + ( + "Askush nuk mund t’i nënshtrohet torturës ose dënimeve ose " + "trajtimeve çnjerëzore ose poshtëruese.", + [ + "Askush", + "nuk", + "mund", + "t’i", + "nënshtrohet", + "torturës", + "ose", + "dënimeve", + "ose", + "trajtimeve", + "çnjerëzore", + "ose", + "poshtëruese", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", SQ_BASIC_TOKENIZATION_TESTS) +def test_sq_tokenizer_basic(sq_tokenizer, text, expected_tokens): + tokens = sq_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/sv/test_exceptions.py b/spacy/tests/lang/sv/test_exceptions.py index e6cae4d2b..b49a0c832 100644 --- a/spacy/tests/lang/sv/test_exceptions.py +++ b/spacy/tests/lang/sv/test_exceptions.py @@ -1,6 +1,5 @@ import pytest - SV_TOKEN_EXCEPTION_TESTS = [ ( "Smörsåsen används bl.a. 
till fisk", @@ -17,6 +16,26 @@ SV_TOKEN_EXCEPTION_TESTS = [ ] +@pytest.mark.issue(805) +@pytest.mark.parametrize( + "text,expected_tokens", + [ + ( + "Smörsåsen används bl.a. till fisk", + ["Smörsåsen", "används", "bl.a.", "till", "fisk"], + ), + ( + "Jag kommer först kl. 13 p.g.a. diverse förseningar", + ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"], + ), + ], +) +def test_issue805(sv_tokenizer, text, expected_tokens): + tokens = sv_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list + + @pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS) def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens): tokens = sv_tokenizer(text) diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 6a7a404fd..1c27c1744 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -1,6 +1,16 @@ import pytest -from spacy.attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA -from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape +from spacy.attrs import intify_attrs, ENT_IOB + +from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs +from spacy.lang.en.stop_words import STOP_WORDS +from spacy.lang.lex_attrs import is_ascii, is_currency, is_punct, is_stop +from spacy.lang.lex_attrs import like_url, word_shape + + +@pytest.mark.parametrize("word", ["the"]) +@pytest.mark.issue(1889) +def test_issue1889(word): + assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) @pytest.mark.parametrize("text", ["dog"]) @@ -24,6 +34,38 @@ def test_attrs_do_deprecated(text): assert int_attrs == {ORTH: 10, IS_ALPHA: True} +def test_attrs_ent_iob_intify(): + int_attrs = intify_attrs({"ENT_IOB": ""}) + assert int_attrs == {ENT_IOB: 0} + + int_attrs = intify_attrs({"ENT_IOB": "I"}) + assert int_attrs == {ENT_IOB: 1} + + int_attrs = intify_attrs({"ENT_IOB": "O"}) + assert int_attrs == {ENT_IOB: 2} + + int_attrs = intify_attrs({"ENT_IOB": "B"}) + assert int_attrs == {ENT_IOB: 3} + + int_attrs = intify_attrs({ENT_IOB: ""}) + assert int_attrs == {ENT_IOB: 0} + + int_attrs = intify_attrs({ENT_IOB: "I"}) + assert int_attrs == {ENT_IOB: 1} + + int_attrs = intify_attrs({ENT_IOB: "O"}) + assert int_attrs == {ENT_IOB: 2} + + int_attrs = intify_attrs({ENT_IOB: "B"}) + assert int_attrs == {ENT_IOB: 3} + + with pytest.raises(ValueError): + int_attrs = intify_attrs({"ENT_IOB": "XX"}) + + with pytest.raises(ValueError): + int_attrs = intify_attrs({ENT_IOB: "XX"}) + + @pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)]) def test_lex_attrs_is_punct(text, match): assert is_punct(text) == match diff --git a/spacy/tests/lang/xx/__init__.py b/spacy/tests/lang/xx/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/xx/test_text.py b/spacy/tests/lang/xx/test_text.py new file mode 100644 index 000000000..477f0ebe2 --- /dev/null +++ b/spacy/tests/lang/xx/test_text.py @@ -0,0 +1,24 @@ +import pytest + + +def test_long_text(xx_tokenizer): + # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi + text = """ +Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest. +Alggmeer kriteeʹr vuâđđâʹvve meeraikõskksaž tuâjjorganisaatio, ILO, suåppmõʹšše nââmar 169. 
+Suåppmõõžž mieʹldd jiõččvälddsaž jânnmin jälsteei meeraid ââʹnet alggmeeran, +ko sij puõlvvâʹvve naroodâst, kååʹtt jânnam välddmõõžž leʹbe aazztummuž leʹbe ânnʼjõž riikkraaʹji šõddâm ääiʹj jälste +jânnmest leʹbe tõn mäddtiõđlaž vuuʹdest, koozz jânnam kooll. Alggmeer ij leäkku mieʹrreei sââʹjest jiiʹjjes jälstemvuuʹdest. +Alggmeer âlgg jiõčč ââʹnned jiiʹjjes alggmeeran leʹbe leeʹd tõn miõlâst, što sij lie alggmeer. +Alggmeer lij õlggâm seeilted vuõiggâdvuõđlaž sââʹjest huõlǩâni obbnes leʹbe vueʹzzi jiiʹjjes sosiaalʼlaž, täälʼlaž, +kulttuurlaž da poliittlaž instituutioid. + +Säʹmmlai statuuzz ǩeeʹrjteš Lääʹddjânnam vuâđđläkka eeʹjj 1995. Säʹmmlain alggmeeran lij vuõiggâdvuõtt tuõʹllʼjed da +ooudâsviikkâd ǩiõlâz da kulttuurâz di tõõzz kuulli ääʹrbvuâlaž jieʹllemvueʹjjeez. Sääʹmǩiõl ââʹnnmest veʹrǧǧniiʹǩǩi +åʹrnn lij šiõttuum jiiʹjjes lääʹǩǩ. Säʹmmlain lij leämmaž eeʹjjest 1996 vueʹljeeʹl dommvuuʹdsteez ǩiõlâz da kulttuurâz kuõskki +vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuulli tuâjaid håidd säʹmmlai vaalin vaʹlljääm parlameʹntt, +Sääʹmteʹǧǧ. +""" + + tokens = xx_tokenizer(text) + assert len(tokens) == 179 diff --git a/spacy/tests/lang/xx/test_tokenizer.py b/spacy/tests/lang/xx/test_tokenizer.py new file mode 100644 index 000000000..15c760a6b --- /dev/null +++ b/spacy/tests/lang/xx/test_tokenizer.py @@ -0,0 +1,25 @@ +import pytest + +XX_BASIC_TOKENIZATION_TESTS = [ + ( + "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel", + [ + "Lääʹddjânnmest", + "lie", + "nuʹtt", + "10", + "000", + "säʹmmliʹžžed", + ".", + "Seeʹst", + "pâʹjjel", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS) +def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens): + tokens = xx_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index c02d65cdf..a27baf130 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab): matcher = Matcher(en_vocab) matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]]) assert len(matcher(doc)) == 0 + + +def test_matcher_ent_iob_key(en_vocab): + """Test that patterns with ent_iob works correctly.""" + matcher = Matcher(en_vocab) + matcher.add("Rule", [[{"ENT_IOB": "I"}]]) + doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"]) + doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")] + doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"]) + doc2.ents = [Span(doc2, 4, 5, label="PERSON")] + matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)] + matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)] + assert len(matches1) == 1 + assert matches1[0] == "York" + assert len(matches2) == 0 + + matcher = Matcher(en_vocab) # Test iob pattern with operators + matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]]) + doc = Doc( + en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"] + ) + doc.ents = [Span(doc, 4, 7, label="PERSON")] + matches = [doc[start:end].text for _, start, end in matcher(doc)] + assert len(matches) == 3 + assert matches[0] == "Maria" + assert matches[1] == "Maria Esperanza" + assert matches[2] == "Esperanza" diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index 
b96bb2032..3649b07ed 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -1,10 +1,14 @@ -import pytest import re -from spacy.lang.en import English -from spacy.matcher import Matcher -from spacy.tokens import Doc, Span +import pytest +from spacy.attrs import IS_PUNCT, LOWER, ORTH +from spacy.errors import MatchPatternError +from spacy.lang.en import English +from spacy.lang.lex_attrs import LEX_ATTRS +from spacy.matcher import Matcher +from spacy.tokens import Doc, Span, Token +from spacy.vocab import Vocab pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}] pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A"}] @@ -36,6 +40,473 @@ def doc(en_tokenizer, text): return doc +@pytest.mark.issue(118) +@pytest.mark.parametrize( + "patterns", + [ + [[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], + [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]], + ], +) +def test_issue118(en_tokenizer, patterns): + """Test a bug that arose from having overlapping matches""" + text = ( + "how many points did lebron james score against the boston celtics last night" + ) + doc = en_tokenizer(text) + ORG = doc.vocab.strings["ORG"] + matcher = Matcher(doc.vocab) + matcher.add("BostonCeltics", patterns) + assert len(list(doc.ents)) == 0 + matches = [(ORG, start, end) for _, start, end in matcher(doc)] + assert matches == [(ORG, 9, 11), (ORG, 10, 11)] + doc.ents = matches[:1] + ents = list(doc.ents) + assert len(ents) == 1 + assert ents[0].label == ORG + assert ents[0].start == 9 + assert ents[0].end == 11 + + +@pytest.mark.issue(118) +@pytest.mark.parametrize( + "patterns", + [ + [[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], + [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]], + ], +) +def test_issue118_prefix_reorder(en_tokenizer, patterns): + """Test a bug that arose from having overlapping matches""" + text = ( + "how many points did lebron james score against the boston celtics last night" + ) + doc = en_tokenizer(text) + ORG = doc.vocab.strings["ORG"] + matcher = Matcher(doc.vocab) + matcher.add("BostonCeltics", patterns) + assert len(list(doc.ents)) == 0 + matches = [(ORG, start, end) for _, start, end in matcher(doc)] + doc.ents += tuple(matches)[1:] + assert matches == [(ORG, 9, 10), (ORG, 9, 11)] + ents = doc.ents + assert len(ents) == 1 + assert ents[0].label == ORG + assert ents[0].start == 9 + assert ents[0].end == 11 + + +@pytest.mark.issue(242) +def test_issue242(en_tokenizer): + """Test overlapping multi-word phrases.""" + text = "There are different food safety standards in different countries." 
+ patterns = [ + [{"LOWER": "food"}, {"LOWER": "safety"}], + [{"LOWER": "safety"}, {"LOWER": "standards"}], + ] + doc = en_tokenizer(text) + matcher = Matcher(doc.vocab) + matcher.add("FOOD", patterns) + matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)] + match1, match2 = matches + assert match1[1] == 3 + assert match1[2] == 5 + assert match2[1] == 4 + assert match2[2] == 6 + with pytest.raises(ValueError): + # One token can only be part of one entity, so test that the matches + # can't be added as entities + doc.ents += tuple(matches) + + +@pytest.mark.issue(587) +def test_issue587(en_tokenizer): + """Test that Matcher doesn't segfault on particular input""" + doc = en_tokenizer("a b; c") + matcher = Matcher(doc.vocab) + matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]]) + matches = matcher(doc) + assert len(matches) == 1 + matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]]) + matches = matcher(doc) + assert len(matches) == 2 + matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]]) + matches = matcher(doc) + assert len(matches) == 2 + + +@pytest.mark.issue(588) +def test_issue588(en_vocab): + """Test if empty specs still cause an error when adding patterns""" + matcher = Matcher(en_vocab) + with pytest.raises(ValueError): + matcher.add("TEST", [[]]) + + +@pytest.mark.issue(590) +def test_issue590(en_vocab): + """Test overlapping matches""" + doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"]) + matcher = Matcher(en_vocab) + matcher.add( + "ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]] + ) + matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]]) + matches = matcher(doc) + assert len(matches) == 2 + + +@pytest.mark.issue(615) +def test_issue615(en_tokenizer): + def merge_phrases(matcher, doc, i, matches): + """Merge a phrase. We have to be careful here because we'll change the + token indices. To avoid problems, merge all the phrases once we're called + on the last match.""" + if i != len(matches) - 1: + return None + spans = [Span(doc, start, end, label=label) for label, start, end in matches] + with doc.retokenize() as retokenizer: + for span in spans: + tag = "NNP" if span.label_ else span.root.tag_ + attrs = {"tag": tag, "lemma": span.text} + retokenizer.merge(span, attrs=attrs) + doc.ents = doc.ents + (span,) + + text = "The golf club is broken" + pattern = [{"ORTH": "golf"}, {"ORTH": "club"}] + label = "Sport_Equipment" + doc = en_tokenizer(text) + matcher = Matcher(doc.vocab) + matcher.add(label, [pattern], on_match=merge_phrases) + matcher(doc) + entities = list(doc.ents) + assert entities != [] + assert entities[0].label != 0 + + +@pytest.mark.issue(850) +def test_issue850(): + """The variable-length pattern matches the succeeding token. 
Check we + handle the ambiguity correctly.""" + vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) + matcher = Matcher(vocab) + pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}] + matcher.add("FarAway", [pattern]) + doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) + match = matcher(doc) + assert len(match) == 1 + ent_id, start, end = match[0] + assert start == 0 + assert end == 4 + + +@pytest.mark.issue(850) +def test_issue850_basic(): + """Test Matcher matches with '*' operator and Boolean flag""" + vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) + matcher = Matcher(vocab) + pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}] + matcher.add("FarAway", [pattern]) + doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) + match = matcher(doc) + assert len(match) == 1 + ent_id, start, end = match[0] + assert start == 0 + assert end == 4 + + +@pytest.mark.issue(1434) +def test_issue1434(): + """Test matches occur when optional element at end of short doc.""" + pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}] + vocab = Vocab(lex_attr_getters=LEX_ATTRS) + hello_world = Doc(vocab, words=["Hello", "World"]) + hello = Doc(vocab, words=["Hello"]) + matcher = Matcher(vocab) + matcher.add("MyMatcher", [pattern]) + matches = matcher(hello_world) + assert matches + matches = matcher(hello) + assert matches + + +@pytest.mark.parametrize( + "string,start,end", + [ + ("a", 0, 1), + ("a b", 0, 2), + ("a c", 0, 1), + ("a b c", 0, 2), + ("a b b c", 0, 3), + ("a b b", 0, 3), + ], +) +@pytest.mark.issue(1450) +def test_issue1450(string, start, end): + """Test matcher works when patterns end with * operator.""" + pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] + matcher = Matcher(Vocab()) + matcher.add("TSTEND", [pattern]) + doc = Doc(Vocab(), words=string.split()) + matches = matcher(doc) + if start is None or end is None: + assert matches == [] + assert matches[-1][1] == start + assert matches[-1][2] == end + + +@pytest.mark.issue(1945) +def test_issue1945(): + """Test regression in Matcher introduced in v2.0.6.""" + matcher = Matcher(Vocab()) + matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]]) + doc = Doc(matcher.vocab, words=["a", "a", "a"]) + matches = matcher(doc) # we should see two overlapping matches here + assert len(matches) == 2 + assert matches[0][1:] == (0, 2) + assert matches[1][1:] == (1, 3) + + +@pytest.mark.issue(1971) +def test_issue1971(en_vocab): + # Possibly related to #2675 and #2671? + matcher = Matcher(en_vocab) + pattern = [ + {"ORTH": "Doe"}, + {"ORTH": "!", "OP": "?"}, + {"_": {"optional": True}, "OP": "?"}, + {"ORTH": "!", "OP": "?"}, + ] + Token.set_extension("optional", default=False) + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"]) + # We could also assert length 1 here, but this is more conclusive, because + # the real problem here is that it returns a duplicate match for a match_id + # that's not actually in the vocab! 
+ matches = matcher(doc) + assert all([match_id in en_vocab.strings for match_id, start, end in matches]) + + +@pytest.mark.issue(1971) +def test_issue_1971_2(en_vocab): + matcher = Matcher(en_vocab) + pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}] + pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}] + doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"]) + matcher.add("TEST1", [pattern1, pattern2]) + matches = matcher(doc) + assert len(matches) == 2 + + +@pytest.mark.issue(1971) +def test_issue_1971_3(en_vocab): + """Test that pattern matches correctly for multiple extension attributes.""" + Token.set_extension("a", default=1, force=True) + Token.set_extension("b", default=2, force=True) + doc = Doc(en_vocab, words=["hello", "world"]) + matcher = Matcher(en_vocab) + matcher.add("A", [[{"_": {"a": 1}}]]) + matcher.add("B", [[{"_": {"b": 2}}]]) + matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc)) + assert len(matches) == 4 + assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)]) + + +@pytest.mark.issue(1971) +def test_issue_1971_4(en_vocab): + """Test that pattern matches correctly with multiple extension attribute + values on a single token. + """ + Token.set_extension("ext_a", default="str_a", force=True) + Token.set_extension("ext_b", default="str_b", force=True) + matcher = Matcher(en_vocab) + doc = Doc(en_vocab, words=["this", "is", "text"]) + pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 + matcher.add("TEST", [pattern]) + matches = matcher(doc) + # Uncommenting this caused a segmentation fault + assert len(matches) == 1 + assert matches[0] == (en_vocab.strings["TEST"], 0, 3) + + +@pytest.mark.issue(2464) +def test_issue2464(en_vocab): + """Test problem with successive ?. This is the same bug, so putting it here.""" + matcher = Matcher(en_vocab) + doc = Doc(en_vocab, words=["a", "b"]) + matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]]) + matches = matcher(doc) + assert len(matches) == 3 + + +@pytest.mark.issue(2569) +def test_issue2569(en_tokenizer): + """Test that operator + is greedy.""" + doc = en_tokenizer("It is May 15, 1993.") + doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])] + matcher = Matcher(doc.vocab) + matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]]) + matched = [doc[start:end] for _, start, end in matcher(doc)] + matched = sorted(matched, key=len, reverse=True) + assert len(matched) == 10 + assert len(matched[0]) == 4 + assert matched[0].text == "May 15, 1993" + + +@pytest.mark.issue(2671) +def test_issue2671(): + """Ensure the correct entity ID is returned for matches with quantifiers. 
+ See also #2675 + """ + nlp = English() + matcher = Matcher(nlp.vocab) + pattern_id = "test_pattern" + pattern = [ + {"LOWER": "high"}, + {"IS_PUNCT": True, "OP": "?"}, + {"LOWER": "adrenaline"}, + ] + matcher.add(pattern_id, [pattern]) + doc1 = nlp("This is a high-adrenaline situation.") + doc2 = nlp("This is a high adrenaline situation.") + matches1 = matcher(doc1) + for match_id, start, end in matches1: + assert nlp.vocab.strings[match_id] == pattern_id + matches2 = matcher(doc2) + for match_id, start, end in matches2: + assert nlp.vocab.strings[match_id] == pattern_id + + +@pytest.mark.issue(3009) +def test_issue3009(en_vocab): + """Test problem with matcher quantifiers""" + patterns = [ + [{"ORTH": "has"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}], + [ + {"ORTH": "has"}, + {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, + {"LOWER": "to"}, + {"LOWER": "do"}, + {"TAG": "IN"}, + ], + [ + {"ORTH": "has"}, + {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, + {"LOWER": "to"}, + {"LOWER": "do"}, + {"TAG": "IN"}, + ], + ] + words = ["also", "has", "to", "do", "with"] + tags = ["RB", "VBZ", "TO", "VB", "IN"] + pos = ["ADV", "VERB", "ADP", "VERB", "ADP"] + doc = Doc(en_vocab, words=words, tags=tags, pos=pos) + matcher = Matcher(en_vocab) + for i, pattern in enumerate(patterns): + matcher.add(str(i), [pattern]) + matches = matcher(doc) + assert matches + + +@pytest.mark.issue(3328) +def test_issue3328(en_vocab): + doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) + matcher = Matcher(en_vocab) + patterns = [ + [{"LOWER": {"IN": ["hello", "how"]}}], + [{"LOWER": {"IN": ["you", "doing"]}}], + ] + matcher.add("TEST", patterns) + matches = matcher(doc) + assert len(matches) == 4 + matched_texts = [doc[start:end].text for _, start, end in matches] + assert matched_texts == ["Hello", "how", "you", "doing"] + + +@pytest.mark.issue(3549) +def test_issue3549(en_vocab): + """Test that match pattern validation doesn't raise on empty errors.""" + matcher = Matcher(en_vocab, validate=True) + pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] + matcher.add("GOOD", [pattern]) + with pytest.raises(MatchPatternError): + matcher.add("BAD", [[{"X": "Y"}]]) + + +@pytest.mark.skip("Matching currently only works on strings and integers") +@pytest.mark.issue(3555) +def test_issue3555(en_vocab): + """Test that custom extensions with default None don't break matcher.""" + Token.set_extension("issue3555", default=None) + matcher = Matcher(en_vocab) + pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}] + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["have", "apple"]) + matcher(doc) + + +@pytest.mark.issue(3839) +def test_issue3839(en_vocab): + """Test that match IDs returned by the matcher are correct, are in the string""" + doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) + matcher = Matcher(en_vocab) + match_id = "PATTERN" + pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] + pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] + matcher.add(match_id, [pattern1]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + matcher = Matcher(en_vocab) + matcher.add(match_id, [pattern2]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + + +@pytest.mark.issue(3879) +def test_issue3879(en_vocab): + doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) + assert len(doc) == 5 + pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] + 
matcher = Matcher(en_vocab) + matcher.add("TEST", [pattern]) + assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' + + +@pytest.mark.issue(3951) +def test_issue3951(en_vocab): + """Test that combinations of optional rules are matched correctly.""" + matcher = Matcher(en_vocab) + pattern = [ + {"LOWER": "hello"}, + {"LOWER": "this", "OP": "?"}, + {"OP": "?"}, + {"LOWER": "world"}, + ] + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) + matches = matcher(doc) + assert len(matches) == 0 + + +@pytest.mark.issue(4120) +def test_issue4120(en_vocab): + """Test that matches without a final {OP: ?} token are returned.""" + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) + doc1 = Doc(en_vocab, words=["a"]) + assert len(matcher(doc1)) == 1 # works + doc2 = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc2)) == 2 # fixed + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) + doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc3)) == 2 # works + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) + doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc4)) == 3 # fixed + + @pytest.mark.parametrize( "pattern,re_pattern", [ diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 74feb7c5d..8c265785c 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -12,6 +12,7 @@ TEST_PATTERNS = [ ([{"IS_PUNCT": True, "OP": "$"}], 1, 1), ([{"_": "foo"}], 1, 1), ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1), + ([{"ENT_IOB": "foo"}], 1, 1), ([1, 2, 3], 3, 1), # Bad patterns flagged outside of Matcher ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 478949601..f893d81f8 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -1,8 +1,125 @@ import pytest import srsly from mock import Mock -from spacy.matcher import PhraseMatcher + +from spacy.lang.en import English +from spacy.matcher import PhraseMatcher, Matcher from spacy.tokens import Doc, Span +from spacy.vocab import Vocab + + +from ..util import make_tempdir + + +@pytest.mark.issue(3248) +def test_issue3248_1(): + """Test that the PhraseMatcher correctly reports its number of rules, not + total number of patterns.""" + nlp = English() + matcher = PhraseMatcher(nlp.vocab) + matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) + matcher.add("TEST2", [nlp("d")]) + assert len(matcher) == 2 + + +@pytest.mark.issue(3331) +def test_issue3331(en_vocab): + """Test that duplicate patterns for different rules result in multiple + matches, one per rule. 
+ """ + matcher = PhraseMatcher(en_vocab) + matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])]) + matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])]) + doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) + matches = matcher(doc) + assert len(matches) == 2 + match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] + assert sorted(match_ids) == ["A", "B"] + + +@pytest.mark.issue(3972) +def test_issue3972(en_vocab): + """Test that the PhraseMatcher returns duplicates for duplicate match IDs.""" + matcher = PhraseMatcher(en_vocab) + matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) + matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) + doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) + matches = matcher(doc) + + assert len(matches) == 2 + + # We should have a match for each of the two rules + found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] + assert "A" in found_ids + assert "B" in found_ids + + +@pytest.mark.issue(4002) +def test_issue4002(en_vocab): + """Test that the PhraseMatcher can match on overwritten NORM attributes.""" + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern1 = Doc(en_vocab, words=["c", "d"]) + assert [t.norm_ for t in pattern1] == ["c", "d"] + matcher.add("TEST", [pattern1]) + doc = Doc(en_vocab, words=["a", "b", "c", "d"]) + assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] + matches = matcher(doc) + assert len(matches) == 1 + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern2 = Doc(en_vocab, words=["1", "2"]) + pattern2[0].norm_ = "c" + pattern2[1].norm_ = "d" + assert [t.norm_ for t in pattern2] == ["c", "d"] + matcher.add("TEST", [pattern2]) + matches = matcher(doc) + assert len(matches) == 1 + + +@pytest.mark.issue(4373) +def test_issue4373(): + """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" + matcher = Matcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + matcher = PhraseMatcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + + +@pytest.mark.issue(4651) +def test_issue4651_with_phrase_matcher_attr(): + """Test that the EntityRuler PhraseMatcher is deserialized correctly using + the method from_disk when the EntityRuler argument phrase_matcher_attr is + specified. 
+ """ + text = "Spacy is a python library for nlp" + nlp = English() + patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] + ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) + ruler.add_patterns(patterns) + doc = nlp(text) + res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] + nlp_reloaded = English() + with make_tempdir() as d: + file_path = d / "entityruler" + ruler.to_disk(file_path) + nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path) + doc_reloaded = nlp_reloaded(text) + res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] + assert res == res_reloaded + + +@pytest.mark.issue(6839) +def test_issue6839(en_vocab): + """Ensure that PhraseMatcher accepts Span as input""" + # fmt: off + words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."] + # fmt: on + doc = Doc(en_vocab, words=words) + span = doc[:8] + pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("SPACY", [pattern]) + matches = matcher(span) + assert matches def test_matcher_phrase_matcher(en_vocab): diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index 75908df59..e20227455 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -12,6 +12,7 @@ def test_build_dependencies(): "flake8", "hypothesis", "pre-commit", + "black", "mypy", "types-dataclasses", "types-mock", diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index cba6fa81e..bb226f9c5 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -40,6 +40,28 @@ def arc_eager(vocab): return moves +@pytest.mark.issue(7056) +def test_issue7056(): + """Test that the Unshift transition works properly, and doesn't cause + sentence segmentation errors.""" + vocab = Vocab() + ae = ArcEager( + vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) + ) + doc = Doc(vocab, words="Severe pain , after trauma".split()) + state = ae.init_batch([doc])[0] + ae.apply_transition(state, "S") + ae.apply_transition(state, "L-amod") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "R-pobj") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + assert not state.eol() + + def test_oracle_four_words(arc_eager, vocab): words = ["a", "b", "c", "d"] heads = [1, 1, 3, 3] diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 21094bcb1..b3b29d1f9 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -1,13 +1,16 @@ +import random + import pytest from numpy.testing import assert_equal -from spacy.attrs import ENT_IOB +from spacy.attrs import ENT_IOB from spacy import util, registry from spacy.lang.en import English +from spacy.lang.it import Italian from spacy.language import Language from spacy.lookups import Lookups from spacy.pipeline._parser_internals.ner import BiluoPushDown -from spacy.training import Example +from spacy.training import Example, iob_to_biluo from spacy.tokens import Doc, Span from spacy.vocab import Vocab import logging @@ -58,6 +61,152 @@ def tsys(vocab, entity_types): return BiluoPushDown(vocab.strings, actions) +@pytest.mark.parametrize("label", ["U-JOB-NAME"]) 
+@pytest.mark.issue(1967) +def test_issue1967(label): + nlp = Language() + config = {} + ner = nlp.create_pipe("ner", config=config) + example = Example.from_dict( + Doc(ner.vocab, words=["word"]), + { + "ids": [0], + "words": ["word"], + "tags": ["tag"], + "heads": [0], + "deps": ["dep"], + "entities": [label], + }, + ) + assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1] + + +@pytest.mark.issue(2179) +def test_issue2179(): + """Test that spurious 'extra_labels' aren't created when initializing NER.""" + nlp = Italian() + ner = nlp.add_pipe("ner") + ner.add_label("CITIZENSHIP") + nlp.initialize() + nlp2 = Italian() + nlp2.add_pipe("ner") + assert len(nlp2.get_pipe("ner").labels) == 0 + model = nlp2.get_pipe("ner").model + model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves) + nlp2.from_bytes(nlp.to_bytes()) + assert "extra_labels" not in nlp2.get_pipe("ner").cfg + assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) + + +@pytest.mark.issue(2385) +def test_issue2385(): + """Test that IOB tags are correctly converted to BILUO tags.""" + # fix bug in labels with a 'b' character + tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER") + assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"] + # maintain support for iob1 format + tags2 = ("I-ORG", "I-ORG", "B-ORG") + assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"] + # maintain support for iob2 format + tags3 = ("B-PERSON", "I-PERSON", "B-PERSON") + assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"] + + +@pytest.mark.issue(2800) +def test_issue2800(): + """Test issue that arises when too many labels are added to NER model. + Used to cause segfault. + """ + nlp = English() + train_data = [] + train_data.extend( + [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})] + ) + entity_types = [str(i) for i in range(1000)] + ner = nlp.add_pipe("ner") + for entity_type in list(entity_types): + ner.add_label(entity_type) + optimizer = nlp.initialize() + for i in range(20): + losses = {} + random.shuffle(train_data) + for example in train_data: + nlp.update([example], sgd=optimizer, losses=losses, drop=0.5) + + +@pytest.mark.issue(3209) +def test_issue3209(): + """Test issue that occurred in spaCy nightly where NER labels were being + mapped to classes incorrectly after loading the model, when the labels + were added using ner.add_label(). 
+ """ + nlp = English() + ner = nlp.add_pipe("ner") + ner.add_label("ANIMAL") + nlp.initialize() + move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] + assert ner.move_names == move_names + nlp2 = English() + ner2 = nlp2.add_pipe("ner") + model = ner2.model + model.attrs["resize_output"](model, ner.moves.n_moves) + nlp2.from_bytes(nlp.to_bytes()) + assert ner2.move_names == move_names + + +@pytest.mark.issue(4267) +def test_issue4267(): + """Test that running an entity_ruler after ner gives consistent results""" + nlp = English() + ner = nlp.add_pipe("ner") + ner.add_label("PEOPLE") + nlp.initialize() + assert "ner" in nlp.pipe_names + # assert that we have correct IOB annotations + doc1 = nlp("hi") + assert doc1.has_annotation("ENT_IOB") + for token in doc1: + assert token.ent_iob == 2 + # add entity ruler and run again + patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + assert "entity_ruler" in nlp.pipe_names + assert "ner" in nlp.pipe_names + # assert that we still have correct IOB annotations + doc2 = nlp("hi") + assert doc2.has_annotation("ENT_IOB") + for token in doc2: + assert token.ent_iob == 2 + + +@pytest.mark.issue(4313) +def test_issue4313(): + """This should not crash or exit with some strange error code""" + beam_width = 16 + beam_density = 0.0001 + nlp = English() + config = { + "beam_width": beam_width, + "beam_density": beam_density, + } + ner = nlp.add_pipe("beam_ner", config=config) + ner.add_label("SOME_LABEL") + nlp.initialize() + # add a new label to the doc + doc = nlp("What do you think about Apple ?") + assert len(ner.labels) == 1 + assert "SOME_LABEL" in ner.labels + apple_ent = Span(doc, 5, 6, label="MY_ORG") + doc.ents = list(doc.ents) + [apple_ent] + + # ensure the beam_parse still works with the new label + docs = [doc] + ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density) + assert len(ner.labels) == 2 + assert "MY_ORG" in ner.labels + + def test_get_oracle_moves(tsys, doc, entity_annots): example = Example.from_dict(doc, {"entities": entity_annots}) act_classes = tsys.get_oracle_sequence(example, _debug=False) diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 3957e4d77..60d000c44 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -93,8 +93,8 @@ def test_parser_pseudoprojectivity(en_vocab): assert nonproj.is_decorated("X") is False nonproj._lift(0, tree) assert tree == [2, 2, 2] - assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7 - assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10 + assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree) == 7 + assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree2) == 10 # fmt: off proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels) assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2] diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index b7575d063..7bbb30d8e 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,15 +1,17 @@ import pytest from numpy.testing import assert_equal -from spacy.attrs import DEP +from thinc.api import Adam +from spacy import registry, util +from spacy.attrs import DEP, NORM from spacy.lang.en import English -from spacy.training import Example from spacy.tokens import Doc -from spacy import util, registry +from spacy.training import Example +from spacy.vocab import Vocab -from ..util import 
apply_transition_sequence, make_tempdir from ...pipeline import DependencyParser from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL +from ..util import apply_transition_sequence, make_tempdir TRAIN_DATA = [ ( @@ -59,6 +61,94 @@ PARTIAL_DATA = [ eps = 0.1 +@pytest.fixture +def vocab(): + return Vocab(lex_attr_getters={NORM: lambda s: s}) + + +@pytest.fixture +def parser(vocab): + vocab.strings.add("ROOT") + cfg = {"model": DEFAULT_PARSER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + parser = DependencyParser(vocab, model) + parser.cfg["token_vector_width"] = 4 + parser.cfg["hidden_width"] = 32 + # parser.add_label('right') + parser.add_label("left") + parser.initialize(lambda: [_parser_example(parser)]) + sgd = Adam(0.001) + + for i in range(10): + losses = {} + doc = Doc(vocab, words=["a", "b", "c", "d"]) + example = Example.from_dict( + doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} + ) + parser.update([example], sgd=sgd, losses=losses) + return parser + + +def _parser_example(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + return Example.from_dict(doc, gold) + + +@pytest.mark.issue(2772) +def test_issue2772(en_vocab): + """Test that deprojectivization doesn't mess up sentence boundaries.""" + # fmt: off + words = ["When", "we", "write", "or", "communicate", "virtually", ",", "we", "can", "hide", "our", "true", "feelings", "."] + # fmt: on + # A tree with a non-projective (i.e. crossing) arc + # The arcs (0, 4) and (2, 9) cross. + heads = [4, 2, 9, 2, 2, 4, 9, 9, 9, 9, 12, 12, 9, 9] + deps = ["dep"] * len(heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + assert doc[1].is_sent_start is False + + +@pytest.mark.issue(3830) +def test_issue3830_no_subtok(): + """Test that the parser doesn't have subtok label if not learn_tokens""" + config = { + "learn_tokens": False, + } + model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] + parser = DependencyParser(Vocab(), model, **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.initialize(lambda: [_parser_example(parser)]) + assert "subtok" not in parser.labels + + +@pytest.mark.issue(3830) +def test_issue3830_with_subtok(): + """Test that the parser does have subtok label if learn_tokens=True.""" + config = { + "learn_tokens": True, + } + model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] + parser = DependencyParser(Vocab(), model, **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.initialize(lambda: [_parser_example(parser)]) + assert "subtok" in parser.labels + + +@pytest.mark.issue(7716) +@pytest.mark.xfail(reason="Not fixed yet") +def test_partial_annotation(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + doc[2].is_sent_start = False + # Note that if the following line is used, then doc[2].is_sent_start == False + # doc[3].is_sent_start = False + + doc = parser(doc) + assert doc[2].is_sent_start == False + + def test_parser_root(en_vocab): words = ["i", "do", "n't", "have", "other", "assistance"] heads = [3, 3, 3, 3, 5, 3] diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index a98d01964..3740e430e 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,18 +1,20 @@ from typing import Callable, Iterable + import pytest from numpy.testing import 
assert_equal + +from spacy import registry, util from spacy.attrs import ENT_KB_ID from spacy.compat import pickle -from spacy.kb import KnowledgeBase, get_candidates, Candidate -from spacy.vocab import Vocab - -from spacy import util, registry +from spacy.kb import Candidate, KnowledgeBase, get_candidates +from spacy.lang.en import English from spacy.ml import load_kb from spacy.scorer import Scorer -from spacy.training import Example -from spacy.lang.en import English from spacy.tests.util import make_tempdir from spacy.tokens import Span +from spacy.training import Example +from spacy.util import ensure_path +from spacy.vocab import Vocab @pytest.fixture @@ -25,6 +27,198 @@ def assert_almost_equal(a, b): assert a - delta <= b <= a + delta +@pytest.mark.issue(4674) +def test_issue4674(): + """Test that setting entities with overlapping identifiers does not mess up IO""" + nlp = English() + kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) + vector1 = [0.9, 1.1, 1.01] + vector2 = [1.8, 2.25, 2.01] + with pytest.warns(UserWarning): + kb.set_entities( + entity_list=["Q1", "Q1"], + freq_list=[32, 111], + vector_list=[vector1, vector2], + ) + assert kb.get_size_entities() == 1 + # dumping to file & loading back in + with make_tempdir() as d: + dir_path = ensure_path(d) + if not dir_path.exists(): + dir_path.mkdir() + file_path = dir_path / "kb" + kb.to_disk(str(file_path)) + kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + kb2.from_disk(str(file_path)) + assert kb2.get_size_entities() == 1 + + +@pytest.mark.issue(6730) +def test_issue6730(en_vocab): + """Ensure that the KB does not accept empty strings, but otherwise IO works fine.""" + from spacy.kb import KnowledgeBase + + kb = KnowledgeBase(en_vocab, entity_vector_length=3) + kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3]) + + with pytest.raises(ValueError): + kb.add_alias(alias="", entities=["1"], probabilities=[0.4]) + assert kb.contains_alias("") is False + + kb.add_alias(alias="x", entities=["1"], probabilities=[0.2]) + kb.add_alias(alias="y", entities=["1"], probabilities=[0.1]) + + with make_tempdir() as tmp_dir: + kb.to_disk(tmp_dir) + kb.from_disk(tmp_dir) + assert kb.get_size_aliases() == 2 + assert set(kb.get_alias_strings()) == {"x", "y"} + + +@pytest.mark.issue(7065) +def test_issue7065(): + text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." + nlp = English() + nlp.add_pipe("sentencizer") + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + { + "label": "THING", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + } + ] + ruler.add_patterns(patterns) + + doc = nlp(text) + sentences = [s for s in doc.sents] + assert len(sentences) == 2 + sent0 = sentences[0] + ent = doc.ents[0] + assert ent.start < sent0.end < ent.end + assert sentences.index(ent.sent) == 0 + + +@pytest.mark.issue(7065) +def test_issue7065_b(): + # Test that the NEL doesn't crash when an entity crosses a sentence boundary + nlp = English() + vector_length = 3 + nlp.add_pipe("sentencizer") + text = "Mahler 's Symphony No. 8 was beautiful." 
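+    # Gold annotation for the text above: the character offsets (0, 6) cover
+    # "Mahler" and (10, 24) cover "Symphony No. 8", and the links map each
+    # span to its correct KB identifier.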
+ entities = [(0, 6, "PERSON"), (10, 24, "WORK")] + links = { + (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, + (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, + } + sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] + doc = nlp(text) + example = Example.from_dict( + doc, {"entities": entities, "links": links, "sent_starts": sent_starts} + ) + train_examples = [example] + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="No. 8", + entities=["Q270853"], + probabilities=[1.0], + ) + mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias( + alias="Mahler", + entities=["Q7304"], + probabilities=[1.0], + ) + return mykb + + # Create the Entity Linker component and add it to the pipeline + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + # train the NEL pipe + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # Add a custom rule-based component to mimick NER + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, + { + "label": "WORK", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + }, + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + # test the trained model - this should not throw E148 + doc = nlp(text) + assert doc + + +def test_partial_links(): + # Test that having some entities on the doc without gold links, doesn't crash + TRAIN_DATA = [ + ( + "Russ Cochran his reprints include EC Comics.", + { + "links": {(0, 12): {"Q2146908": 1.0}}, + "entities": [(0, 12, "PERSON")], + "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0], + }, + ) + ] + nlp = English() + vector_length = 3 + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) + return mykb + + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, + {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}, + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + + # this will run the pipeline on the examples and shouldn't crash + results = nlp.evaluate(train_examples) + assert "PERSON" in results["ents_per_type"] + assert "PERSON" in results["nel_f_per_type"] + assert "ORG" in results["ents_per_type"] + assert "ORG" not in results["nel_f_per_type"] + + def test_kb_valid_entities(nlp): """Test the valid construction of a KB with 3 entities and two aliases""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) diff --git a/spacy/tests/pipeline/test_entity_ruler.py 
b/spacy/tests/pipeline/test_entity_ruler.py index e66b49518..f2031d0a9 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -1,9 +1,11 @@ import pytest from spacy import registry -from spacy.tokens import Span +from spacy.tokens import Doc, Span from spacy.language import Language -from spacy.pipeline import EntityRuler +from spacy.lang.en import English +from spacy.pipeline import EntityRuler, EntityRecognizer, merge_entities +from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.errors import MatchPatternError from spacy.tests.util import make_tempdir @@ -34,6 +36,117 @@ def add_ent_component(doc): return doc +@pytest.mark.issue(3345) +def test_issue3345(): + """Test case where preset entity crosses sentence boundary.""" + nlp = English() + doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) + doc[4].is_sent_start = True + ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) + cfg = {"model": DEFAULT_NER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + ner = EntityRecognizer(doc.vocab, model) + # Add the OUT action. I wouldn't have thought this would be necessary... + ner.moves.add_action(5, "") + ner.add_label("GPE") + doc = ruler(doc) + # Get into the state just before "New" + state = ner.moves.init_batch([doc])[0] + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + # Check that B-GPE is valid. + assert ner.moves.is_valid(state, "B-GPE") + + +@pytest.mark.issue(4849) +def test_issue4849(): + nlp = English() + patterns = [ + {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, + {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, + ] + ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) + ruler.add_patterns(patterns) + text = """ + The left is starting to take aim at Democratic front-runner Joe Biden. + Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." + """ + # USING 1 PROCESS + count_ents = 0 + for doc in nlp.pipe([text], n_process=1): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + # USING 2 PROCESSES + if isinstance(get_current_ops, NumpyOps): + count_ents = 0 + for doc in nlp.pipe([text], n_process=2): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + + +@pytest.mark.issue(5918) +def test_issue5918(): + # Test edge case when merging entities. + nlp = English() + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + {"label": "ORG", "pattern": "Digicon Inc"}, + {"label": "ORG", "pattern": "Rotan Mosle Inc's"}, + {"label": "ORG", "pattern": "Rotan Mosle Technology Partners Ltd"}, + ] + ruler.add_patterns(patterns) + + text = """ + Digicon Inc said it has completed the previously-announced disposition + of its computer systems division to an investment group led by + Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate. + """ + doc = nlp(text) + assert len(doc.ents) == 3 + # make it so that the third span's head is within the entity (ent_iob=I) + # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents. 
+ # TODO: test for logging here + # with pytest.warns(UserWarning): + # doc[29].head = doc[33] + doc = merge_entities(doc) + assert len(doc.ents) == 3 + + +@pytest.mark.issue(8168) +def test_issue8168(): + nlp = English() + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + {"label": "ORG", "pattern": "Apple"}, + { + "label": "GPE", + "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], + "id": "san-francisco", + }, + { + "label": "GPE", + "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], + "id": "san-francisco", + }, + ] + ruler.add_patterns(patterns) + + assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")} + + +@pytest.mark.issue(8216) +def test_entity_ruler_fix8216(nlp, patterns): + """Test that patterns don't get added excessively.""" + ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) + ruler.add_patterns(patterns) + pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) + assert pattern_count > 0 + ruler.add_patterns([]) + after_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) + assert after_count == pattern_count + + def test_entity_ruler_init(nlp, patterns): ruler = EntityRuler(nlp, patterns=patterns) assert len(ruler) == len(patterns) @@ -260,3 +373,185 @@ def test_entity_ruler_serialize_dir(nlp, patterns): ruler.from_disk(d / "test_ruler") # read from an existing directory with pytest.raises(ValueError): ruler.from_disk(d / "non_existing_dir") # read from a bad directory + + +def test_entity_ruler_remove_basic(nlp): + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": "Duygu", "id": "duygu"}, + {"label": "ORG", "pattern": "ACME", "id": "acme"}, + {"label": "ORG", "pattern": "ACM"}, + ] + ruler.add_patterns(patterns) + doc = ruler(nlp.make_doc("Duygu went to school")) + assert len(ruler.patterns) == 3 + assert len(doc.ents) == 1 + assert doc.ents[0].label_ == "PERSON" + assert doc.ents[0].text == "Duygu" + assert "PERSON||duygu" in ruler.phrase_matcher + ruler.remove("duygu") + doc = ruler(nlp.make_doc("Duygu went to school")) + assert len(doc.ents) == 0 + assert "PERSON||duygu" not in ruler.phrase_matcher + assert len(ruler.patterns) == 2 + + +def test_entity_ruler_remove_same_id_multiple_patterns(nlp): + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": "Duygu", "id": "duygu"}, + {"label": "ORG", "pattern": "DuyguCorp", "id": "duygu"}, + {"label": "ORG", "pattern": "ACME", "id": "acme"}, + ] + ruler.add_patterns(patterns) + doc = ruler(nlp.make_doc("Duygu founded DuyguCorp and ACME.")) + assert len(ruler.patterns) == 3 + assert "PERSON||duygu" in ruler.phrase_matcher + assert "ORG||duygu" in ruler.phrase_matcher + assert len(doc.ents) == 3 + ruler.remove("duygu") + doc = ruler(nlp.make_doc("Duygu founded DuyguCorp and ACME.")) + assert len(ruler.patterns) == 1 + assert "PERSON||duygu" not in ruler.phrase_matcher + assert "ORG||duygu" not in ruler.phrase_matcher + assert len(doc.ents) == 1 + + +def test_entity_ruler_remove_nonexisting_pattern(nlp): + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": "Duygu", "id": "duygu"}, + {"label": "ORG", "pattern": "ACME", "id": "acme"}, + {"label": "ORG", "pattern": "ACM"}, + ] + ruler.add_patterns(patterns) + assert len(ruler.patterns) == 3 + with pytest.raises(ValueError): + ruler.remove("nepattern") + assert len(ruler.patterns) == 3 + + +def test_entity_ruler_remove_several_patterns(nlp): + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": "Duygu", "id": "duygu"}, + {"label": 
"ORG", "pattern": "ACME", "id": "acme"}, + {"label": "ORG", "pattern": "ACM"}, + ] + ruler.add_patterns(patterns) + doc = ruler(nlp.make_doc("Duygu founded her company ACME.")) + assert len(ruler.patterns) == 3 + assert len(doc.ents) == 2 + assert doc.ents[0].label_ == "PERSON" + assert doc.ents[0].text == "Duygu" + assert doc.ents[1].label_ == "ORG" + assert doc.ents[1].text == "ACME" + ruler.remove("duygu") + doc = ruler(nlp.make_doc("Duygu founded her company ACME")) + assert len(ruler.patterns) == 2 + assert len(doc.ents) == 1 + assert doc.ents[0].label_ == "ORG" + assert doc.ents[0].text == "ACME" + ruler.remove("acme") + doc = ruler(nlp.make_doc("Duygu founded her company ACME")) + assert len(ruler.patterns) == 1 + assert len(doc.ents) == 0 + + +def test_entity_ruler_remove_patterns_in_a_row(nlp): + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": "Duygu", "id": "duygu"}, + {"label": "ORG", "pattern": "ACME", "id": "acme"}, + {"label": "DATE", "pattern": "her birthday", "id": "bday"}, + {"label": "ORG", "pattern": "ACM"}, + ] + ruler.add_patterns(patterns) + doc = ruler(nlp.make_doc("Duygu founded her company ACME on her birthday")) + assert len(doc.ents) == 3 + assert doc.ents[0].label_ == "PERSON" + assert doc.ents[0].text == "Duygu" + assert doc.ents[1].label_ == "ORG" + assert doc.ents[1].text == "ACME" + assert doc.ents[2].label_ == "DATE" + assert doc.ents[2].text == "her birthday" + ruler.remove("duygu") + ruler.remove("acme") + ruler.remove("bday") + doc = ruler(nlp.make_doc("Duygu went to school")) + assert len(doc.ents) == 0 + + +def test_entity_ruler_remove_all_patterns(nlp): + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": "Duygu", "id": "duygu"}, + {"label": "ORG", "pattern": "ACME", "id": "acme"}, + {"label": "DATE", "pattern": "her birthday", "id": "bday"}, + ] + ruler.add_patterns(patterns) + assert len(ruler.patterns) == 3 + ruler.remove("duygu") + assert len(ruler.patterns) == 2 + ruler.remove("acme") + assert len(ruler.patterns) == 1 + ruler.remove("bday") + assert len(ruler.patterns) == 0 + with pytest.warns(UserWarning): + doc = ruler(nlp.make_doc("Duygu founded her company ACME on her birthday")) + assert len(doc.ents) == 0 + + +def test_entity_ruler_remove_and_add(nlp): + ruler = EntityRuler(nlp) + patterns = [{"label": "DATE", "pattern": "last time"}] + ruler.add_patterns(patterns) + doc = ruler( + nlp.make_doc("I saw him last time we met, this time he brought some flowers") + ) + assert len(ruler.patterns) == 1 + assert len(doc.ents) == 1 + assert doc.ents[0].label_ == "DATE" + assert doc.ents[0].text == "last time" + patterns1 = [{"label": "DATE", "pattern": "this time", "id": "ttime"}] + ruler.add_patterns(patterns1) + doc = ruler( + nlp.make_doc("I saw him last time we met, this time he brought some flowers") + ) + assert len(ruler.patterns) == 2 + assert len(doc.ents) == 2 + assert doc.ents[0].label_ == "DATE" + assert doc.ents[0].text == "last time" + assert doc.ents[1].label_ == "DATE" + assert doc.ents[1].text == "this time" + ruler.remove("ttime") + doc = ruler( + nlp.make_doc("I saw him last time we met, this time he brought some flowers") + ) + assert len(ruler.patterns) == 1 + assert len(doc.ents) == 1 + assert doc.ents[0].label_ == "DATE" + assert doc.ents[0].text == "last time" + ruler.add_patterns(patterns1) + doc = ruler( + nlp.make_doc("I saw him last time we met, this time he brought some flowers") + ) + assert len(ruler.patterns) == 2 + assert len(doc.ents) == 2 + patterns2 = [{"label": 
"DATE", "pattern": "another time", "id": "ttime"}] + ruler.add_patterns(patterns2) + doc = ruler( + nlp.make_doc( + "I saw him last time we met, this time he brought some flowers, another time some chocolate." + ) + ) + assert len(ruler.patterns) == 3 + assert len(doc.ents) == 3 + ruler.remove("ttime") + doc = ruler( + nlp.make_doc( + "I saw him last time we met, this time he brought some flowers, another time some chocolate." + ) + ) + assert len(ruler.patterns) == 1 + assert len(doc.ents) == 1 diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 0c2554727..4128e2a48 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -1,4 +1,6 @@ import pytest + +import spacy from spacy.language import Language from spacy.lang.en import English from spacy.lang.de import German @@ -11,6 +13,37 @@ from pydantic import StrictInt, StrictStr from ..util import make_tempdir +@pytest.mark.issue(5137) +def test_issue5137(): + factory_name = "test_issue5137" + pipe_name = "my_component" + + @Language.factory(factory_name) + class MyComponent: + def __init__(self, nlp, name=pipe_name, categories="all_categories"): + self.nlp = nlp + self.categories = categories + self.name = name + + def __call__(self, doc): + pass + + def to_disk(self, path, **kwargs): + pass + + def from_disk(self, path, **cfg): + pass + + nlp = English() + my_component = nlp.add_pipe(factory_name, name=pipe_name) + assert my_component.categories == "all_categories" + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + overrides = {"components": {pipe_name: {"categories": "my_categories"}}} + nlp2 = spacy.load(tmpdir, config=overrides) + assert nlp2.get_pipe(pipe_name).categories == "my_categories" + + def test_pipe_function_component(): name = "test_component" diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 87fd64307..4b8fb8ebc 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,9 +1,17 @@ +import gc + +import numpy import pytest +from thinc.api import get_current_ops + +from spacy.lang.en import English +from spacy.lang.en.syntax_iterators import noun_chunks from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.tokens import Doc from spacy.training import Example from spacy.util import SimpleFrozenList, get_arg_names -from spacy.lang.en import English +from spacy.vocab import Vocab @pytest.fixture @@ -21,6 +29,138 @@ def other_pipe(doc): return doc +@pytest.mark.issue(1506) +def test_issue1506(): + def string_generator(): + for _ in range(10001): + yield "It's sentence produced by that bug." + for _ in range(10001): + yield "I erase some hbdsaj lemmas." + for _ in range(10001): + yield "I erase lemmas." + for _ in range(10001): + yield "It's sentence produced by that bug." + for _ in range(10001): + yield "It's sentence produced by that bug." + + nlp = English() + for i, d in enumerate(nlp.pipe(string_generator())): + # We should run cleanup more than one time to actually cleanup data. + # In first run — clean up only mark strings as «not hitted». 
+ if i == 10000 or i == 20000 or i == 30000: + gc.collect() + for t in d: + str(t.lemma_) + + +@pytest.mark.issue(1654) +def test_issue1654(): + nlp = Language(Vocab()) + assert not nlp.pipeline + + @Language.component("component") + def component(doc): + return doc + + nlp.add_pipe("component", name="1") + nlp.add_pipe("component", name="2", after="1") + nlp.add_pipe("component", name="3", after="2") + assert nlp.pipe_names == ["1", "2", "3"] + nlp2 = Language(Vocab()) + assert not nlp2.pipeline + nlp2.add_pipe("component", name="3") + nlp2.add_pipe("component", name="2", before="3") + nlp2.add_pipe("component", name="1", before="2") + assert nlp2.pipe_names == ["1", "2", "3"] + + +@pytest.mark.issue(3880) +def test_issue3880(): + """Test that `nlp.pipe()` works when an empty string ends the batch. + + Fixed in v7.0.5 of Thinc. + """ + texts = ["hello", "world", "", ""] + nlp = English() + nlp.add_pipe("parser").add_label("dep") + nlp.add_pipe("ner").add_label("PERSON") + nlp.add_pipe("tagger").add_label("NN") + nlp.initialize() + for doc in nlp.pipe(texts): + pass + + +@pytest.mark.issue(5082) +def test_issue5082(): + # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens + nlp = English() + vocab = nlp.vocab + array1 = numpy.asarray([0.1, 0.5, 0.8], dtype=numpy.float32) + array2 = numpy.asarray([-0.2, -0.6, -0.9], dtype=numpy.float32) + array3 = numpy.asarray([0.3, -0.1, 0.7], dtype=numpy.float32) + array4 = numpy.asarray([0.5, 0, 0.3], dtype=numpy.float32) + array34 = numpy.asarray([0.4, -0.05, 0.5], dtype=numpy.float32) + vocab.set_vector("I", array1) + vocab.set_vector("like", array2) + vocab.set_vector("David", array3) + vocab.set_vector("Bowie", array4) + text = "I like David Bowie" + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]} + ] + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + parsed_vectors_1 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_1) == 4 + ops = get_current_ops() + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[0]), array1) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[1]), array2) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[2]), array3) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[3]), array4) + nlp.add_pipe("merge_entities") + parsed_vectors_2 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_2) == 3 + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[0]), array1) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[1]), array2) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34) + + +@pytest.mark.issue(5458) +def test_issue5458(): + # Test that the noun chuncker does not generate overlapping spans + # fmt: off + words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] + vocab = Vocab(strings=words) + deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] + pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] + heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] + # fmt: on + en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps) + en_doc.noun_chunks_iterator = noun_chunks + + # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" + nlp = English() + merge_nps = nlp.create_pipe("merge_noun_chunks") + merge_nps(en_doc) + + +def 
test_multiple_predictions(): + class DummyPipe(TrainablePipe): + def __init__(self): + self.model = "dummy_model" + + def predict(self, docs): + return ([1, 2, 3], [4, 5, 6]) + + def set_annotations(self, docs, scores): + return docs + + nlp = Language() + doc = nlp.make_doc("foo") + dummy_pipe = DummyPipe() + dummy_pipe(doc) + + def test_add_pipe_no_name(nlp): nlp.add_pipe("new_pipe") assert "new_pipe" in nlp.pipe_names diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 7a256f79b..047f59bef 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -97,3 +97,7 @@ def test_overfitting_IO(): ] assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + + # test internal pipe labels vs. Language.pipe_labels with hidden labels + assert nlp.get_pipe("senter").labels == ("I", "S") + assert "senter" not in nlp.pipe_labels diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 2f7e952d3..8060bc621 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -80,6 +80,8 @@ def test_explicit_labels(): assert spancat.labels == ("PERSON", "LOC") +# TODO figure out why this is flaky +@pytest.mark.skip(reason="Test is unreliable for unknown reason") def test_doc_gc(): # If the Doc object is garbage collected, the spans won't be functional afterwards nlp = Language() @@ -97,6 +99,7 @@ def test_doc_gc(): assert isinstance(spangroups, SpanGroups) for key, spangroup in spangroups.items(): assert isinstance(spangroup, SpanGroup) + # XXX This fails with length 0 sometimes assert len(spangroup) > 0 with pytest.raises(RuntimeError): span = spangroup[0] diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index ec14b70da..96e75851e 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -6,10 +6,27 @@ from spacy import util from spacy.training import Example from spacy.lang.en import English from spacy.language import Language +from thinc.api import compounding from ..util import make_tempdir +@pytest.mark.issue(4348) +def test_issue4348(): + """Test that training the tagger with empty data, doesn't throw errors""" + nlp = English() + example = Example.from_dict(nlp.make_doc(""), {"tags": []}) + TRAIN_DATA = [example, example] + tagger = nlp.add_pipe("tagger") + tagger.add_label("A") + optimizer = nlp.initialize() + for i in range(5): + losses = {} + batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + nlp.update(batch, sgd=optimizer, losses=losses) + + def test_label_types(): nlp = Language() tagger = nlp.add_pipe("tagger") diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index b134b8508..798dd165e 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,20 +1,31 @@ -import pytest import random + import numpy.random +import pytest from numpy.testing import assert_almost_equal -from thinc.api import fix_random_seed +from thinc.api import Config, compounding, fix_random_seed, get_current_ops +from wasabi import msg + +import spacy from spacy import util +from spacy.cli.evaluate import print_prf_per_type, print_textcats_auc_per_cat from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TextCategorizer -from spacy.tokens import Doc +from spacy.pipeline.textcat import single_label_bow_config +from 
spacy.pipeline.textcat import single_label_cnn_config +from spacy.pipeline.textcat import single_label_default_config +from spacy.pipeline.textcat_multilabel import multi_label_bow_config +from spacy.pipeline.textcat_multilabel import multi_label_cnn_config +from spacy.pipeline.textcat_multilabel import multi_label_default_config from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer +from spacy.tokens import Doc, DocBin from spacy.training import Example +from spacy.training.initialize import init_nlp from ..util import make_tempdir - TRAIN_DATA_SINGLE_LABEL = [ ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), @@ -48,6 +59,239 @@ def make_get_examples_multi_label(nlp): return get_examples +@pytest.mark.issue(3611) +def test_issue3611(): + """Test whether adding n-grams in the textcat works even when n > token length of some docs""" + unique_classes = ["offensive", "inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + model = { + "@architectures": "spacy.TextCatBOW.v1", + "exclusive_classes": True, + "ngram_size": 2, + "no_output_layer": False, + } + textcat = nlp.add_pipe("textcat", config={"model": model}, last=True) + for label in unique_classes: + textcat.add_label(label) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.initialize() + for i in range(3): + losses = {} + batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) + + +@pytest.mark.issue(4030) +def test_issue4030(): + """Test whether textcat works fine with empty doc""" + unique_classes = ["offensive", "inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + model = { + "@architectures": "spacy.TextCatBOW.v1", + "exclusive_classes": True, + "ngram_size": 2, + "no_output_layer": False, + } + textcat = nlp.add_pipe("textcat", config={"model": model}, last=True) + for label in unique_classes: + textcat.add_label(label) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.initialize() + for i in range(3): + losses = {} + batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) + # processing of an empty doc should result in 0.0 for all categories + doc = nlp("") + assert doc.cats["offensive"] == 0.0 + assert doc.cats["inoffensive"] == 0.0 + + +@pytest.mark.parametrize( + "textcat_config", + [ + single_label_default_config, + single_label_bow_config, + single_label_cnn_config, + 
multi_label_default_config, + multi_label_bow_config, + multi_label_cnn_config, + ], +) +@pytest.mark.issue(5551) +def test_issue5551(textcat_config): + """Test that after fixing the random seed, the results of the pipeline are truly identical""" + component = "textcat" + + pipe_cfg = Config().from_str(textcat_config) + results = [] + for i in range(3): + fix_random_seed(0) + nlp = English() + text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g." + annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}} + pipe = nlp.add_pipe(component, config=pipe_cfg, last=True) + for label in set(annots["cats"]): + pipe.add_label(label) + # Train + nlp.initialize() + doc = nlp.make_doc(text) + nlp.update([Example.from_dict(doc, annots)]) + # Store the result of each iteration + result = pipe.model.predict([doc]) + results.append(result[0]) + # All results should be the same because of the fixed seed + assert len(results) == 3 + ops = get_current_ops() + assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5) + assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5) + + +CONFIG_ISSUE_6908 = """ +[paths] +train = "TRAIN_PLACEHOLDER" +raw = null +init_tok2vec = null +vectors = null + +[system] +seed = 0 +gpu_allocator = null + +[nlp] +lang = "en" +pipeline = ["textcat"] +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +batch_size = 1000 + +[components] + +[components.textcat] +factory = "TEXTCAT_PLACEHOLDER" + +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths:train} + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths:train} + + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +frozen_components = [] +before_to_disk = null + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.components.textcat] +labels = ['label1', 'label2'] + +[initialize.tokenizer] +""" + + +@pytest.mark.parametrize( + "component_name", + ["textcat", "textcat_multilabel"], +) +@pytest.mark.issue(6908) +def test_issue6908(component_name): + """Test intializing textcat with labels in a list""" + + def create_data(out_file): + nlp = spacy.blank("en") + doc = nlp.make_doc("Some text") + doc.cats = {"label1": 0, "label2": 1} + out_data = DocBin(docs=[doc]).to_bytes() + with out_file.open("wb") as file_: + file_.write(out_data) + + with make_tempdir() as tmp_path: + train_path = tmp_path / "train.spacy" + create_data(train_path) + config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name) + config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix()) + config = util.load_config_from_str(config_str) + init_nlp(config) + + +@pytest.mark.issue(7019) +def test_issue7019(): + scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} + print_textcats_auc_per_cat(msg, scores) + scores = { + "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932}, + "LABEL_B": {"p": None, "r": None, "f": None}, + } + print_prf_per_type(msg, scores, name="foo", type="bar") + + +@pytest.mark.issue(9904) +def test_issue9904(): + nlp = Language() + textcat = nlp.add_pipe("textcat") + get_examples = make_get_examples_single_label(nlp) + 
nlp.initialize(get_examples) + + examples = get_examples() + scores = textcat.predict([eg.predicted for eg in examples]) + + loss = textcat.get_loss(examples, scores)[0] + loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0] + assert loss == pytest.approx(loss_double_bs) + + @pytest.mark.skip(reason="Test is flakey when run with others") def test_simple_train(): nlp = Language() @@ -496,6 +740,72 @@ def test_textcat_evaluation(): assert scores["cats_micro_r"] == 4 / 6 +@pytest.mark.parametrize( + "multi_label,spring_p", + [(True, 1 / 1), (False, 1 / 2)], +) +def test_textcat_eval_missing(multi_label: bool, spring_p: float): + """ + multi-label: the missing 'spring' in gold_doc_2 doesn't incur a penalty + exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0""" + train_examples = [] + nlp = English() + + ref1 = nlp("one") + ref1.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0} + pred1 = nlp("one") + pred1.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0} + train_examples.append(Example(ref1, pred1)) + + ref2 = nlp("two") + # reference 'spring' is missing, pred 'spring' is 1 + ref2.cats = {"winter": 0.0, "summer": 0.0, "autumn": 1.0} + pred2 = nlp("two") + pred2.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0} + train_examples.append(Example(pred2, ref2)) + + scores = Scorer().score_cats( + train_examples, + "cats", + labels=["winter", "summer", "spring", "autumn"], + multi_label=multi_label, + ) + assert scores["cats_f_per_type"]["spring"]["p"] == spring_p + assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 1 + + +@pytest.mark.parametrize( + "multi_label,expected_loss", + [(True, 0), (False, 0.125)], +) +def test_textcat_loss(multi_label: bool, expected_loss: float): + """ + multi-label: the missing 'spring' in gold_doc_2 doesn't incur an increase in loss + exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0 and adds to the loss""" + train_examples = [] + nlp = English() + + doc1 = nlp("one") + cats1 = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0} + train_examples.append(Example.from_dict(doc1, {"cats": cats1})) + + doc2 = nlp("two") + cats2 = {"winter": 0.0, "summer": 0.0, "autumn": 1.0} + train_examples.append(Example.from_dict(doc2, {"cats": cats2})) + + if multi_label: + textcat = nlp.add_pipe("textcat_multilabel") + else: + textcat = nlp.add_pipe("textcat") + textcat.initialize(lambda: train_examples) + assert isinstance(textcat, TextCategorizer) + scores = textcat.model.ops.asarray( + [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0]], dtype="f" # type: ignore + ) + loss, d_scores = textcat.get_loss(train_examples, scores) + assert loss == expected_loss + + def test_textcat_threshold(): # Ensure the scorer can be called with a different threshold nlp = English() diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index eeea906bb..a5ac85e1e 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -11,7 +11,7 @@ from spacy.lang.en import English from thinc.api import Config, get_current_ops from numpy.testing import assert_array_equal -from ..util import get_batch, make_tempdir +from ..util import get_batch, make_tempdir, add_vecs_to_vocab def test_empty_doc(): @@ -140,9 +140,25 @@ TRAIN_DATA = [ ] -def test_tok2vec_listener(): +@pytest.mark.parametrize("with_vectors", (False, True)) +def test_tok2vec_listener(with_vectors): orig_config = Config().from_str(cfg_string) + 
orig_config["components"]["tok2vec"]["model"]["embed"][ + "include_static_vectors" + ] = with_vectors nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + + if with_vectors: + ops = get_current_ops() + vectors = [ + ("apple", ops.asarray([1, 2, 3])), + ("orange", ops.asarray([-1, -2, -3])), + ("and", ops.asarray([-1, -1, -1])), + ("juice", ops.asarray([5, 5, 10])), + ("pie", ops.asarray([7, 6.3, 8.9])), + ] + add_vecs_to_vocab(nlp.vocab, vectors) + assert nlp.pipe_names == ["tok2vec", "tagger"] tagger = nlp.get_pipe("tagger") tok2vec = nlp.get_pipe("tok2vec") @@ -169,6 +185,9 @@ def test_tok2vec_listener(): ops = get_current_ops() assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor)) + # test with empty doc + doc = nlp("") + # TODO: should this warn or error? nlp.select_pipes(disable="tok2vec") assert nlp.pipe_names == ["tagger"] diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py deleted file mode 100644 index 4846d2075..000000000 --- a/spacy/tests/regression/test_issue1-1000.py +++ /dev/null @@ -1,486 +0,0 @@ -import pytest -import random -from spacy import util -from spacy.training import Example -from spacy.matcher import Matcher -from spacy.attrs import IS_PUNCT, ORTH, LOWER -from spacy.vocab import Vocab -from spacy.lang.en import English -from spacy.lookups import Lookups -from spacy.tokens import Doc, Span - -from ..util import make_tempdir - - -@pytest.mark.issue(118) -@pytest.mark.parametrize( - "patterns", - [ - [[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], - [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]], - ], -) -def test_issue118(en_tokenizer, patterns): - """Test a bug that arose from having overlapping matches""" - text = ( - "how many points did lebron james score against the boston celtics last night" - ) - doc = en_tokenizer(text) - ORG = doc.vocab.strings["ORG"] - matcher = Matcher(doc.vocab) - matcher.add("BostonCeltics", patterns) - assert len(list(doc.ents)) == 0 - matches = [(ORG, start, end) for _, start, end in matcher(doc)] - assert matches == [(ORG, 9, 11), (ORG, 10, 11)] - doc.ents = matches[:1] - ents = list(doc.ents) - assert len(ents) == 1 - assert ents[0].label == ORG - assert ents[0].start == 9 - assert ents[0].end == 11 - - -@pytest.mark.issue(118) -@pytest.mark.parametrize( - "patterns", - [ - [[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], - [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]], - ], -) -def test_issue118_prefix_reorder(en_tokenizer, patterns): - """Test a bug that arose from having overlapping matches""" - text = ( - "how many points did lebron james score against the boston celtics last night" - ) - doc = en_tokenizer(text) - ORG = doc.vocab.strings["ORG"] - matcher = Matcher(doc.vocab) - matcher.add("BostonCeltics", patterns) - assert len(list(doc.ents)) == 0 - matches = [(ORG, start, end) for _, start, end in matcher(doc)] - doc.ents += tuple(matches)[1:] - assert matches == [(ORG, 9, 10), (ORG, 9, 11)] - ents = doc.ents - assert len(ents) == 1 - assert ents[0].label == ORG - assert ents[0].start == 9 - assert ents[0].end == 11 - - -@pytest.mark.issue(242) -def test_issue242(en_tokenizer): - """Test overlapping multi-word phrases.""" - text = "There are different food safety standards in different countries." 
- patterns = [ - [{"LOWER": "food"}, {"LOWER": "safety"}], - [{"LOWER": "safety"}, {"LOWER": "standards"}], - ] - doc = en_tokenizer(text) - matcher = Matcher(doc.vocab) - matcher.add("FOOD", patterns) - matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)] - match1, match2 = matches - assert match1[1] == 3 - assert match1[2] == 5 - assert match2[1] == 4 - assert match2[2] == 6 - with pytest.raises(ValueError): - # One token can only be part of one entity, so test that the matches - # can't be added as entities - doc.ents += tuple(matches) - - -@pytest.mark.issue(309) -def test_issue309(en_vocab): - """Test Issue #309: SBD fails on empty string""" - doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"]) - assert len(doc) == 1 - sents = list(doc.sents) - assert len(sents) == 1 - - -@pytest.mark.issue(351) -def test_issue351(en_tokenizer): - doc = en_tokenizer(" This is a cat.") - assert doc[0].idx == 0 - assert len(doc[0]) == 3 - assert doc[1].idx == 3 - - -@pytest.mark.issue(360) -def test_issue360(en_tokenizer): - """Test tokenization of big ellipsis""" - tokens = en_tokenizer("$45...............Asking") - assert len(tokens) > 2 - - -@pytest.mark.issue(361) -@pytest.mark.parametrize("text1,text2", [("cat", "dog")]) -def test_issue361(en_vocab, text1, text2): - """Test Issue #361: Equality of lexemes""" - assert en_vocab[text1] == en_vocab[text1] - assert en_vocab[text1] != en_vocab[text2] - - -@pytest.mark.issue(587) -def test_issue587(en_tokenizer): - """Test that Matcher doesn't segfault on particular input""" - doc = en_tokenizer("a b; c") - matcher = Matcher(doc.vocab) - matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]]) - matches = matcher(doc) - assert len(matches) == 1 - matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]]) - matches = matcher(doc) - assert len(matches) == 2 - matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]]) - matches = matcher(doc) - assert len(matches) == 2 - - -@pytest.mark.issue(588) -def test_issue588(en_vocab): - matcher = Matcher(en_vocab) - with pytest.raises(ValueError): - matcher.add("TEST", [[]]) - - -@pytest.mark.issue(590) -def test_issue590(en_vocab): - """Test overlapping matches""" - doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"]) - matcher = Matcher(en_vocab) - matcher.add( - "ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]] - ) - matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]]) - matches = matcher(doc) - assert len(matches) == 2 - - -@pytest.mark.issue(595) -@pytest.mark.skip(reason="Old vocab-based lemmatization") -def test_issue595(): - """Test lemmatization of base forms""" - words = ["Do", "n't", "feed", "the", "dog"] - lookups = Lookups() - lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]}) - lookups.add_table("lemma_index", {"verb": {}}) - lookups.add_table("lemma_exc", {"verb": {}}) - vocab = Vocab() - doc = Doc(vocab, words=words) - doc[2].tag_ = "VB" - assert doc[2].text == "feed" - assert doc[2].lemma_ == "feed" - - -@pytest.mark.issue(599) -def test_issue599(en_vocab): - doc = Doc(en_vocab) - doc2 = Doc(doc.vocab) - doc2.from_bytes(doc.to_bytes()) - assert doc2.has_annotation("DEP") - - -@pytest.mark.issue(600) -def test_issue600(): - vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) - doc = Doc(vocab, words=["hello"]) - doc[0].tag_ = "NN" - - -@pytest.mark.issue(615) -def test_issue615(en_tokenizer): - def merge_phrases(matcher, doc, i, matches): - """Merge a phrase. 
We have to be careful here because we'll change the - token indices. To avoid problems, merge all the phrases once we're called - on the last match.""" - if i != len(matches) - 1: - return None - spans = [Span(doc, start, end, label=label) for label, start, end in matches] - with doc.retokenize() as retokenizer: - for span in spans: - tag = "NNP" if span.label_ else span.root.tag_ - attrs = {"tag": tag, "lemma": span.text} - retokenizer.merge(span, attrs=attrs) - doc.ents = doc.ents + (span,) - - text = "The golf club is broken" - pattern = [{"ORTH": "golf"}, {"ORTH": "club"}] - label = "Sport_Equipment" - doc = en_tokenizer(text) - matcher = Matcher(doc.vocab) - matcher.add(label, [pattern], on_match=merge_phrases) - matcher(doc) - entities = list(doc.ents) - assert entities != [] - assert entities[0].label != 0 - - -@pytest.mark.issue(736) -@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")]) -def test_issue736(en_tokenizer, text, number): - """Test that times like "7am" are tokenized correctly and that numbers are - converted to string.""" - tokens = en_tokenizer(text) - assert len(tokens) == 2 - assert tokens[0].text == number - - -@pytest.mark.issue(740) -@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"]) -def test_issue740(en_tokenizer, text): - """Test that dates are not split and kept as one token. This behaviour is - currently inconsistent, since dates separated by hyphens are still split. - This will be hard to prevent without causing clashes with numeric ranges.""" - tokens = en_tokenizer(text) - assert len(tokens) == 1 - - -@pytest.mark.issue(743) -def test_issue743(): - doc = Doc(Vocab(), ["hello", "world"]) - token = doc[0] - s = set([token]) - items = list(s) - assert items[0] is token - - -@pytest.mark.issue(744) -@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"]) -def test_issue744(en_tokenizer, text): - """Test that 'were' and 'Were' are excluded from the contractions - generated by the English tokenizer exceptions.""" - tokens = en_tokenizer(text) - assert len(tokens) == 3 - assert tokens[1].text.lower() == "were" - - -@pytest.mark.issue(759) -@pytest.mark.parametrize( - "text,is_num", [("one", True), ("ten", True), ("teneleven", False)] -) -def test_issue759(en_tokenizer, text, is_num): - tokens = en_tokenizer(text) - assert tokens[0].like_num == is_num - - -@pytest.mark.issue(775) -@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"]) -def test_issue775(en_tokenizer, text): - """Test that 'Shell' and 'shell' are excluded from the contractions - generated by the English tokenizer exceptions.""" - tokens = en_tokenizer(text) - assert len(tokens) == 1 - assert tokens[0].text == text - - -@pytest.mark.issue(792) -@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"]) -def test_issue792(en_tokenizer, text): - """Test for Issue #792: Trailing whitespace is removed after tokenization.""" - doc = en_tokenizer(text) - assert "".join([token.text_with_ws for token in doc]) == text - - -@pytest.mark.issue(792) -@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"]) -def test_control_issue792(en_tokenizer, text): - """Test base case for Issue #792: Non-trailing whitespace""" - doc = en_tokenizer(text) - assert "".join([token.text_with_ws for token in doc]) == text - - -@pytest.mark.issue(801) -@pytest.mark.skip( - reason="Can not be fixed unless with variable-width lookbehinds, cf. 
PR #3218" -) -@pytest.mark.parametrize( - "text,tokens", - [ - ('"deserve,"--and', ['"', "deserve", ',"--', "and"]), - ("exception;--exclusive", ["exception", ";--", "exclusive"]), - ("day.--Is", ["day", ".--", "Is"]), - ("refinement:--just", ["refinement", ":--", "just"]), - ("memories?--To", ["memories", "?--", "To"]), - ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]), - ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]), - ], -) -def test_issue801(en_tokenizer, text, tokens): - """Test that special characters + hyphens are split correctly.""" - doc = en_tokenizer(text) - assert len(doc) == len(tokens) - assert [t.text for t in doc] == tokens - - -@pytest.mark.issue(805) -@pytest.mark.parametrize( - "text,expected_tokens", - [ - ( - "Smörsåsen används bl.a. till fisk", - ["Smörsåsen", "används", "bl.a.", "till", "fisk"], - ), - ( - "Jag kommer först kl. 13 p.g.a. diverse förseningar", - ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"], - ), - ], -) -def test_issue805(sv_tokenizer, text, expected_tokens): - tokens = sv_tokenizer(text) - token_list = [token.text for token in tokens if not token.is_space] - assert expected_tokens == token_list - - -@pytest.mark.issue(850) -def test_issue850(): - """The variable-length pattern matches the succeeding token. Check we - handle the ambiguity correctly.""" - vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) - matcher = Matcher(vocab) - pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}] - matcher.add("FarAway", [pattern]) - doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) - match = matcher(doc) - assert len(match) == 1 - ent_id, start, end = match[0] - assert start == 0 - assert end == 4 - - -@pytest.mark.issue(850) -def test_issue850_basic(): - """Test Matcher matches with '*' operator and Boolean flag""" - vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) - matcher = Matcher(vocab) - pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}] - matcher.add("FarAway", [pattern]) - doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) - match = matcher(doc) - assert len(match) == 1 - ent_id, start, end = match[0] - assert start == 0 - assert end == 4 - - -@pytest.mark.issue(852) -@pytest.mark.skip( - reason="French exception list is not enabled in the default tokenizer anymore" -) -@pytest.mark.parametrize( - "text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"] -) -def test_issue852(fr_tokenizer, text): - """Test that French tokenizer exceptions are imported correctly.""" - tokens = fr_tokenizer(text) - assert len(tokens) == 1 - - -@pytest.mark.issue(859) -@pytest.mark.parametrize( - "text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"] -) -def test_issue859(en_tokenizer, text): - """Test that no extra space is added in doc.text method.""" - doc = en_tokenizer(text) - assert doc.text == text - - -@pytest.mark.issue(886) -@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"]) -def test_issue886(en_tokenizer, text): - """Test that token.idx matches the original text index for texts with newlines.""" - doc = en_tokenizer(text) - for token in doc: - assert len(token.text) == len(token.text_with_ws) - assert text[token.idx] == token.text[0] - - -@pytest.mark.issue(891) -@pytest.mark.parametrize("text", ["want/need"]) -def test_issue891(en_tokenizer, text): - """Test that / infixes are split correctly.""" - tokens = en_tokenizer(text) - assert len(tokens) == 3 - assert 
tokens[1].text == "/" - - -@pytest.mark.issue(912) -@pytest.mark.skip(reason="Old vocab-based lemmatization") -@pytest.mark.parametrize( - "text,tag,lemma", - [("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")], -) -def test_issue912(en_vocab, text, tag, lemma): - """Test base-forms are preserved.""" - doc = Doc(en_vocab, words=[text]) - doc[0].tag_ = tag - assert doc[0].lemma_ == lemma - - -@pytest.mark.issue(957) -@pytest.mark.slow -def test_issue957(en_tokenizer): - """Test that spaCy doesn't hang on many punctuation characters. - If this test hangs, check (new) regular expressions for conflicting greedy operators - """ - # Skip test if pytest-timeout is not installed - pytest.importorskip("pytest_timeout") - for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]: - string = "0" - for i in range(1, 100): - string += punct + str(i) - doc = en_tokenizer(string) - assert doc - - -@pytest.mark.issue(999) -def test_issue999(): - """Test that adding entities and resuming training works passably OK. - There are two issues here: - 1) We have to re-add labels. This isn't very nice. - 2) There's no way to set the learning rate for the weight update, so we - end up out-of-scale, causing it to learn too fast. - """ - TRAIN_DATA = [ - ["hey", []], - ["howdy", []], - ["hey there", []], - ["hello", []], - ["hi", []], - ["i'm looking for a place to eat", []], - ["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]], - ["show me chinese restaurants", [(8, 15, "CUISINE")]], - ["show me chines restaurants", [(8, 14, "CUISINE")]], - ] - nlp = English() - ner = nlp.add_pipe("ner") - for _, offsets in TRAIN_DATA: - for start, end, label in offsets: - ner.add_label(label) - nlp.initialize() - for itn in range(20): - random.shuffle(TRAIN_DATA) - for raw_text, entity_offsets in TRAIN_DATA: - example = Example.from_dict( - nlp.make_doc(raw_text), {"entities": entity_offsets} - ) - nlp.update([example]) - - with make_tempdir() as model_dir: - nlp.to_disk(model_dir) - nlp2 = util.load_model_from_path(model_dir) - - for raw_text, entity_offsets in TRAIN_DATA: - doc = nlp2(raw_text) - ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents} - for start, end, label in entity_offsets: - if (start, end) in ents: - assert ents[(start, end)] == label - break - else: - if entity_offsets: - raise Exception(ents) diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py deleted file mode 100644 index 0a60e4477..000000000 --- a/spacy/tests/regression/test_issue1001-1500.py +++ /dev/null @@ -1,174 +0,0 @@ -import pytest -import re -from spacy.tokens import Doc -from spacy.vocab import Vocab -from spacy.lang.en import English -from spacy.lang.lex_attrs import LEX_ATTRS -from spacy.matcher import Matcher -from spacy.tokenizer import Tokenizer -from spacy.symbols import ORTH, LEMMA, POS - - -@pytest.mark.issue(1061) -def test_issue1061(): - """Test special-case works after tokenizing. Was caching problem.""" - text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_." - tokenizer = English().tokenizer - doc = tokenizer(text) - assert "MATH" in [w.text for w in doc] - assert "_MATH_" not in [w.text for w in doc] - - tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}]) - doc = tokenizer(text) - assert "_MATH_" in [w.text for w in doc] - assert "MATH" not in [w.text for w in doc] - - # For sanity, check it works when pipeline is clean. 
- tokenizer = English().tokenizer - tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}]) - doc = tokenizer(text) - assert "_MATH_" in [w.text for w in doc] - assert "MATH" not in [w.text for w in doc] - - -@pytest.mark.skip( - reason="Can not be fixed without variable-width look-behind (which we don't want)" -) -@pytest.mark.issue(1235) -def test_issue1235(): - """Test that g is not split of if preceded by a number and a letter""" - nlp = English() - testwords = "e2g 2g 52g" - doc = nlp(testwords) - assert len(doc) == 5 - assert doc[0].text == "e2g" - assert doc[1].text == "2" - assert doc[2].text == "g" - assert doc[3].text == "52" - assert doc[4].text == "g" - - -@pytest.mark.issue(1242) -def test_issue1242(): - nlp = English() - doc = nlp("") - assert len(doc) == 0 - docs = list(nlp.pipe(["", "hello"])) - assert len(docs[0]) == 0 - assert len(docs[1]) == 1 - - -@pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases") -@pytest.mark.issue(1250) -def test_issue1250(): - """Test cached special cases.""" - special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}] - nlp = English() - nlp.tokenizer.add_special_case("reimbur", special_case) - lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")] - assert lemmas == ["reimburse", ",", "reimburse", "..."] - lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")] - assert lemmas == ["reimburse", ",", "reimburse", "..."] - - -@pytest.mark.issue(1257) -def test_issue1257(): - """Test that tokens compare correctly.""" - doc1 = Doc(Vocab(), words=["a", "b", "c"]) - doc2 = Doc(Vocab(), words=["a", "c", "e"]) - assert doc1[0] != doc2[0] - assert not doc1[0] == doc2[0] - - -@pytest.mark.issue(1375) -def test_issue1375(): - """Test that token.nbor() raises IndexError for out-of-bounds access.""" - doc = Doc(Vocab(), words=["0", "1", "2"]) - with pytest.raises(IndexError): - assert doc[0].nbor(-1) - assert doc[1].nbor(-1).text == "0" - with pytest.raises(IndexError): - assert doc[2].nbor(1) - assert doc[1].nbor(1).text == "2" - - -@pytest.mark.issue(1434) -def test_issue1434(): - """Test matches occur when optional element at end of short doc.""" - pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}] - vocab = Vocab(lex_attr_getters=LEX_ATTRS) - hello_world = Doc(vocab, words=["Hello", "World"]) - hello = Doc(vocab, words=["Hello"]) - matcher = Matcher(vocab) - matcher.add("MyMatcher", [pattern]) - matches = matcher(hello_world) - assert matches - matches = matcher(hello) - assert matches - - -@pytest.mark.parametrize( - "string,start,end", - [ - ("a", 0, 1), - ("a b", 0, 2), - ("a c", 0, 1), - ("a b c", 0, 2), - ("a b b c", 0, 3), - ("a b b", 0, 3), - ], -) -@pytest.mark.issue(1450) -def test_issue1450(string, start, end): - """Test matcher works when patterns end with * operator.""" - pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] - matcher = Matcher(Vocab()) - matcher.add("TSTEND", [pattern]) - doc = Doc(Vocab(), words=string.split()) - matches = matcher(doc) - if start is None or end is None: - assert matches == [] - assert matches[-1][1] == start - assert matches[-1][2] == end - - -@pytest.mark.issue(1488) -def test_issue1488(): - prefix_re = re.compile(r"""[\[\("']""") - suffix_re = re.compile(r"""[\]\)"']""") - infix_re = re.compile(r"""[-~\.]""") - simple_url_re = re.compile(r"""^https?://""") - - def my_tokenizer(nlp): - return Tokenizer( - nlp.vocab, - {}, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - 
token_match=simple_url_re.match, - ) - - nlp = English() - nlp.tokenizer = my_tokenizer(nlp) - doc = nlp("This is a test.") - for token in doc: - assert token.text - - -@pytest.mark.issue(1494) -def test_issue1494(): - infix_re = re.compile(r"""[^a-z]""") - test_cases = [ - ("token 123test", ["token", "1", "2", "3", "test"]), - ("token 1test", ["token", "1test"]), - ("hello...test", ["hello", ".", ".", ".", "test"]), - ] - - def new_tokenizer(nlp): - return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer) - - nlp = English() - nlp.tokenizer = new_tokenizer(nlp) - for text, expected in test_cases: - assert [token.text for token in nlp(text)] == expected diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py deleted file mode 100644 index 07f173843..000000000 --- a/spacy/tests/regression/test_issue1501-2000.py +++ /dev/null @@ -1,375 +0,0 @@ -import pytest -import gc -import numpy -import copy - -from spacy.training import Example -from spacy.lang.en import English -from spacy.lang.en.stop_words import STOP_WORDS -from spacy.lang.lex_attrs import is_stop -from spacy.vectors import Vectors -from spacy.vocab import Vocab -from spacy.language import Language -from spacy.tokens import Doc, Span, Token -from spacy.attrs import HEAD, DEP -from spacy.matcher import Matcher - -from ..util import make_tempdir - - -@pytest.mark.issue(1506) -def test_issue1506(): - def string_generator(): - for _ in range(10001): - yield "It's sentence produced by that bug." - for _ in range(10001): - yield "I erase some hbdsaj lemmas." - for _ in range(10001): - yield "I erase lemmas." - for _ in range(10001): - yield "It's sentence produced by that bug." - for _ in range(10001): - yield "It's sentence produced by that bug." - - nlp = English() - for i, d in enumerate(nlp.pipe(string_generator())): - # We should run cleanup more than one time to actually cleanup data. - # In first run — clean up only mark strings as «not hitted». - if i == 10000 or i == 20000 or i == 30000: - gc.collect() - for t in d: - str(t.lemma_) - - -@pytest.mark.issue(1518) -def test_issue1518(): - """Test vectors.resize() works.""" - vectors = Vectors(shape=(10, 10)) - vectors.add("hello", row=2) - vectors.resize((5, 9)) - - -@pytest.mark.issue(1537) -def test_issue1537(): - """Test that Span.as_doc() doesn't segfault.""" - string = "The sky is blue . The man is pink . The dog is purple ." - doc = Doc(Vocab(), words=string.split()) - doc[0].sent_start = True - for word in doc[1:]: - if word.nbor(-1).text == ".": - word.sent_start = True - else: - word.sent_start = False - sents = list(doc.sents) - sent0 = sents[0].as_doc() - sent1 = sents[1].as_doc() - assert isinstance(sent0, Doc) - assert isinstance(sent1, Doc) - - -# TODO: Currently segfaulting, due to l_edge and r_edge misalignment -@pytest.mark.issue(1537) -# def test_issue1537_model(): -# nlp = load_spacy('en') -# doc = nlp('The sky is blue. The man is pink. 
The dog is purple.') -# sents = [s.as_doc() for s in doc.sents] -# print(list(sents[0].noun_chunks)) -# print(list(sents[1].noun_chunks)) - - -@pytest.mark.issue(1539) -def test_issue1539(): - """Ensure vectors.resize() doesn't try to modify dictionary during iteration.""" - v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100]) - v.resize((100, 100)) - - -@pytest.mark.issue(1547) -def test_issue1547(): - """Test that entity labels still match after merging tokens.""" - words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] - doc = Doc(Vocab(), words=words) - doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])] - with doc.retokenize() as retokenizer: - retokenizer.merge(doc[5:7]) - assert [ent.text for ent in doc.ents] - - -@pytest.mark.issue(1612) -def test_issue1612(en_tokenizer): - doc = en_tokenizer("The black cat purrs.") - span = doc[1:3] - assert span.orth_ == span.text - - -@pytest.mark.issue(1654) -def test_issue1654(): - nlp = Language(Vocab()) - assert not nlp.pipeline - - @Language.component("component") - def component(doc): - return doc - - nlp.add_pipe("component", name="1") - nlp.add_pipe("component", name="2", after="1") - nlp.add_pipe("component", name="3", after="2") - assert nlp.pipe_names == ["1", "2", "3"] - nlp2 = Language(Vocab()) - assert not nlp2.pipeline - nlp2.add_pipe("component", name="3") - nlp2.add_pipe("component", name="2", before="3") - nlp2.add_pipe("component", name="1", before="2") - assert nlp2.pipe_names == ["1", "2", "3"] - - -@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"]) -@pytest.mark.issue(1698) -def test_issue1698(en_tokenizer, text): - doc = en_tokenizer(text) - assert len(doc) == 1 - assert not doc[0].like_url - - -@pytest.mark.issue(1727) -def test_issue1727(): - """Test that models with no pretrained vectors can be deserialized - correctly after vectors are added.""" - nlp = Language(Vocab()) - data = numpy.ones((3, 300), dtype="f") - vectors = Vectors(data=data, keys=["I", "am", "Matt"]) - tagger = nlp.create_pipe("tagger") - tagger.add_label("PRP") - assert tagger.cfg.get("pretrained_dims", 0) == 0 - tagger.vocab.vectors = vectors - with make_tempdir() as path: - tagger.to_disk(path) - tagger = nlp.create_pipe("tagger").from_disk(path) - assert tagger.cfg.get("pretrained_dims", 0) == 0 - - -@pytest.mark.issue(1757) -def test_issue1757(): - """Test comparison against None doesn't cause segfault.""" - doc = Doc(Vocab(), words=["a", "b", "c"]) - assert not doc[0] < None - assert not doc[0] is None - assert doc[0] >= None - assert not doc[:2] < None - assert not doc[:2] is None - assert doc[:2] >= None - assert not doc.vocab["a"] is None - assert not doc.vocab["a"] < None - - -@pytest.mark.issue(1758) -def test_issue1758(en_tokenizer): - """Test that "would've" is handled by the English tokenizer exceptions.""" - tokens = en_tokenizer("would've") - assert len(tokens) == 2 - - -@pytest.mark.issue(1773) -def test_issue1773(en_tokenizer): - """Test that spaces don't receive a POS but no TAG. 
This is the root cause - of the serialization issue reported in #1773.""" - doc = en_tokenizer("\n") - if doc[0].pos_ == "SPACE": - assert doc[0].tag_ != "" - - -@pytest.mark.issue(1799) -def test_issue1799(): - """Test sentence boundaries are deserialized correctly, even for - non-projective sentences.""" - heads_deps = numpy.asarray( - [ - [1, 397], - [4, 436], - [2, 426], - [1, 402], - [0, 8206900633647566924], - [18446744073709551615, 440], - [18446744073709551614, 442], - ], - dtype="uint64", - ) - doc = Doc(Vocab(), words="Just what I was looking for .".split()) - doc.vocab.strings.add("ROOT") - doc = doc.from_array([HEAD, DEP], heads_deps) - assert len(list(doc.sents)) == 1 - - -@pytest.mark.issue(1807) -def test_issue1807(): - """Test vocab.set_vector also adds the word to the vocab.""" - vocab = Vocab(vectors_name="test_issue1807") - assert "hello" not in vocab - vocab.set_vector("hello", numpy.ones((50,), dtype="f")) - assert "hello" in vocab - - -@pytest.mark.issue(1834) -def test_issue1834(): - """Test that sentence boundaries & parse/tag flags are not lost - during serialization.""" - words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"] - doc = Doc(Vocab(), words=words) - doc[6].is_sent_start = True - new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - assert new_doc[6].sent_start - assert not new_doc.has_annotation("DEP") - assert not new_doc.has_annotation("TAG") - doc = Doc( - Vocab(), - words=words, - tags=["TAG"] * len(words), - heads=[0, 0, 0, 0, 0, 0, 6, 6, 6], - deps=["dep"] * len(words), - ) - new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - assert new_doc[6].sent_start - assert new_doc.has_annotation("DEP") - assert new_doc.has_annotation("TAG") - - -@pytest.mark.issue(1868) -def test_issue1868(): - """Test Vocab.__contains__ works with int keys.""" - vocab = Vocab() - lex = vocab["hello"] - assert lex.orth in vocab - assert lex.orth_ in vocab - assert "some string" not in vocab - int_id = vocab.strings.add("some string") - assert int_id not in vocab - - -@pytest.mark.issue(1883) -def test_issue1883(): - matcher = Matcher(Vocab()) - matcher.add("pat1", [[{"orth": "hello"}]]) - doc = Doc(matcher.vocab, words=["hello"]) - assert len(matcher(doc)) == 1 - new_matcher = copy.deepcopy(matcher) - new_doc = Doc(new_matcher.vocab, words=["hello"]) - assert len(new_matcher(new_doc)) == 1 - - -@pytest.mark.parametrize("word", ["the"]) -@pytest.mark.issue(1889) -def test_issue1889(word): - assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) - - -@pytest.mark.skip(reason="obsolete with the config refactor of v.3") -@pytest.mark.issue(1915) -def test_issue1915(): - cfg = {"hidden_depth": 2} # should error out - nlp = Language() - ner = nlp.add_pipe("ner") - ner.add_label("answer") - with pytest.raises(ValueError): - nlp.initialize(**cfg) - - -@pytest.mark.issue(1945) -def test_issue1945(): - """Test regression in Matcher introduced in v2.0.6.""" - matcher = Matcher(Vocab()) - matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]]) - doc = Doc(matcher.vocab, words=["a", "a", "a"]) - matches = matcher(doc) # we should see two overlapping matches here - assert len(matches) == 2 - assert matches[0][1:] == (0, 2) - assert matches[1][1:] == (1, 3) - - -@pytest.mark.issue(1963) -def test_issue1963(en_tokenizer): - """Test that doc.merge() resizes doc.tensor""" - doc = en_tokenizer("a b c d") - doc.tensor = numpy.ones((len(doc), 128), dtype="f") - with doc.retokenize() as retokenizer: - retokenizer.merge(doc[0:2]) - assert len(doc) == 3 - 
assert doc.tensor.shape == (3, 128) - - -@pytest.mark.parametrize("label", ["U-JOB-NAME"]) -@pytest.mark.issue(1967) -def test_issue1967(label): - nlp = Language() - config = {} - ner = nlp.create_pipe("ner", config=config) - example = Example.from_dict( - Doc(ner.vocab, words=["word"]), - { - "ids": [0], - "words": ["word"], - "tags": ["tag"], - "heads": [0], - "deps": ["dep"], - "entities": [label], - }, - ) - assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1] - - -@pytest.mark.issue(1971) -def test_issue1971(en_vocab): - # Possibly related to #2675 and #2671? - matcher = Matcher(en_vocab) - pattern = [ - {"ORTH": "Doe"}, - {"ORTH": "!", "OP": "?"}, - {"_": {"optional": True}, "OP": "?"}, - {"ORTH": "!", "OP": "?"}, - ] - Token.set_extension("optional", default=False) - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"]) - # We could also assert length 1 here, but this is more conclusive, because - # the real problem here is that it returns a duplicate match for a match_id - # that's not actually in the vocab! - matches = matcher(doc) - assert all([match_id in en_vocab.strings for match_id, start, end in matches]) - - -def test_issue_1971_2(en_vocab): - matcher = Matcher(en_vocab) - pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}] - pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}] - doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"]) - matcher.add("TEST1", [pattern1, pattern2]) - matches = matcher(doc) - assert len(matches) == 2 - - -def test_issue_1971_3(en_vocab): - """Test that pattern matches correctly for multiple extension attributes.""" - Token.set_extension("a", default=1, force=True) - Token.set_extension("b", default=2, force=True) - doc = Doc(en_vocab, words=["hello", "world"]) - matcher = Matcher(en_vocab) - matcher.add("A", [[{"_": {"a": 1}}]]) - matcher.add("B", [[{"_": {"b": 2}}]]) - matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc)) - assert len(matches) == 4 - assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)]) - - -def test_issue_1971_4(en_vocab): - """Test that pattern matches correctly with multiple extension attribute - values on a single token. - """ - Token.set_extension("ext_a", default="str_a", force=True) - Token.set_extension("ext_b", default="str_b", force=True) - matcher = Matcher(en_vocab) - doc = Doc(en_vocab, words=["this", "is", "text"]) - pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 - matcher.add("TEST", [pattern]) - matches = matcher(doc) - # Uncommenting this caused a segmentation fault - assert len(matches) == 1 - assert matches[0] == (en_vocab.strings["TEST"], 0, 3) diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py deleted file mode 100644 index a07360c2c..000000000 --- a/spacy/tests/regression/test_issue2001-2500.py +++ /dev/null @@ -1,152 +0,0 @@ -import pytest -import numpy -from spacy.tokens import Doc -from spacy.matcher import Matcher -from spacy.displacy import render -from spacy.training import iob_to_biluo -from spacy.lang.it import Italian -from spacy.lang.en import English - -from ..util import add_vecs_to_vocab - - -@pytest.mark.skip( - reason="Can not be fixed without iterative looping between prefix/suffix and infix" -) -@pytest.mark.issue(2070) -def test_issue2070(): - """Test that checks that a dot followed by a quote is handled - appropriately. 
-    """
-    # Problem: The dot is now properly split off, but the prefix/suffix rules
-    # are not applied again afterwards. This means that the quote will still be
-    # attached to the remaining token.
-    nlp = English()
-    doc = nlp('First sentence."A quoted sentence" he said ...')
-    assert len(doc) == 11
-
-
-@pytest.mark.issue(2179)
-def test_issue2179():
-    """Test that spurious 'extra_labels' aren't created when initializing NER."""
-    nlp = Italian()
-    ner = nlp.add_pipe("ner")
-    ner.add_label("CITIZENSHIP")
-    nlp.initialize()
-    nlp2 = Italian()
-    nlp2.add_pipe("ner")
-    assert len(nlp2.get_pipe("ner").labels) == 0
-    model = nlp2.get_pipe("ner").model
-    model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves)
-    nlp2.from_bytes(nlp.to_bytes())
-    assert "extra_labels" not in nlp2.get_pipe("ner").cfg
-    assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
-
-
-@pytest.mark.issue(2203)
-def test_issue2203(en_vocab):
-    """Test that lemmas are set correctly in doc.from_array."""
-    words = ["I", "'ll", "survive"]
-    tags = ["PRP", "MD", "VB"]
-    lemmas = ["-PRON-", "will", "survive"]
-    tag_ids = [en_vocab.strings.add(tag) for tag in tags]
-    lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas]
-    doc = Doc(en_vocab, words=words)
-    # Work around lemma corruption problem and set lemmas after tags
-    doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
-    doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64"))
-    assert [t.tag_ for t in doc] == tags
-    assert [t.lemma_ for t in doc] == lemmas
-    # We need to serialize both tag and lemma, since this is what causes the bug
-    doc_array = doc.to_array(["TAG", "LEMMA"])
-    new_doc = Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array)
-    assert [t.tag_ for t in new_doc] == tags
-    assert [t.lemma_ for t in new_doc] == lemmas
-
-
-@pytest.mark.issue(2219)
-def test_issue2219(en_vocab):
-    vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])]
-    add_vecs_to_vocab(en_vocab, vectors)
-    [(word1, vec1), (word2, vec2)] = vectors
-    doc = Doc(en_vocab, words=[word1, word2])
-    assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
-
-
-@pytest.mark.issue(2361)
-def test_issue2361(de_vocab):
-    chars = ("&lt;", "&gt;", "&amp;", "&quot;")
-    words = ["<", ">", "&", '"']
-    doc = Doc(de_vocab, words=words, deps=["dep"] * len(words))
-    html = render(doc)
-    for char in chars:
-        assert char in html
-
-
-@pytest.mark.issue(2385)
-def test_issue2385():
-    """Test that IOB tags are correctly converted to BILUO tags."""
-    # fix bug in labels with a 'b' character
-    tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER")
-    assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"]
-    # maintain support for iob1 format
-    tags2 = ("I-ORG", "I-ORG", "B-ORG")
-    assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"]
-    # maintain support for iob2 format
-    tags3 = ("B-PERSON", "I-PERSON", "B-PERSON")
-    assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"]
-
-
-@pytest.mark.parametrize(
-    "tags",
-    [
-        ("B-ORG", "L-ORG"),
-        ("B-PERSON", "I-PERSON", "L-PERSON"),
-        ("U-BRAWLER", "U-BRAWLER"),
-    ],
-)
-@pytest.mark.issue(2385)
-def test_issue2385_biluo(tags):
-    """Test that BILUO-compatible tags aren't modified."""
-    assert iob_to_biluo(tags) == list(tags)
-
-
-@pytest.mark.issue(2396)
-def test_issue2396(en_vocab):
-    words = ["She", "created", "a", "test", "for", "spacy"]
-    heads = [1, 1, 3, 1, 3, 4]
-    deps = ["dep"] * len(heads)
-    matrix = numpy.array(
-        [
-            [0, 1, 1, 1, 1, 1],
-            [1, 1, 1, 1, 1, 1],
-            [1, 1, 2, 3, 3, 3],
-            [1, 1, 3, 3, 3, 3],
-            
[1, 1, 3, 3, 4, 4], - [1, 1, 3, 3, 4, 5], - ], - dtype=numpy.int32, - ) - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - span = doc[:] - assert (doc.get_lca_matrix() == matrix).all() - assert (span.get_lca_matrix() == matrix).all() - - -@pytest.mark.issue(2464) -def test_issue2464(en_vocab): - """Test problem with successive ?. This is the same bug, so putting it here.""" - matcher = Matcher(en_vocab) - doc = Doc(en_vocab, words=["a", "b"]) - matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]]) - matches = matcher(doc) - assert len(matches) == 3 - - -@pytest.mark.issue(2482) -def test_issue2482(): - """Test we can serialize and deserialize a blank NER or parser model.""" - nlp = Italian() - nlp.add_pipe("ner") - b = nlp.to_bytes() - Italian().from_bytes(b) diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py deleted file mode 100644 index cbb7f0621..000000000 --- a/spacy/tests/regression/test_issue2501-3000.py +++ /dev/null @@ -1,238 +0,0 @@ -import pytest -from spacy import displacy -from spacy.training import Example -from spacy.lang.en import English -from spacy.lang.ja import Japanese -from spacy.lang.xx import MultiLanguage -from spacy.language import Language -from spacy.matcher import Matcher -from spacy.tokens import Doc, Span -from spacy.vocab import Vocab -from spacy.compat import pickle -import numpy -import random - - -@pytest.mark.issue(2564) -def test_issue2564(): - """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe.""" - nlp = Language() - tagger = nlp.add_pipe("tagger") - tagger.add_label("A") - nlp.initialize() - doc = nlp("hello world") - assert doc.has_annotation("TAG") - docs = nlp.pipe(["hello", "world"]) - piped_doc = next(docs) - assert piped_doc.has_annotation("TAG") - - -@pytest.mark.issue(2569) -def test_issue2569(en_tokenizer): - """Test that operator + is greedy.""" - doc = en_tokenizer("It is May 15, 1993.") - doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])] - matcher = Matcher(doc.vocab) - matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]]) - matched = [doc[start:end] for _, start, end in matcher(doc)] - matched = sorted(matched, key=len, reverse=True) - assert len(matched) == 10 - assert len(matched[0]) == 4 - assert matched[0].text == "May 15, 1993" - - -@pytest.mark.parametrize( - "text", - [ - "ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration 
of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume",
-        "oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:",
-    ],
-)
-@pytest.mark.issue(2626)
-def test_issue2626_2835(en_tokenizer, text):
-    """Check that sentence doesn't cause an infinite loop in the tokenizer."""
-    doc = en_tokenizer(text)
-    assert doc
-
-
-@pytest.mark.issue(2656)
-def test_issue2656(en_tokenizer):
-    """Test that tokenizer correctly splits off punctuation after numbers with
-    decimal points.
-    """
-    doc = en_tokenizer("I went for 40.3, and got home by 10.0.")
-    assert len(doc) == 11
-    assert doc[0].text == "I"
-    assert doc[1].text == "went"
-    assert doc[2].text == "for"
-    assert doc[3].text == "40.3"
-    assert doc[4].text == ","
-    assert doc[5].text == "and"
-    assert doc[6].text == "got"
-    assert doc[7].text == "home"
-    assert doc[8].text == "by"
-    assert doc[9].text == "10.0"
-    assert doc[10].text == "."
-
-
-@pytest.mark.issue(2671)
-def test_issue2671():
-    """Ensure the correct entity ID is returned for matches with quantifiers.
-    See also #2675
-    """
-    nlp = English()
-    matcher = Matcher(nlp.vocab)
-    pattern_id = "test_pattern"
-    pattern = [
-        {"LOWER": "high"},
-        {"IS_PUNCT": True, "OP": "?"},
-        {"LOWER": "adrenaline"},
-    ]
-    matcher.add(pattern_id, [pattern])
-    doc1 = nlp("This is a high-adrenaline situation.")
-    doc2 = nlp("This is a high adrenaline situation.")
-    matches1 = matcher(doc1)
-    for match_id, start, end in matches1:
-        assert nlp.vocab.strings[match_id] == pattern_id
-    matches2 = matcher(doc2)
-    for match_id, start, end in matches2:
-        assert nlp.vocab.strings[match_id] == pattern_id
-
-
-@pytest.mark.issue(2728)
-def test_issue2728(en_vocab):
-    """Test that displaCy ENT visualizer escapes HTML correctly."""
-    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
-    doc.ents = [Span(doc, 0, 1, label="TEST")]
-    html = displacy.render(doc, style="ent")
-    assert "&lt;RELEASE&gt;" in html
-    doc.ents = [Span(doc, 1, 2, label="TEST")]
-    html = displacy.render(doc, style="ent")
-    assert "&lt;RELEASE&gt;" in html
-
-
-@pytest.mark.issue(2754)
-def test_issue2754(en_tokenizer):
-    """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
-    a = en_tokenizer("a")
-    assert a[0].norm_ == "a"
-    am = en_tokenizer("am")
-    assert am[0].norm_ == "am"
-
-
-@pytest.mark.issue(2772)
-def test_issue2772(en_vocab):
-    """Test that deprojectivization doesn't mess up sentence boundaries."""
-    # fmt: off
-    words = ["When", "we", "write", "or", "communicate", "virtually", ",", "we", "can", "hide", "our", "true", "feelings", "."]
-    # fmt: on
-    # A tree with a non-projective (i.e. crossing) arc
-    # The arcs (0, 4) and (2, 9) cross.
- heads = [4, 2, 9, 2, 2, 4, 9, 9, 9, 9, 12, 12, 9, 9] - deps = ["dep"] * len(heads) - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - assert doc[1].is_sent_start is False - - -@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) -@pytest.mark.parametrize("lang_cls", [English, MultiLanguage]) -@pytest.mark.issue(2782) -def test_issue2782(text, lang_cls): - """Check that like_num handles + and - before number.""" - nlp = lang_cls() - doc = nlp(text) - assert len(doc) == 1 - assert doc[0].like_num - - -@pytest.mark.issue(2800) -def test_issue2800(): - """Test issue that arises when too many labels are added to NER model. - Used to cause segfault. - """ - nlp = English() - train_data = [] - train_data.extend( - [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})] - ) - entity_types = [str(i) for i in range(1000)] - ner = nlp.add_pipe("ner") - for entity_type in list(entity_types): - ner.add_label(entity_type) - optimizer = nlp.initialize() - for i in range(20): - losses = {} - random.shuffle(train_data) - for example in train_data: - nlp.update([example], sgd=optimizer, losses=losses, drop=0.5) - - -@pytest.mark.issue(2822) -def test_issue2822(it_tokenizer): - """Test that the abbreviation of poco is kept as one word.""" - doc = it_tokenizer("Vuoi un po' di zucchero?") - assert len(doc) == 6 - assert doc[0].text == "Vuoi" - assert doc[1].text == "un" - assert doc[2].text == "po'" - assert doc[3].text == "di" - assert doc[4].text == "zucchero" - assert doc[5].text == "?" - - -@pytest.mark.issue(2833) -def test_issue2833(en_vocab): - """Test that a custom error is raised if a token or span is pickled.""" - doc = Doc(en_vocab, words=["Hello", "world"]) - with pytest.raises(NotImplementedError): - pickle.dumps(doc[0]) - with pytest.raises(NotImplementedError): - pickle.dumps(doc[0:2]) - - -@pytest.mark.issue(2871) -def test_issue2871(): - """Test that vectors recover the correct key for spaCy reserved words.""" - words = ["dog", "cat", "SUFFIX"] - vocab = Vocab(vectors_name="test_issue2871") - vocab.vectors.resize(shape=(3, 10)) - vector_data = numpy.zeros((3, 10), dtype="f") - for word in words: - _ = vocab[word] # noqa: F841 - vocab.set_vector(word, vector_data[0]) - vocab.vectors.name = "dummy_vectors" - assert vocab["dog"].rank == 0 - assert vocab["cat"].rank == 1 - assert vocab["SUFFIX"].rank == 2 - assert vocab.vectors.find(key="dog") == 0 - assert vocab.vectors.find(key="cat") == 1 - assert vocab.vectors.find(key="SUFFIX") == 2 - - -@pytest.mark.issue(2901) -def test_issue2901(): - """Test that `nlp` doesn't fail.""" - try: - nlp = Japanese() - except ImportError: - pytest.skip() - - doc = nlp("pythonが大好きです") - assert doc - - -@pytest.mark.issue(2926) -def test_issue2926(fr_tokenizer): - """Test that the tokenizer correctly splits tokens separated by a slash (/) - ending in a digit. 
- """ - doc = fr_tokenizer("Learn html5/css3/javascript/jquery") - assert len(doc) == 8 - assert doc[0].text == "Learn" - assert doc[1].text == "html5" - assert doc[2].text == "/" - assert doc[3].text == "css3" - assert doc[4].text == "/" - assert doc[5].text == "javascript" - assert doc[6].text == "/" - assert doc[7].text == "jquery" diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py deleted file mode 100644 index 6220003dc..000000000 --- a/spacy/tests/regression/test_issue3001-3500.py +++ /dev/null @@ -1,272 +0,0 @@ -import pytest -from spacy import registry -from spacy.lang.en import English -from spacy.lang.de import German -from spacy.pipeline.ner import DEFAULT_NER_MODEL -from spacy.pipeline import EntityRuler, EntityRecognizer -from spacy.matcher import Matcher, PhraseMatcher -from spacy.tokens import Doc -from spacy.vocab import Vocab -from spacy.attrs import ENT_IOB, ENT_TYPE -from spacy.compat import pickle -from spacy import displacy -from spacy.vectors import Vectors -import numpy - - -@pytest.mark.issue(3002) -def test_issue3002(): - """Test that the tokenizer doesn't hang on a long list of dots""" - nlp = German() - doc = nlp( - "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl" - ) - assert len(doc) == 5 - - -@pytest.mark.issue(3009) -def test_issue3009(en_vocab): - """Test problem with matcher quantifiers""" - patterns = [ - [{"ORTH": "has"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}], - [ - {"ORTH": "has"}, - {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, - {"LOWER": "to"}, - {"LOWER": "do"}, - {"TAG": "IN"}, - ], - [ - {"ORTH": "has"}, - {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, - {"LOWER": "to"}, - {"LOWER": "do"}, - {"TAG": "IN"}, - ], - ] - words = ["also", "has", "to", "do", "with"] - tags = ["RB", "VBZ", "TO", "VB", "IN"] - pos = ["ADV", "VERB", "ADP", "VERB", "ADP"] - doc = Doc(en_vocab, words=words, tags=tags, pos=pos) - matcher = Matcher(en_vocab) - for i, pattern in enumerate(patterns): - matcher.add(str(i), [pattern]) - matches = matcher(doc) - assert matches - - -@pytest.mark.issue(3012) -def test_issue3012(en_vocab): - """Test that the is_tagged attribute doesn't get overwritten when we from_array - without tag information.""" - words = ["This", "is", "10", "%", "."] - tags = ["DT", "VBZ", "CD", "NN", "."] - pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] - ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"] - doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) - assert doc.has_annotation("TAG") - expected = ("10", "NUM", "CD", "PERCENT") - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - header = [ENT_IOB, ENT_TYPE] - ent_array = doc.to_array(header) - doc.from_array(header, ent_array) - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - # Serializing then deserializing - doc_bytes = doc.to_bytes() - doc2 = Doc(en_vocab).from_bytes(doc_bytes) - assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected - - -@pytest.mark.issue(3199) -def test_issue3199(): - """Test that Span.noun_chunks works correctly if no noun chunks iterator - is available. To make this test future-proof, we're constructing a Doc - with a new Vocab here and a parse tree to make sure the noun chunks run. 
- """ - words = ["This", "is", "a", "sentence"] - doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words)) - with pytest.raises(NotImplementedError): - list(doc[0:3].noun_chunks) - - -@pytest.mark.issue(3209) -def test_issue3209(): - """Test issue that occurred in spaCy nightly where NER labels were being - mapped to classes incorrectly after loading the model, when the labels - were added using ner.add_label(). - """ - nlp = English() - ner = nlp.add_pipe("ner") - ner.add_label("ANIMAL") - nlp.initialize() - move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] - assert ner.move_names == move_names - nlp2 = English() - ner2 = nlp2.add_pipe("ner") - model = ner2.model - model.attrs["resize_output"](model, ner.moves.n_moves) - nlp2.from_bytes(nlp.to_bytes()) - assert ner2.move_names == move_names - - -@pytest.mark.issue(3248) -def test_issue3248_1(): - """Test that the PhraseMatcher correctly reports its number of rules, not - total number of patterns.""" - nlp = English() - matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) - matcher.add("TEST2", [nlp("d")]) - assert len(matcher) == 2 - - -@pytest.mark.issue(3248) -def test_issue3248_2(): - """Test that the PhraseMatcher can be pickled correctly.""" - nlp = English() - matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) - matcher.add("TEST2", [nlp("d")]) - data = pickle.dumps(matcher) - new_matcher = pickle.loads(data) - assert len(new_matcher) == len(matcher) - - -@pytest.mark.issue(3277) -def test_issue3277(es_tokenizer): - """Test that hyphens are split correctly as prefixes.""" - doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.") - assert len(doc) == 14 - assert doc[0].text == "\u2014" - assert doc[5].text == "\u2013" - assert doc[9].text == "\u2013" - - -@pytest.mark.issue(3288) -def test_issue3288(en_vocab): - """Test that retokenization works correctly via displaCy when punctuation - is merged onto the preceeding token and tensor is resized.""" - words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"] - heads = [1, 1, 1, 4, 4, 6, 4, 4] - deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"] - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - doc.tensor = numpy.zeros((len(words), 96), dtype="float32") - displacy.render(doc) - - -@pytest.mark.issue(3289) -def test_issue3289(): - """Test that Language.to_bytes handles serializing a pipeline component - with an uninitialized model.""" - nlp = English() - nlp.add_pipe("textcat") - bytes_data = nlp.to_bytes() - new_nlp = English() - new_nlp.add_pipe("textcat") - new_nlp.from_bytes(bytes_data) - - -@pytest.mark.issue(3328) -def test_issue3328(en_vocab): - doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) - matcher = Matcher(en_vocab) - patterns = [ - [{"LOWER": {"IN": ["hello", "how"]}}], - [{"LOWER": {"IN": ["you", "doing"]}}], - ] - matcher.add("TEST", patterns) - matches = matcher(doc) - assert len(matches) == 4 - matched_texts = [doc[start:end].text for _, start, end in matches] - assert matched_texts == ["Hello", "how", "you", "doing"] - - -@pytest.mark.issue(3331) -def test_issue3331(en_vocab): - """Test that duplicate patterns for different rules result in multiple - matches, one per rule. 
- """ - matcher = PhraseMatcher(en_vocab) - matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])]) - matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])]) - doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) - matches = matcher(doc) - assert len(matches) == 2 - match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] - assert sorted(match_ids) == ["A", "B"] - - -@pytest.mark.issue(3345) -def test_issue3345(): - """Test case where preset entity crosses sentence boundary.""" - nlp = English() - doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) - doc[4].is_sent_start = True - ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - cfg = {"model": DEFAULT_NER_MODEL} - model = registry.resolve(cfg, validate=True)["model"] - ner = EntityRecognizer(doc.vocab, model) - # Add the OUT action. I wouldn't have thought this would be necessary... - ner.moves.add_action(5, "") - ner.add_label("GPE") - doc = ruler(doc) - # Get into the state just before "New" - state = ner.moves.init_batch([doc])[0] - ner.moves.apply_transition(state, "O") - ner.moves.apply_transition(state, "O") - ner.moves.apply_transition(state, "O") - # Check that B-GPE is valid. - assert ner.moves.is_valid(state, "B-GPE") - - -@pytest.mark.issue(3412) -def test_issue3412(): - data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f") - vectors = Vectors(data=data, keys=["A", "B", "C"]) - keys, best_rows, scores = vectors.most_similar( - numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f") - ) - assert best_rows[0] == 2 - - -@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot") -@pytest.mark.issue(3449) -def test_issue3449(): - nlp = English() - nlp.add_pipe("sentencizer") - text1 = "He gave the ball to I. Do you want to go to the movies with I?" - text2 = "He gave the ball to I. Do you want to go to the movies with I?" - text3 = "He gave the ball to I.\nDo you want to go to the movies with I?" 
- t1 = nlp(text1) - t2 = nlp(text2) - t3 = nlp(text3) - assert t1[5].text == "I" - assert t2[5].text == "I" - assert t3[5].text == "I" - - -@pytest.mark.issue(3456) -def test_issue3456(): - # this crashed because of a padding error in layer.ops.unflatten in thinc - nlp = English() - tagger = nlp.add_pipe("tagger") - tagger.add_label("A") - nlp.initialize() - list(nlp.pipe(["hi", ""])) - - -@pytest.mark.issue(3468) -def test_issue3468(): - """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can - be restored after serialization.""" - nlp = English() - nlp.add_pipe("sentencizer") - doc = nlp("Hello world") - assert doc[0].is_sent_start - assert doc.has_annotation("SENT_START") - assert len(list(doc.sents)) == 1 - doc_bytes = doc.to_bytes() - new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) - assert new_doc[0].is_sent_start - assert new_doc.has_annotation("SENT_START") - assert len(list(new_doc.sents)) == 1 diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py deleted file mode 100644 index 5d9bc4e83..000000000 --- a/spacy/tests/regression/test_issue3501-4000.py +++ /dev/null @@ -1,492 +0,0 @@ -import pytest -from spacy.language import Language -from spacy.vocab import Vocab -from spacy.pipeline import EntityRuler, DependencyParser -from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL -from spacy import displacy, load -from spacy.displacy import parse_deps -from spacy.tokens import Doc, Token -from spacy.matcher import Matcher, PhraseMatcher -from spacy.errors import MatchPatternError -from spacy.util import minibatch -from spacy.training import Example -from spacy.lang.hi import Hindi -from spacy.lang.es import Spanish -from spacy.lang.en import English -from spacy.attrs import IS_ALPHA -from spacy import registry -from thinc.api import compounding -import spacy -import srsly -import numpy - -from ..util import make_tempdir - - -@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) -@pytest.mark.issue(3521) -def test_issue3521(en_tokenizer, word): - tok = en_tokenizer(word)[1] - # 'not' and 'would' should be stopwords, also in their abbreviated forms - assert tok.is_stop - - -def test_issue_3526_1(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - ruler_bytes = ruler.to_bytes() - assert len(ruler) == len(patterns) - assert len(ruler.labels) == 4 - assert ruler.overwrite - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(ruler_bytes) - assert len(new_ruler) == len(ruler) - assert len(new_ruler.labels) == 4 - assert new_ruler.overwrite == ruler.overwrite - assert new_ruler.ent_id_sep == ruler.ent_id_sep - - -def test_issue_3526_2(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - bytes_old_style = srsly.msgpack_dumps(ruler.patterns) - 
new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(bytes_old_style) - assert len(new_ruler) == len(ruler) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert new_ruler.overwrite is not ruler.overwrite - - -def test_issue_3526_3(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - with make_tempdir() as tmpdir: - out_file = tmpdir / "entity_ruler" - srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) - new_ruler = EntityRuler(nlp).from_disk(out_file) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert len(new_ruler) == len(ruler) - assert new_ruler.overwrite is not ruler.overwrite - - -def test_issue_3526_4(en_vocab): - nlp = Language(vocab=en_vocab) - patterns = [{"label": "ORG", "pattern": "Apple"}] - config = {"overwrite_ents": True} - ruler = nlp.add_pipe("entity_ruler", config=config) - ruler.add_patterns(patterns) - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir) - ruler = nlp.get_pipe("entity_ruler") - assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert ruler.overwrite is True - nlp2 = load(tmpdir) - new_ruler = nlp2.get_pipe("entity_ruler") - assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert new_ruler.overwrite is True - - -@pytest.mark.issue(3531) -def test_issue3531(): - """Test that displaCy renderer doesn't require "settings" key.""" - example_dep = { - "words": [ - {"text": "But", "tag": "CCONJ"}, - {"text": "Google", "tag": "PROPN"}, - {"text": "is", "tag": "VERB"}, - {"text": "starting", "tag": "VERB"}, - {"text": "from", "tag": "ADP"}, - {"text": "behind.", "tag": "ADV"}, - ], - "arcs": [ - {"start": 0, "end": 3, "label": "cc", "dir": "left"}, - {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, - {"start": 2, "end": 3, "label": "aux", "dir": "left"}, - {"start": 3, "end": 4, "label": "prep", "dir": "right"}, - {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, - ], - } - example_ent = { - "text": "But Google is starting from behind.", - "ents": [{"start": 4, "end": 10, "label": "ORG"}], - } - dep_html = displacy.render(example_dep, style="dep", manual=True) - assert dep_html - ent_html = displacy.render(example_ent, style="ent", manual=True) - assert ent_html - - -@pytest.mark.issue(3540) -def test_issue3540(en_vocab): - words = ["I", "live", "in", "NewYork", "right", "now"] - tensor = numpy.asarray( - [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], - dtype="f", - ) - doc = Doc(en_vocab, words=words) - doc.tensor = tensor - gold_text = ["I", "live", "in", "NewYork", "right", "now"] - assert [token.text for token in doc] == gold_text - gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] - for i, lemma in enumerate(gold_lemma): - doc[i].lemma_ = lemma - assert [token.lemma_ for token in doc] == gold_lemma - vectors_1 = [token.vector for token in doc] - assert len(vectors_1) == len(doc) - - with doc.retokenize() as retokenizer: - heads = [(doc[3], 1), doc[2]] - attrs = { - "POS": ["PROPN", "PROPN"], - "LEMMA": ["New", "York"], - "DEP": ["pobj", "compound"], - } - retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) 
- - gold_text = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.text for token in doc] == gold_text - gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.lemma_ for token in doc] == gold_lemma - vectors_2 = [token.vector for token in doc] - assert len(vectors_2) == len(doc) - assert vectors_1[0].tolist() == vectors_2[0].tolist() - assert vectors_1[1].tolist() == vectors_2[1].tolist() - assert vectors_1[2].tolist() == vectors_2[2].tolist() - assert vectors_1[4].tolist() == vectors_2[5].tolist() - assert vectors_1[5].tolist() == vectors_2[6].tolist() - - -@pytest.mark.issue(3549) -def test_issue3549(en_vocab): - """Test that match pattern validation doesn't raise on empty errors.""" - matcher = Matcher(en_vocab, validate=True) - pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] - matcher.add("GOOD", [pattern]) - with pytest.raises(MatchPatternError): - matcher.add("BAD", [[{"X": "Y"}]]) - - -@pytest.mark.skip("Matching currently only works on strings and integers") -@pytest.mark.issue(3555) -def test_issue3555(en_vocab): - """Test that custom extensions with default None don't break matcher.""" - Token.set_extension("issue3555", default=None) - matcher = Matcher(en_vocab) - pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["have", "apple"]) - matcher(doc) - - -@pytest.mark.issue(3611) -def test_issue3611(): - """Test whether adding n-grams in the textcat works even when n > token length of some docs""" - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - nlp = spacy.blank("en") - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) - # add a text categorizer component - model = { - "@architectures": "spacy.TextCatBOW.v1", - "exclusive_classes": True, - "ngram_size": 2, - "no_output_layer": False, - } - textcat = nlp.add_pipe("textcat", config={"model": model}, last=True) - for label in unique_classes: - textcat.add_label(label) - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.initialize() - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) - - -@pytest.mark.issue(3625) -def test_issue3625(): - """Test that default punctuation rules applies to hindi unicode characters""" - nlp = Hindi() - doc = nlp("hi. how हुए. 
होटल, होटल") - expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] - assert [token.text for token in doc] == expected - - -@pytest.mark.issue(3803) -def test_issue3803(): - """Test that spanish num-like tokens have True for like_num attribute.""" - nlp = Spanish() - text = "2 dos 1000 mil 12 doce" - doc = nlp(text) - - assert [t.like_num for t in doc] == [True, True, True, True, True, True] - - -def _parser_example(parser): - doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} - return Example.from_dict(doc, gold) - - -@pytest.mark.issue(3830) -def test_issue3830_no_subtok(): - """Test that the parser doesn't have subtok label if not learn_tokens""" - config = { - "learn_tokens": False, - } - model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] - parser = DependencyParser(Vocab(), model, **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.initialize(lambda: [_parser_example(parser)]) - assert "subtok" not in parser.labels - - -@pytest.mark.issue(3830) -def test_issue3830_with_subtok(): - """Test that the parser does have subtok label if learn_tokens=True.""" - config = { - "learn_tokens": True, - } - model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] - parser = DependencyParser(Vocab(), model, **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.initialize(lambda: [_parser_example(parser)]) - assert "subtok" in parser.labels - - -@pytest.mark.issue(3839) -def test_issue3839(en_vocab): - """Test that match IDs returned by the matcher are correct, are in the string""" - doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) - matcher = Matcher(en_vocab) - match_id = "PATTERN" - pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] - pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] - matcher.add(match_id, [pattern1]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] - matcher = Matcher(en_vocab) - matcher.add(match_id, [pattern2]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] - - -@pytest.mark.parametrize( - "sentence", - [ - "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", - "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", - "It was a missed assignment, but it shouldn't have resulted in a turnover ...", - ], -) -@pytest.mark.issue(3869) -def test_issue3869(sentence): - """Test that the Doc's count_by function works consistently""" - nlp = English() - doc = nlp(sentence) - count = 0 - for token in doc: - count += token.is_alpha - assert count == doc.count_by(IS_ALPHA).get(1, 0) - - -@pytest.mark.issue(3879) -def test_issue3879(en_vocab): - doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) - assert len(doc) == 5 - pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] - matcher = Matcher(en_vocab) - matcher.add("TEST", [pattern]) - assert len(matcher(doc)) == 2 
# fails because of a FP match 'is a test' - - -@pytest.mark.issue(3880) -def test_issue3880(): - """Test that `nlp.pipe()` works when an empty string ends the batch. - - Fixed in v7.0.5 of Thinc. - """ - texts = ["hello", "world", "", ""] - nlp = English() - nlp.add_pipe("parser").add_label("dep") - nlp.add_pipe("ner").add_label("PERSON") - nlp.add_pipe("tagger").add_label("NN") - nlp.initialize() - for doc in nlp.pipe(texts): - pass - - -@pytest.mark.issue(3882) -def test_issue3882(en_vocab): - """Test that displaCy doesn't serialize the doc.user_data when making a - copy of the Doc. - """ - doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"]) - doc.user_data["test"] = set() - parse_deps(doc) - - -@pytest.mark.issue(3951) -def test_issue3951(en_vocab): - """Test that combinations of optional rules are matched correctly.""" - matcher = Matcher(en_vocab) - pattern = [ - {"LOWER": "hello"}, - {"LOWER": "this", "OP": "?"}, - {"OP": "?"}, - {"LOWER": "world"}, - ] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) - matches = matcher(doc) - assert len(matches) == 0 - - -@pytest.mark.issue(3959) -def test_issue3959(): - """Ensure that a modified pos attribute is serialized correctly.""" - nlp = English() - doc = nlp( - "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" - ) - assert doc[0].pos_ == "" - doc[0].pos_ = "NOUN" - assert doc[0].pos_ == "NOUN" - # usually this is already True when starting from proper models instead of blank English - with make_tempdir() as tmp_dir: - file_path = tmp_dir / "my_doc" - doc.to_disk(file_path) - doc2 = nlp("") - doc2.from_disk(file_path) - assert doc2[0].pos_ == "NOUN" - - -@pytest.mark.issue(3962) -def test_issue3962(en_vocab): - """Ensure that as_doc does not result in out-of-bound access of tokens. - This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - # fmt: off - words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] - heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7] - deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] - # fmt: on - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - span2 = doc[1:5] # "jests at scars ," - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - # head set to itself, being the new artificial root - assert doc2[0].head.text == "jests" - assert doc2[0].dep_ == "dep" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" # head set to the new artificial root - assert doc2[3].dep_ == "dep" - # We should still have 1 sentence - assert len(list(doc2.sents)) == 1 - span3 = doc[6:9] # "never felt a" - doc3 = span3.as_doc() - doc3_json = doc3.to_json() - assert doc3_json - assert doc3[0].head.text == "felt" - assert doc3[0].dep_ == "neg" - assert doc3[1].head.text == "felt" - assert doc3[1].dep_ == "ROOT" - assert doc3[2].head.text == "felt" # head set to ancestor - assert doc3[2].dep_ == "dep" - # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" - assert len(list(doc3.sents)) == 1 - - -@pytest.mark.issue(3962) -def test_issue3962_long(en_vocab): - """Ensure that as_doc does not result in out-of-bound access of tokens. 
- This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - # fmt: off - words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] - heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7] - deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] - # fmt: on - two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - span2 = two_sent_doc[1:7] # "jests at scars. They never" - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - # head set to itself, being the new artificial root (in sentence 1) - assert doc2[0].head.text == "jests" - assert doc2[0].dep_ == "ROOT" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" - assert doc2[3].dep_ == "punct" - # head set to itself, being the new artificial root (in sentence 2) - assert doc2[4].head.text == "They" - assert doc2[4].dep_ == "dep" - # head set to the new artificial head (in sentence 2) - assert doc2[4].head.text == "They" - assert doc2[4].dep_ == "dep" - # We should still have 2 sentences - sents = list(doc2.sents) - assert len(sents) == 2 - assert sents[0].text == "jests at scars ." - assert sents[1].text == "They never" - - -@pytest.mark.issue(3972) -def test_issue3972(en_vocab): - """Test that the PhraseMatcher returns duplicates for duplicate match IDs.""" - matcher = PhraseMatcher(en_vocab) - matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) - matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) - doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) - matches = matcher(doc) - - assert len(matches) == 2 - - # We should have a match for each of the two rules - found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] - assert "A" in found_ids - assert "B" in found_ids diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py deleted file mode 100644 index 7b7c304a3..000000000 --- a/spacy/tests/regression/test_issue4001-4500.py +++ /dev/null @@ -1,447 +0,0 @@ -import pytest -from spacy.pipeline import TrainablePipe -from spacy.matcher import PhraseMatcher, Matcher -from spacy.tokens import Doc, Span, DocBin -from spacy.training import Example, Corpus -from spacy.training.converters import json_to_docs -from spacy.vocab import Vocab -from spacy.lang.en import English -from spacy.util import minibatch, ensure_path, load_model -from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex -from spacy.tokenizer import Tokenizer -from spacy.lang.el import Greek -from spacy.language import Language -import spacy -from thinc.api import compounding - -from ..util import make_tempdir - - -@pytest.mark.issue(4002) -def test_issue4002(en_vocab): - """Test that the PhraseMatcher can match on overwritten NORM attributes.""" - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern1 = Doc(en_vocab, words=["c", "d"]) - assert [t.norm_ for t in pattern1] == ["c", "d"] - matcher.add("TEST", [pattern1]) - doc = Doc(en_vocab, words=["a", "b", "c", "d"]) - assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] - matches = matcher(doc) - assert len(matches) == 1 - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern2 = Doc(en_vocab, words=["1", "2"]) - pattern2[0].norm_ = "c" - pattern2[1].norm_ = "d" - assert [t.norm_ for t in pattern2] == ["c", "d"] - matcher.add("TEST", [pattern2]) - matches = matcher(doc) 
- assert len(matches) == 1 - - -@pytest.mark.issue(4030) -def test_issue4030(): - """Test whether textcat works fine with empty doc""" - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - nlp = spacy.blank("en") - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) - # add a text categorizer component - model = { - "@architectures": "spacy.TextCatBOW.v1", - "exclusive_classes": True, - "ngram_size": 2, - "no_output_layer": False, - } - textcat = nlp.add_pipe("textcat", config={"model": model}, last=True) - for label in unique_classes: - textcat.add_label(label) - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.initialize() - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) - # processing of an empty doc should result in 0.0 for all categories - doc = nlp("") - assert doc.cats["offensive"] == 0.0 - assert doc.cats["inoffensive"] == 0.0 - - -@pytest.mark.issue(4042) -def test_issue4042(): - """Test that serialization of an EntityRuler before NER works fine.""" - nlp = English() - # add ner pipe - ner = nlp.add_pipe("ner") - ner.add_label("SOME_LABEL") - nlp.initialize() - # Add entity ruler - patterns = [ - {"label": "MY_ORG", "pattern": "Apple"}, - {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, - ] - # works fine with "after" - ruler = nlp.add_pipe("entity_ruler", before="ner") - ruler.add_patterns(patterns) - doc1 = nlp("What do you think about Apple ?") - assert doc1.ents[0].label_ == "MY_ORG" - - with make_tempdir() as d: - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - nlp2 = load_model(output_dir) - doc2 = nlp2("What do you think about Apple ?") - assert doc2.ents[0].label_ == "MY_ORG" - - -@pytest.mark.issue(4042) -def test_issue4042_bug2(): - """ - Test that serialization of an NER works fine when new labels were added. - This is the second bug of two bugs underlying the issue 4042. - """ - nlp1 = English() - # add ner pipe - ner1 = nlp1.add_pipe("ner") - ner1.add_label("SOME_LABEL") - nlp1.initialize() - # add a new label to the doc - doc1 = nlp1("What do you think about Apple ?") - assert len(ner1.labels) == 1 - assert "SOME_LABEL" in ner1.labels - apple_ent = Span(doc1, 5, 6, label="MY_ORG") - doc1.ents = list(doc1.ents) + [apple_ent] - # Add the label explicitly. Previously we didn't require this. 
- ner1.add_label("MY_ORG") - ner1(doc1) - assert len(ner1.labels) == 2 - assert "SOME_LABEL" in ner1.labels - assert "MY_ORG" in ner1.labels - with make_tempdir() as d: - # assert IO goes fine - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - ner1.to_disk(output_dir) - config = {} - ner2 = nlp1.create_pipe("ner", config=config) - ner2.from_disk(output_dir) - assert len(ner2.labels) == 2 - - -@pytest.mark.issue(4054) -def test_issue4054(en_vocab): - """Test that a new blank model can be made with a vocab from file, - and that serialization does not drop the language at any point.""" - nlp1 = English() - vocab1 = nlp1.vocab - with make_tempdir() as d: - vocab_dir = ensure_path(d / "vocab") - if not vocab_dir.exists(): - vocab_dir.mkdir() - vocab1.to_disk(vocab_dir) - vocab2 = Vocab().from_disk(vocab_dir) - nlp2 = spacy.blank("en", vocab=vocab2) - nlp_dir = ensure_path(d / "nlp") - if not nlp_dir.exists(): - nlp_dir.mkdir() - nlp2.to_disk(nlp_dir) - nlp3 = load_model(nlp_dir) - assert nlp3.lang == "en" - - -@pytest.mark.issue(4120) -def test_issue4120(en_vocab): - """Test that matches without a final {OP: ?} token are returned.""" - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) - doc1 = Doc(en_vocab, words=["a"]) - assert len(matcher(doc1)) == 1 # works - doc2 = Doc(en_vocab, words=["a", "b", "c"]) - assert len(matcher(doc2)) == 2 # fixed - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) - doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc3)) == 2 # works - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) - doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc4)) == 3 # fixed - - -@pytest.mark.issue(4133) -def test_issue4133(en_vocab): - nlp = English() - vocab_bytes = nlp.vocab.to_bytes() - words = ["Apple", "is", "looking", "at", "buying", "a", "startup"] - pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] - doc = Doc(en_vocab, words=words) - for i, token in enumerate(doc): - token.pos_ = pos[i] - # usually this is already True when starting from proper models instead of blank English - doc_bytes = doc.to_bytes() - vocab = Vocab() - vocab = vocab.from_bytes(vocab_bytes) - doc = Doc(vocab).from_bytes(doc_bytes) - actual = [] - for token in doc: - actual.append(token.pos_) - assert actual == pos - - -@pytest.mark.issue(4190) -def test_issue4190(): - def customize_tokenizer(nlp): - prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) - suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) - infix_re = compile_infix_regex(nlp.Defaults.infixes) - # Remove all exceptions where a single letter is followed by a period (e.g. 'h.') - exceptions = { - k: v - for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() - if not (len(k) == 2 and k[1] == ".") - } - new_tokenizer = Tokenizer( - nlp.vocab, - exceptions, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - token_match=nlp.tokenizer.token_match, - ) - nlp.tokenizer = new_tokenizer - - test_string = "Test c." 
- # Load default language - nlp_1 = English() - doc_1a = nlp_1(test_string) - result_1a = [token.text for token in doc_1a] # noqa: F841 - # Modify tokenizer - customize_tokenizer(nlp_1) - doc_1b = nlp_1(test_string) - result_1b = [token.text for token in doc_1b] - # Save and Reload - with make_tempdir() as model_dir: - nlp_1.to_disk(model_dir) - nlp_2 = load_model(model_dir) - # This should be the modified tokenizer - doc_2 = nlp_2(test_string) - result_2 = [token.text for token in doc_2] - assert result_1b == result_2 - - -@pytest.mark.issue(4267) -def test_issue4267(): - """Test that running an entity_ruler after ner gives consistent results""" - nlp = English() - ner = nlp.add_pipe("ner") - ner.add_label("PEOPLE") - nlp.initialize() - assert "ner" in nlp.pipe_names - # assert that we have correct IOB annotations - doc1 = nlp("hi") - assert doc1.has_annotation("ENT_IOB") - for token in doc1: - assert token.ent_iob == 2 - # add entity ruler and run again - patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - assert "entity_ruler" in nlp.pipe_names - assert "ner" in nlp.pipe_names - # assert that we still have correct IOB annotations - doc2 = nlp("hi") - assert doc2.has_annotation("ENT_IOB") - for token in doc2: - assert token.ent_iob == 2 - - -@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab") -@pytest.mark.issue(4272) -def test_issue4272(): - """Test that lookup table can be accessed from Token.lemma if no POS tags - are available.""" - nlp = Greek() - doc = nlp("Χθες") - assert doc[0].lemma_ - - -def test_multiple_predictions(): - class DummyPipe(TrainablePipe): - def __init__(self): - self.model = "dummy_model" - - def predict(self, docs): - return ([1, 2, 3], [4, 5, 6]) - - def set_annotations(self, docs, scores): - return docs - - nlp = Language() - doc = nlp.make_doc("foo") - dummy_pipe = DummyPipe() - dummy_pipe(doc) - - -@pytest.mark.issue(4313) -def test_issue4313(): - """This should not crash or exit with some strange error code""" - beam_width = 16 - beam_density = 0.0001 - nlp = English() - config = { - "beam_width": beam_width, - "beam_density": beam_density, - } - ner = nlp.add_pipe("beam_ner", config=config) - ner.add_label("SOME_LABEL") - nlp.initialize() - # add a new label to the doc - doc = nlp("What do you think about Apple ?") - assert len(ner.labels) == 1 - assert "SOME_LABEL" in ner.labels - apple_ent = Span(doc, 5, 6, label="MY_ORG") - doc.ents = list(doc.ents) + [apple_ent] - - # ensure the beam_parse still works with the new label - docs = [doc] - ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density) - assert len(ner.labels) == 2 - assert "MY_ORG" in ner.labels - - -@pytest.mark.issue(4348) -def test_issue4348(): - """Test that training the tagger with empty data, doesn't throw errors""" - nlp = English() - example = Example.from_dict(nlp.make_doc(""), {"tags": []}) - TRAIN_DATA = [example, example] - tagger = nlp.add_pipe("tagger") - tagger.add_label("A") - optimizer = nlp.initialize() - for i in range(5): - losses = {} - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) - - -@pytest.mark.issue(4367) -def test_issue4367(): - """Test that docbin init goes well""" - DocBin() - DocBin(attrs=["LEMMA"]) - DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) - - -@pytest.mark.issue(4373) -def test_issue4373(): - """Test that PhraseMatcher.vocab can be accessed (like 
Matcher.vocab).""" - matcher = Matcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) - matcher = PhraseMatcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) - - -@pytest.mark.issue(4402) -def test_issue4402(): - json_data = { - "id": 0, - "paragraphs": [ - { - "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "How", "ner": "O"}, - {"id": 1, "orth": "should", "ner": "O"}, - {"id": 2, "orth": "I", "ner": "O"}, - {"id": 3, "orth": "cook", "ner": "O"}, - {"id": 4, "orth": "bacon", "ner": "O"}, - {"id": 5, "orth": "in", "ner": "O"}, - {"id": 6, "orth": "an", "ner": "O"}, - {"id": 7, "orth": "oven", "ner": "O"}, - {"id": 8, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - { - "tokens": [ - {"id": 9, "orth": "\n", "ner": "O"}, - {"id": 10, "orth": "I", "ner": "O"}, - {"id": 11, "orth": "'ve", "ner": "O"}, - {"id": 12, "orth": "heard", "ner": "O"}, - {"id": 13, "orth": "of", "ner": "O"}, - {"id": 14, "orth": "people", "ner": "O"}, - {"id": 15, "orth": "cooking", "ner": "O"}, - {"id": 16, "orth": "bacon", "ner": "O"}, - {"id": 17, "orth": "in", "ner": "O"}, - {"id": 18, "orth": "an", "ner": "O"}, - {"id": 19, "orth": "oven", "ner": "O"}, - {"id": 20, "orth": ".", "ner": "O"}, - ], - "brackets": [], - }, - ], - "cats": [ - {"label": "baking", "value": 1.0}, - {"label": "not_baking", "value": 0.0}, - ], - }, - { - "raw": "What is the difference between white and brown eggs?\n", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "What", "ner": "O"}, - {"id": 1, "orth": "is", "ner": "O"}, - {"id": 2, "orth": "the", "ner": "O"}, - {"id": 3, "orth": "difference", "ner": "O"}, - {"id": 4, "orth": "between", "ner": "O"}, - {"id": 5, "orth": "white", "ner": "O"}, - {"id": 6, "orth": "and", "ner": "O"}, - {"id": 7, "orth": "brown", "ner": "O"}, - {"id": 8, "orth": "eggs", "ner": "O"}, - {"id": 9, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, - ], - "cats": [ - {"label": "baking", "value": 0.0}, - {"label": "not_baking", "value": 1.0}, - ], - }, - ], - } - nlp = English() - attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] - with make_tempdir() as tmpdir: - output_file = tmpdir / "test4402.spacy" - docs = json_to_docs([json_data]) - data = DocBin(docs=docs, attrs=attrs).to_bytes() - with output_file.open("wb") as file_: - file_.write(data) - reader = Corpus(output_file) - train_data = list(reader(nlp)) - assert len(train_data) == 2 - - split_train_data = [] - for eg in train_data: - split_train_data.extend(eg.split_sents()) - assert len(split_train_data) == 4 diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py deleted file mode 100644 index 07a00d2b7..000000000 --- a/spacy/tests/regression/test_issue4501-5000.py +++ /dev/null @@ -1,266 +0,0 @@ -import pytest -from spacy.tokens import Doc, Span, DocBin -from spacy.training import Example -from spacy.training.converters.conllu_to_docs import conllu_to_docs -from spacy.lang.en import English -from spacy.kb import KnowledgeBase -from spacy.vocab import Vocab -from spacy.language import Language -from spacy.util import ensure_path, load_model_from_path -import numpy -import pickle -from thinc.api import NumpyOps, get_current_ops - -from ..util import make_tempdir - - -@pytest.mark.issue(4528) -def test_issue4528(en_vocab): - """Test that user_data is correctly serialized in DocBin.""" - doc = Doc(en_vocab, 
words=["hello", "world"]) - doc.user_data["foo"] = "bar" - # This is how extension attribute values are stored in the user data - doc.user_data[("._.", "foo", None, None)] = "bar" - doc_bin = DocBin(store_user_data=True) - doc_bin.add(doc) - doc_bin_bytes = doc_bin.to_bytes() - new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) - new_doc = list(new_doc_bin.get_docs(en_vocab))[0] - assert new_doc.user_data["foo"] == "bar" - assert new_doc.user_data[("._.", "foo", None, None)] == "bar" - - -@pytest.mark.parametrize( - "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] -) -def test_gold_misaligned(en_tokenizer, text, words): - doc = en_tokenizer(text) - Example.from_dict(doc, {"words": words}) - - -@pytest.mark.issue(4651) -def test_issue4651_with_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialized correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - specified. - """ - text = "Spacy is a python library for nlp" - nlp = English() - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) - ruler.add_patterns(patterns) - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - assert res == res_reloaded - - -@pytest.mark.issue(4651) -def test_issue4651_without_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialized correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - not specified. - """ - text = "Spacy is a python library for nlp" - nlp = English() - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - assert res == res_reloaded - - -@pytest.mark.issue(4665) -def test_issue4665(): - """ - conllu_to_docs should not raise an exception if the HEAD column contains an - underscore - """ - input_data = """ -1 [ _ PUNCT -LRB- _ _ punct _ _ -2 This _ DET DT _ _ det _ _ -3 killing _ NOUN NN _ _ nsubj _ _ -4 of _ ADP IN _ _ case _ _ -5 a _ DET DT _ _ det _ _ -6 respected _ ADJ JJ _ _ amod _ _ -7 cleric _ NOUN NN _ _ nmod _ _ -8 will _ AUX MD _ _ aux _ _ -9 be _ AUX VB _ _ aux _ _ -10 causing _ VERB VBG _ _ root _ _ -11 us _ PRON PRP _ _ iobj _ _ -12 trouble _ NOUN NN _ _ dobj _ _ -13 for _ ADP IN _ _ case _ _ -14 years _ NOUN NNS _ _ nmod _ _ -15 to _ PART TO _ _ mark _ _ -16 come _ VERB VB _ _ acl _ _ -17 . _ PUNCT . 
_ _ punct _ _ -18 ] _ PUNCT -RRB- _ _ punct _ _ -""" - conllu_to_docs(input_data) - - -@pytest.mark.issue(4674) -def test_issue4674(): - """Test that setting entities with overlapping identifiers does not mess up IO""" - nlp = English() - kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) - vector1 = [0.9, 1.1, 1.01] - vector2 = [1.8, 2.25, 2.01] - with pytest.warns(UserWarning): - kb.set_entities( - entity_list=["Q1", "Q1"], - freq_list=[32, 111], - vector_list=[vector1, vector2], - ) - assert kb.get_size_entities() == 1 - # dumping to file & loading back in - with make_tempdir() as d: - dir_path = ensure_path(d) - if not dir_path.exists(): - dir_path.mkdir() - file_path = dir_path / "kb" - kb.to_disk(str(file_path)) - kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3) - kb2.from_disk(str(file_path)) - assert kb2.get_size_entities() == 1 - - -@pytest.mark.skip(reason="API change: disable just disables, new exclude arg") -@pytest.mark.issue(4707) -def test_issue4707(): - """Tests that disabled component names are also excluded from nlp.from_disk - by default when loading a model. - """ - nlp = English() - nlp.add_pipe("sentencizer") - nlp.add_pipe("entity_ruler") - assert nlp.pipe_names == ["sentencizer", "entity_ruler"] - exclude = ["tokenizer", "sentencizer"] - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir, exclude=exclude) - new_nlp = load_model_from_path(tmpdir, disable=exclude) - assert "sentencizer" not in new_nlp.pipe_names - assert "entity_ruler" in new_nlp.pipe_names - - -@pytest.mark.issue(4725) -def test_issue4725_1(): - """Ensure the pickling of the NER goes well""" - vocab = Vocab(vectors_name="test_vocab_add_vector") - nlp = English(vocab=vocab) - config = { - "update_with_oracle_cut_size": 111, - } - ner = nlp.create_pipe("ner", config=config) - with make_tempdir() as tmp_path: - with (tmp_path / "ner.pkl").open("wb") as file_: - pickle.dump(ner, file_) - assert ner.cfg["update_with_oracle_cut_size"] == 111 - - with (tmp_path / "ner.pkl").open("rb") as file_: - ner2 = pickle.load(file_) - assert ner2.cfg["update_with_oracle_cut_size"] == 111 - - -@pytest.mark.issue(4725) -def test_issue4725_2(): - if isinstance(get_current_ops, NumpyOps): - # ensures that this runs correctly and doesn't hang or crash because of the global vectors - # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows), - # or because of issues with pickling the NER (cf test_issue4725_1) - vocab = Vocab(vectors_name="test_vocab_add_vector") - data = numpy.ndarray((5, 3), dtype="f") - data[0] = 1.0 - data[1] = 2.0 - vocab.set_vector("cat", data[0]) - vocab.set_vector("dog", data[1]) - nlp = English(vocab=vocab) - nlp.add_pipe("ner") - nlp.initialize() - docs = ["Kurt is in London."] * 10 - for _ in nlp.pipe(docs, batch_size=2, n_process=2): - pass - - -@pytest.mark.issue(4849) -def test_issue4849(): - nlp = English() - patterns = [ - {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, - {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, - ] - ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) - ruler.add_patterns(patterns) - text = """ - The left is starting to take aim at Democratic front-runner Joe Biden. - Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." 
- """ - # USING 1 PROCESS - count_ents = 0 - for doc in nlp.pipe([text], n_process=1): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 - # USING 2 PROCESSES - if isinstance(get_current_ops, NumpyOps): - count_ents = 0 - for doc in nlp.pipe([text], n_process=2): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 - - -@Language.factory("my_pipe") -class CustomPipe: - def __init__(self, nlp, name="my_pipe"): - self.name = name - Span.set_extension("my_ext", getter=self._get_my_ext) - Doc.set_extension("my_ext", default=None) - - def __call__(self, doc): - gathered_ext = [] - for sent in doc.sents: - sent_ext = self._get_my_ext(sent) - sent._.set("my_ext", sent_ext) - gathered_ext.append(sent_ext) - - doc._.set("my_ext", "\n".join(gathered_ext)) - return doc - - @staticmethod - def _get_my_ext(span): - return str(span.end) - - -@pytest.mark.issue(4903) -def test_issue4903(): - """Ensure that this runs correctly and doesn't hang or crash on Windows / - macOS.""" - nlp = English() - nlp.add_pipe("sentencizer") - nlp.add_pipe("my_pipe", after="sentencizer") - text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] - if isinstance(get_current_ops(), NumpyOps): - docs = list(nlp.pipe(text, n_process=2)) - assert docs[0].text == "I like bananas." - assert docs[1].text == "Do you like them?" - assert docs[2].text == "No, I prefer wasabi." - - -@pytest.mark.issue(4924) -def test_issue4924(): - nlp = Language() - example = Example.from_dict(nlp.make_doc(""), {}) - nlp.evaluate([example]) diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py deleted file mode 100644 index e1f5231e7..000000000 --- a/spacy/tests/regression/test_issue5001-5500.py +++ /dev/null @@ -1,149 +0,0 @@ -import numpy -from spacy.tokens import Doc, DocBin -from spacy.attrs import DEP, POS, TAG -from spacy.lang.en import English -from spacy.language import Language -from spacy.lang.en.syntax_iterators import noun_chunks -from spacy.vocab import Vocab -import spacy -from thinc.api import get_current_ops -import pytest - -from ...util import make_tempdir - - -@pytest.mark.issue(5048) -def test_issue5048(en_vocab): - words = ["This", "is", "a", "sentence"] - pos_s = ["DET", "VERB", "DET", "NOUN"] - spaces = [" ", " ", " ", ""] - deps_s = ["dep", "adj", "nn", "atm"] - tags_s = ["DT", "VBZ", "DT", "NN"] - strings = en_vocab.strings - for w in words: - strings.add(w) - deps = [strings.add(d) for d in deps_s] - pos = [strings.add(p) for p in pos_s] - tags = [strings.add(t) for t in tags_s] - attrs = [POS, DEP, TAG] - array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") - doc = Doc(en_vocab, words=words, spaces=spaces) - doc.from_array(attrs, array) - v1 = [(token.text, token.pos_, token.tag_) for token in doc] - doc2 = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) - v2 = [(token.text, token.pos_, token.tag_) for token in doc2] - assert v1 == v2 - - -@pytest.mark.issue(5082) -def test_issue5082(): - # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens - nlp = English() - vocab = nlp.vocab - array1 = numpy.asarray([0.1, 0.5, 0.8], dtype=numpy.float32) - array2 = numpy.asarray([-0.2, -0.6, -0.9], dtype=numpy.float32) - array3 = numpy.asarray([0.3, -0.1, 0.7], dtype=numpy.float32) - array4 = numpy.asarray([0.5, 0, 0.3], dtype=numpy.float32) - array34 = numpy.asarray([0.4, -0.05, 0.5], dtype=numpy.float32) - 
vocab.set_vector("I", array1) - vocab.set_vector("like", array2) - vocab.set_vector("David", array3) - vocab.set_vector("Bowie", array4) - text = "I like David Bowie" - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]} - ] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - parsed_vectors_1 = [t.vector for t in nlp(text)] - assert len(parsed_vectors_1) == 4 - ops = get_current_ops() - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[0]), array1) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[1]), array2) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[2]), array3) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[3]), array4) - nlp.add_pipe("merge_entities") - parsed_vectors_2 = [t.vector for t in nlp(text)] - assert len(parsed_vectors_2) == 3 - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[0]), array1) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[1]), array2) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34) - - -@pytest.mark.issue(5137) -def test_issue5137(): - factory_name = "test_issue5137" - pipe_name = "my_component" - - @Language.factory(factory_name) - class MyComponent: - def __init__(self, nlp, name=pipe_name, categories="all_categories"): - self.nlp = nlp - self.categories = categories - self.name = name - - def __call__(self, doc): - pass - - def to_disk(self, path, **kwargs): - pass - - def from_disk(self, path, **cfg): - pass - - nlp = English() - my_component = nlp.add_pipe(factory_name, name=pipe_name) - assert my_component.categories == "all_categories" - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir) - overrides = {"components": {pipe_name: {"categories": "my_categories"}}} - nlp2 = spacy.load(tmpdir, config=overrides) - assert nlp2.get_pipe(pipe_name).categories == "my_categories" - - -@pytest.mark.issue(5141) -def test_issue5141(en_vocab): - """Ensure an empty DocBin does not crash on serialization""" - doc_bin = DocBin(attrs=["DEP", "HEAD"]) - assert list(doc_bin.get_docs(en_vocab)) == [] - doc_bin_bytes = doc_bin.to_bytes() - doc_bin_2 = DocBin().from_bytes(doc_bin_bytes) - assert list(doc_bin_2.get_docs(en_vocab)) == [] - - -@pytest.mark.issue(5152) -def test_issue5152(): - # Test that the comparison between a Span and a Token, goes well - # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) 
- nlp = English() - text = nlp("Talk about being boring!") - text_var = nlp("Talk of being boring!") - y = nlp("Let") - span = text[0:3] # Talk about being - span_2 = text[0:3] # Talk about being - span_3 = text_var[0:3] # Talk of being - token = y[0] # Let - with pytest.warns(UserWarning): - assert span.similarity(token) == 0.0 - assert span.similarity(span_2) == 1.0 - with pytest.warns(UserWarning): - assert span_2.similarity(span_3) < 1.0 - - -@pytest.mark.issue(5458) -def test_issue5458(): - # Test that the noun chuncker does not generate overlapping spans - # fmt: off - words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] - vocab = Vocab(strings=words) - deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] - pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] - heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] - # fmt: on - en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps) - en_doc.noun_chunks_iterator = noun_chunks - - # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" - nlp = English() - merge_nps = nlp.create_pipe("merge_noun_chunks") - merge_nps(en_doc) diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py deleted file mode 100644 index 87c40ec2a..000000000 --- a/spacy/tests/regression/test_issue5501-6000.py +++ /dev/null @@ -1,95 +0,0 @@ -import pytest -from numpy.testing import assert_almost_equal -from thinc.api import Config, fix_random_seed, get_current_ops - -from spacy.lang.en import English -from spacy.pipeline.textcat import single_label_default_config, single_label_bow_config -from spacy.pipeline.textcat import single_label_cnn_config -from spacy.pipeline.textcat_multilabel import multi_label_default_config -from spacy.pipeline.textcat_multilabel import multi_label_bow_config -from spacy.pipeline.textcat_multilabel import multi_label_cnn_config -from spacy.tokens import Span -from spacy import displacy -from spacy.pipeline import merge_entities -from spacy.training import Example - - -@pytest.mark.parametrize( - "textcat_config", - [ - single_label_default_config, - single_label_bow_config, - single_label_cnn_config, - multi_label_default_config, - multi_label_bow_config, - multi_label_cnn_config, - ], -) -@pytest.mark.issue(5551) -def test_issue5551(textcat_config): - """Test that after fixing the random seed, the results of the pipeline are truly identical""" - component = "textcat" - - pipe_cfg = Config().from_str(textcat_config) - results = [] - for i in range(3): - fix_random_seed(0) - nlp = English() - text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g." 
- annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}} - pipe = nlp.add_pipe(component, config=pipe_cfg, last=True) - for label in set(annots["cats"]): - pipe.add_label(label) - # Train - nlp.initialize() - doc = nlp.make_doc(text) - nlp.update([Example.from_dict(doc, annots)]) - # Store the result of each iteration - result = pipe.model.predict([doc]) - results.append(result[0]) - # All results should be the same because of the fixed seed - assert len(results) == 3 - ops = get_current_ops() - assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5) - assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5) - - -@pytest.mark.issue(5838) -def test_issue5838(): - # Displacy's EntityRenderer break line - # not working after last entity - sample_text = "First line\nSecond line, with ent\nThird line\nFourth line\n" - nlp = English() - doc = nlp(sample_text) - doc.ents = [Span(doc, 7, 8, label="test")] - html = displacy.render(doc, style="ent") - found = html.count("
") - assert found == 4 - - -@pytest.mark.issue(5918) -def test_issue5918(): - # Test edge case when merging entities. - nlp = English() - ruler = nlp.add_pipe("entity_ruler") - patterns = [ - {"label": "ORG", "pattern": "Digicon Inc"}, - {"label": "ORG", "pattern": "Rotan Mosle Inc's"}, - {"label": "ORG", "pattern": "Rotan Mosle Technology Partners Ltd"}, - ] - ruler.add_patterns(patterns) - - text = """ - Digicon Inc said it has completed the previously-announced disposition - of its computer systems division to an investment group led by - Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate. - """ - doc = nlp(text) - assert len(doc.ents) == 3 - # make it so that the third span's head is within the entity (ent_iob=I) - # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents. - # TODO: test for logging here - # with pytest.warns(UserWarning): - # doc[29].head = doc[33] - doc = merge_entities(doc) - assert len(doc.ents) == 3 diff --git a/spacy/tests/regression/test_issue6001-6500.py b/spacy/tests/regression/test_issue6001-6500.py deleted file mode 100644 index cb27d39e4..000000000 --- a/spacy/tests/regression/test_issue6001-6500.py +++ /dev/null @@ -1,30 +0,0 @@ -from spacy.util import filter_spans -from pydantic import ValidationError -from spacy.schemas import TokenPattern, TokenPatternSchema -import pytest - - -@pytest.mark.issue(6207) -def test_issue6207(en_tokenizer): - doc = en_tokenizer("zero one two three four five six") - - # Make spans - s1 = doc[:4] - s2 = doc[3:6] # overlaps with s1 - s3 = doc[5:7] # overlaps with s2, not s1 - - result = filter_spans((s1, s2, s3)) - assert s1 in result - assert s2 not in result - assert s3 in result - - -@pytest.mark.issue(6258) -def test_issue6258(): - """Test that the non-empty constraint pattern field is respected""" - # These one is valid - TokenPatternSchema(pattern=[TokenPattern()]) - # But an empty pattern list should fail to validate - # based on the schema's constraint - with pytest.raises(ValidationError): - TokenPatternSchema(pattern=[]) diff --git a/spacy/tests/regression/test_issue6501-7000.py b/spacy/tests/regression/test_issue6501-7000.py deleted file mode 100644 index 84517d79b..000000000 --- a/spacy/tests/regression/test_issue6501-7000.py +++ /dev/null @@ -1,238 +0,0 @@ -import pytest -from spacy.lang.en import English -import numpy as np -import spacy -from spacy.tokens import Doc -from spacy.matcher import PhraseMatcher -from spacy.tokens import DocBin -from spacy.util import load_config_from_str -from spacy.training import Example -from spacy.training.initialize import init_nlp -import pickle - -from ..util import make_tempdir - - -@pytest.mark.issue(6730) -def test_issue6730(en_vocab): - """Ensure that the KB does not accept empty strings, but otherwise IO works fine.""" - from spacy.kb import KnowledgeBase - - kb = KnowledgeBase(en_vocab, entity_vector_length=3) - kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3]) - - with pytest.raises(ValueError): - kb.add_alias(alias="", entities=["1"], probabilities=[0.4]) - assert kb.contains_alias("") is False - - kb.add_alias(alias="x", entities=["1"], probabilities=[0.2]) - kb.add_alias(alias="y", entities=["1"], probabilities=[0.1]) - - with make_tempdir() as tmp_dir: - kb.to_disk(tmp_dir) - kb.from_disk(tmp_dir) - assert kb.get_size_aliases() == 2 - assert set(kb.get_alias_strings()) == {"x", "y"} - - -@pytest.mark.issue(6755) -def test_issue6755(en_tokenizer): - doc = en_tokenizer("This is a magnificent 
sentence.") - span = doc[:0] - assert span.text_with_ws == "" - assert span.text == "" - - -@pytest.mark.parametrize( - "sentence, start_idx,end_idx,label", - [("Welcome to Mumbai, my friend", 11, 17, "GPE")], -) -@pytest.mark.issue(6815) -def test_issue6815_1(sentence, start_idx, end_idx, label): - nlp = English() - doc = nlp(sentence) - span = doc[:].char_span(start_idx, end_idx, label=label) - assert span.label_ == label - - -@pytest.mark.parametrize( - "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)] -) -@pytest.mark.issue(6815) -def test_issue6815_2(sentence, start_idx, end_idx, kb_id): - nlp = English() - doc = nlp(sentence) - span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id) - assert span.kb_id == kb_id - - -@pytest.mark.parametrize( - "sentence, start_idx,end_idx,vector", - [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))], -) -@pytest.mark.issue(6815) -def test_issue6815_3(sentence, start_idx, end_idx, vector): - nlp = English() - doc = nlp(sentence) - span = doc[:].char_span(start_idx, end_idx, vector=vector) - assert (span.vector == vector).all() - - -@pytest.mark.issue(6839) -def test_issue6839(en_vocab): - """Ensure that PhraseMatcher accepts Span as input""" - # fmt: off - words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."] - # fmt: on - doc = Doc(en_vocab, words=words) - span = doc[:8] - pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) - matcher = PhraseMatcher(en_vocab) - matcher.add("SPACY", [pattern]) - matches = matcher(span) - assert matches - - -CONFIG_ISSUE_6908 = """ -[paths] -train = "TRAIN_PLACEHOLDER" -raw = null -init_tok2vec = null -vectors = null - -[system] -seed = 0 -gpu_allocator = null - -[nlp] -lang = "en" -pipeline = ["textcat"] -tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} -disabled = [] -before_creation = null -after_creation = null -after_pipeline_creation = null -batch_size = 1000 - -[components] - -[components.textcat] -factory = "TEXTCAT_PLACEHOLDER" - -[corpora] - -[corpora.train] -@readers = "spacy.Corpus.v1" -path = ${paths:train} - -[corpora.dev] -@readers = "spacy.Corpus.v1" -path = ${paths:train} - - -[training] -train_corpus = "corpora.train" -dev_corpus = "corpora.dev" -seed = ${system.seed} -gpu_allocator = ${system.gpu_allocator} -frozen_components = [] -before_to_disk = null - -[pretraining] - -[initialize] -vectors = ${paths.vectors} -init_tok2vec = ${paths.init_tok2vec} -vocab_data = null -lookups = null -before_init = null -after_init = null - -[initialize.components] - -[initialize.components.textcat] -labels = ['label1', 'label2'] - -[initialize.tokenizer] -""" - - -@pytest.mark.parametrize( - "component_name", - ["textcat", "textcat_multilabel"], -) -@pytest.mark.issue(6908) -def test_issue6908(component_name): - """Test intializing textcat with labels in a list""" - - def create_data(out_file): - nlp = spacy.blank("en") - doc = nlp.make_doc("Some text") - doc.cats = {"label1": 0, "label2": 1} - out_data = DocBin(docs=[doc]).to_bytes() - with out_file.open("wb") as file_: - file_.write(out_data) - - with make_tempdir() as tmp_path: - train_path = tmp_path / "train.spacy" - create_data(train_path) - config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name) - config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix()) - config = load_config_from_str(config_str) - init_nlp(config) - - -CONFIG_ISSUE_6950 = """ -[nlp] -lang = "en" -pipeline = ["tok2vec", "tagger"] - -[components] - 
-[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -rows = [5000,2500,2500,2500] -include_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 - -[components.ner] -factory = "ner" - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} -upstream = "*" -""" - - -@pytest.mark.issue(6950) -def test_issue6950(): - """Test that the nlp object with initialized tok2vec with listeners pickles - correctly (and doesn't have lambdas). - """ - nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950)) - nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})]) - pickle.dumps(nlp) - nlp("hello") - pickle.dumps(nlp) diff --git a/spacy/tests/regression/test_issue7001-8000.py b/spacy/tests/regression/test_issue7001-8000.py deleted file mode 100644 index 1164e85b9..000000000 --- a/spacy/tests/regression/test_issue7001-8000.py +++ /dev/null @@ -1,288 +0,0 @@ -import pytest -from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type -from spacy.lang.en import English -from spacy.training import Example -from spacy.tokens.doc import Doc -from spacy.vocab import Vocab -from spacy.kb import KnowledgeBase -from spacy.pipeline._parser_internals.arc_eager import ArcEager -from spacy.util import load_config_from_str, load_config -from spacy.cli.init_config import fill_config -from thinc.api import Config -from wasabi import msg - -from ..util import make_tempdir - - -@pytest.mark.issue(7019) -def test_issue7019(): - scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} - print_textcats_auc_per_cat(msg, scores) - scores = { - "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932}, - "LABEL_B": {"p": None, "r": None, "f": None}, - } - print_prf_per_type(msg, scores, name="foo", type="bar") - - -CONFIG_7029 = """ -[nlp] -lang = "en" -pipeline = ["tok2vec", "tagger"] - -[components] - -[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -rows = [5000,2500,2500,2500] -include_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} -upstream = "*" -""" - - -@pytest.mark.issue(7029) -def test_issue7029(): - """Test that an empty document doesn't mess up an entire batch.""" - TRAIN_DATA = [ - ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), - ("Eat blue ham", {"tags": ["V", "J", "N"]}), - ] - nlp = English.from_config(load_config_from_str(CONFIG_7029)) - train_examples = [] - for t in TRAIN_DATA: - 
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(50): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] - docs1 = list(nlp.pipe(texts, batch_size=1)) - docs2 = list(nlp.pipe(texts, batch_size=4)) - assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] - - -@pytest.mark.issue(7055) -def test_issue7055(): - """Test that fill-config doesn't turn sourced components into factories.""" - source_cfg = { - "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]}, - "components": { - "tok2vec": {"factory": "tok2vec"}, - "tagger": {"factory": "tagger"}, - }, - } - source_nlp = English.from_config(source_cfg) - with make_tempdir() as dir_path: - # We need to create a loadable source pipeline - source_path = dir_path / "test_model" - source_nlp.to_disk(source_path) - base_cfg = { - "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, - "components": { - "tok2vec": {"source": str(source_path)}, - "tagger": {"source": str(source_path)}, - "ner": {"factory": "ner"}, - }, - } - base_cfg = Config(base_cfg) - base_path = dir_path / "base.cfg" - base_cfg.to_disk(base_path) - output_path = dir_path / "config.cfg" - fill_config(output_path, base_path, silent=True) - filled_cfg = load_config(output_path) - assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path) - assert filled_cfg["components"]["tagger"]["source"] == str(source_path) - assert filled_cfg["components"]["ner"]["factory"] == "ner" - assert "model" in filled_cfg["components"]["ner"] - - -@pytest.mark.issue(7056) -def test_issue7056(): - """Test that the Unshift transition works properly, and doesn't cause - sentence segmentation errors.""" - vocab = Vocab() - ae = ArcEager( - vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) - ) - doc = Doc(vocab, words="Severe pain , after trauma".split()) - state = ae.init_batch([doc])[0] - ae.apply_transition(state, "S") - ae.apply_transition(state, "L-amod") - ae.apply_transition(state, "S") - ae.apply_transition(state, "S") - ae.apply_transition(state, "S") - ae.apply_transition(state, "R-pobj") - ae.apply_transition(state, "D") - ae.apply_transition(state, "D") - ae.apply_transition(state, "D") - assert not state.eol() - - -def test_partial_links(): - # Test that having some entities on the doc without gold links, doesn't crash - TRAIN_DATA = [ - ( - "Russ Cochran his reprints include EC Comics.", - { - "links": {(0, 12): {"Q2146908": 1.0}}, - "entities": [(0, 12, "PERSON")], - "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0], - }, - ) - ] - nlp = English() - vector_length = 3 - train_examples = [] - for text, annotation in TRAIN_DATA: - doc = nlp(text) - train_examples.append(Example.from_dict(doc, annotation)) - - def create_kb(vocab): - # create artificial KB - mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) - mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) - return mykb - - # Create and train the Entity Linker - entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - - # adding additional components that are required for the entity_linker - 
nlp.add_pipe("sentencizer", first=True) - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, - {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}, - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - - # this will run the pipeline on the examples and shouldn't crash - results = nlp.evaluate(train_examples) - assert "PERSON" in results["ents_per_type"] - assert "PERSON" in results["nel_f_per_type"] - assert "ORG" in results["ents_per_type"] - assert "ORG" not in results["nel_f_per_type"] - - -@pytest.mark.issue(7065) -def test_issue7065(): - text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." - nlp = English() - nlp.add_pipe("sentencizer") - ruler = nlp.add_pipe("entity_ruler") - patterns = [ - { - "label": "THING", - "pattern": [ - {"LOWER": "symphony"}, - {"LOWER": "no"}, - {"LOWER": "."}, - {"LOWER": "8"}, - ], - } - ] - ruler.add_patterns(patterns) - - doc = nlp(text) - sentences = [s for s in doc.sents] - assert len(sentences) == 2 - sent0 = sentences[0] - ent = doc.ents[0] - assert ent.start < sent0.end < ent.end - assert sentences.index(ent.sent) == 0 - - -@pytest.mark.issue(7065) -def test_issue7065_b(): - # Test that the NEL doesn't crash when an entity crosses a sentence boundary - nlp = English() - vector_length = 3 - nlp.add_pipe("sentencizer") - text = "Mahler 's Symphony No. 8 was beautiful." - entities = [(0, 6, "PERSON"), (10, 24, "WORK")] - links = { - (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, - (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, - } - sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] - doc = nlp(text) - example = Example.from_dict( - doc, {"entities": entities, "links": links, "sent_starts": sent_starts} - ) - train_examples = [example] - - def create_kb(vocab): - # create artificial KB - mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) - mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7]) - mykb.add_alias( - alias="No. 
8", - entities=["Q270853"], - probabilities=[1.0], - ) - mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias( - alias="Mahler", - entities=["Q7304"], - probabilities=[1.0], - ) - return mykb - - # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) - # train the NEL pipe - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - - # Add a custom rule-based component to mimick NER - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, - { - "label": "WORK", - "pattern": [ - {"LOWER": "symphony"}, - {"LOWER": "no"}, - {"LOWER": "."}, - {"LOWER": "8"}, - ], - }, - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - # test the trained model - this should not throw E148 - doc = nlp(text) - assert doc diff --git a/spacy/tests/regression/test_issue7716.py b/spacy/tests/regression/test_issue7716.py deleted file mode 100644 index d9b3967ff..000000000 --- a/spacy/tests/regression/test_issue7716.py +++ /dev/null @@ -1,55 +0,0 @@ -import pytest -from thinc.api import Adam -from spacy.attrs import NORM -from spacy.vocab import Vocab -from spacy import registry -from spacy.training import Example -from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL -from spacy.tokens import Doc -from spacy.pipeline import DependencyParser - - -@pytest.fixture -def vocab(): - return Vocab(lex_attr_getters={NORM: lambda s: s}) - - -def _parser_example(parser): - doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} - return Example.from_dict(doc, gold) - - -@pytest.fixture -def parser(vocab): - vocab.strings.add("ROOT") - cfg = {"model": DEFAULT_PARSER_MODEL} - model = registry.resolve(cfg, validate=True)["model"] - parser = DependencyParser(vocab, model) - parser.cfg["token_vector_width"] = 4 - parser.cfg["hidden_width"] = 32 - # parser.add_label('right') - parser.add_label("left") - parser.initialize(lambda: [_parser_example(parser)]) - sgd = Adam(0.001) - - for i in range(10): - losses = {} - doc = Doc(vocab, words=["a", "b", "c", "d"]) - example = Example.from_dict( - doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} - ) - parser.update([example], sgd=sgd, losses=losses) - return parser - - -@pytest.mark.issue(7716) -@pytest.mark.xfail(reason="Not fixed yet") -def test_partial_annotation(parser): - doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - doc[2].is_sent_start = False - # Note that if the following line is used, then doc[2].is_sent_start == False - # doc[3].is_sent_start = False - - doc = parser(doc) - assert doc[2].is_sent_start == False diff --git a/spacy/tests/regression/test_issue8168.py b/spacy/tests/regression/test_issue8168.py deleted file mode 100644 index e3f3b5cfa..000000000 --- a/spacy/tests/regression/test_issue8168.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest -from spacy.lang.en import English - - -@pytest.mark.issue(8168) -def test_issue8168(): - nlp = English() - ruler = nlp.add_pipe("entity_ruler") - patterns = [ - {"label": "ORG", "pattern": "Apple"}, - { - "label": "GPE", - "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], - "id": "san-francisco", - }, - { - "label": "GPE", - "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], - "id": "san-francisco", - }, - ] - 
ruler.add_patterns(patterns) - - assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")} diff --git a/spacy/tests/regression/test_issue8190.py b/spacy/tests/regression/test_issue8190.py deleted file mode 100644 index 0b2f2824b..000000000 --- a/spacy/tests/regression/test_issue8190.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest - -import spacy -from spacy.lang.en import English -from ..util import make_tempdir - - -@pytest.mark.issue(8190) -def test_issue8190(): - """Test that config overrides are not lost after load is complete.""" - source_cfg = { - "nlp": { - "lang": "en", - }, - "custom": {"key": "value"}, - } - source_nlp = English.from_config(source_cfg) - with make_tempdir() as dir_path: - # We need to create a loadable source pipeline - source_path = dir_path / "test_model" - source_nlp.to_disk(source_path) - nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}}) - - assert nlp.config["custom"]["key"] == "updated_value" diff --git a/spacy/tests/regression/test_issue8216.py b/spacy/tests/regression/test_issue8216.py deleted file mode 100644 index 0370074fe..000000000 --- a/spacy/tests/regression/test_issue8216.py +++ /dev/null @@ -1,34 +0,0 @@ -import pytest - -from spacy import registry -from spacy.language import Language - - -@pytest.fixture -def nlp(): - return Language() - - -@pytest.fixture -@registry.misc("entity_ruler_patterns") -def patterns(): - return [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"}, - ] - - -@pytest.mark.issue(8216) -def test_entity_ruler_fix8216(nlp, patterns): - """Test that patterns don't get added excessively.""" - ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) - ruler.add_patterns(patterns) - pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) - assert pattern_count > 0 - ruler.add_patterns([]) - after_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) - assert after_count == pattern_count diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 102989705..1d50fd1d1 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -1,20 +1,17 @@ import pytest -from thinc.api import Config, ConfigValidationError -import spacy -from spacy.lang.en import English -from spacy.lang.de import German -from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH -from spacy.util import ( - registry, - load_model_from_config, - load_config, - load_config_from_str, -) -from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model -from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder -from spacy.schemas import ConfigSchema, ConfigSchemaPretrain from catalogue import RegistryError +from thinc.api import Config, ConfigValidationError +import spacy +from spacy.lang.de import German +from spacy.lang.en import English +from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH +from spacy.language import Language +from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed +from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model +from spacy.schemas import ConfigSchema, ConfigSchemaPretrain +from 
spacy.util import load_config, load_config_from_str +from spacy.util import load_model_from_config, registry from ..util import make_tempdir @@ -187,6 +184,25 @@ def my_parser(): return parser +@pytest.mark.issue(8190) +def test_issue8190(): + """Test that config overrides are not lost after load is complete.""" + source_cfg = { + "nlp": { + "lang": "en", + }, + "custom": {"key": "value"}, + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}}) + + assert nlp.config["custom"]["key"] == "updated_value" + + def test_create_nlp_from_config(): config = Config().from_str(nlp_config_string) with pytest.raises(ConfigValidationError): diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 23afaf26c..15bf67bfd 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -1,13 +1,168 @@ -import pytest -from spacy.tokens.underscore import Underscore +import copy +import pickle -import spacy +import numpy +import pytest + +from spacy.attrs import DEP, HEAD from spacy.lang.en import English -from spacy.tokens import Doc, DocBin +from spacy.language import Language +from spacy.matcher import Matcher, PhraseMatcher +from spacy.tokens import Doc +from spacy.vectors import Vectors +from spacy.vocab import Vocab from ..util import make_tempdir +@pytest.mark.issue(1727) +def test_issue1727(): + """Test that models with no pretrained vectors can be deserialized + correctly after vectors are added.""" + nlp = Language(Vocab()) + data = numpy.ones((3, 300), dtype="f") + vectors = Vectors(data=data, keys=["I", "am", "Matt"]) + tagger = nlp.create_pipe("tagger") + tagger.add_label("PRP") + assert tagger.cfg.get("pretrained_dims", 0) == 0 + tagger.vocab.vectors = vectors + with make_tempdir() as path: + tagger.to_disk(path) + tagger = nlp.create_pipe("tagger").from_disk(path) + assert tagger.cfg.get("pretrained_dims", 0) == 0 + + +@pytest.mark.issue(1799) +def test_issue1799(): + """Test sentence boundaries are deserialized correctly, even for + non-projective sentences.""" + heads_deps = numpy.asarray( + [ + [1, 397], + [4, 436], + [2, 426], + [1, 402], + [0, 8206900633647566924], + [18446744073709551615, 440], + [18446744073709551614, 442], + ], + dtype="uint64", + ) + doc = Doc(Vocab(), words="Just what I was looking for .".split()) + doc.vocab.strings.add("ROOT") + doc = doc.from_array([HEAD, DEP], heads_deps) + assert len(list(doc.sents)) == 1 + + +@pytest.mark.issue(1834) +def test_issue1834(): + """Test that sentence boundaries & parse/tag flags are not lost + during serialization.""" + words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"] + doc = Doc(Vocab(), words=words) + doc[6].is_sent_start = True + new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) + assert new_doc[6].sent_start + assert not new_doc.has_annotation("DEP") + assert not new_doc.has_annotation("TAG") + doc = Doc( + Vocab(), + words=words, + tags=["TAG"] * len(words), + heads=[0, 0, 0, 0, 0, 0, 6, 6, 6], + deps=["dep"] * len(words), + ) + new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) + assert new_doc[6].sent_start + assert new_doc.has_annotation("DEP") + assert new_doc.has_annotation("TAG") + + +@pytest.mark.issue(1883) +def test_issue1883(): + matcher = Matcher(Vocab()) + 
matcher.add("pat1", [[{"orth": "hello"}]]) + doc = Doc(matcher.vocab, words=["hello"]) + assert len(matcher(doc)) == 1 + new_matcher = copy.deepcopy(matcher) + new_doc = Doc(new_matcher.vocab, words=["hello"]) + assert len(new_matcher(new_doc)) == 1 + + +@pytest.mark.issue(2564) +def test_issue2564(): + """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe.""" + nlp = Language() + tagger = nlp.add_pipe("tagger") + tagger.add_label("A") + nlp.initialize() + doc = nlp("hello world") + assert doc.has_annotation("TAG") + docs = nlp.pipe(["hello", "world"]) + piped_doc = next(docs) + assert piped_doc.has_annotation("TAG") + + +@pytest.mark.issue(3248) +def test_issue3248_2(): + """Test that the PhraseMatcher can be pickled correctly.""" + nlp = English() + matcher = PhraseMatcher(nlp.vocab) + matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) + matcher.add("TEST2", [nlp("d")]) + data = pickle.dumps(matcher) + new_matcher = pickle.loads(data) + assert len(new_matcher) == len(matcher) + + +@pytest.mark.issue(3289) +def test_issue3289(): + """Test that Language.to_bytes handles serializing a pipeline component + with an uninitialized model.""" + nlp = English() + nlp.add_pipe("textcat") + bytes_data = nlp.to_bytes() + new_nlp = English() + new_nlp.add_pipe("textcat") + new_nlp.from_bytes(bytes_data) + + +@pytest.mark.issue(3468) +def test_issue3468(): + """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can + be restored after serialization.""" + nlp = English() + nlp.add_pipe("sentencizer") + doc = nlp("Hello world") + assert doc[0].is_sent_start + assert doc.has_annotation("SENT_START") + assert len(list(doc.sents)) == 1 + doc_bytes = doc.to_bytes() + new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) + assert new_doc[0].is_sent_start + assert new_doc.has_annotation("SENT_START") + assert len(list(new_doc.sents)) == 1 + + +@pytest.mark.issue(3959) +def test_issue3959(): + """Ensure that a modified pos attribute is serialized correctly.""" + nlp = English() + doc = nlp( + "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" + ) + assert doc[0].pos_ == "" + doc[0].pos_ = "NOUN" + assert doc[0].pos_ == "NOUN" + # usually this is already True when starting from proper models instead of blank English + with make_tempdir() as tmp_dir: + file_path = tmp_dir / "my_doc" + doc.to_disk(file_path) + doc2 = nlp("") + doc2.from_disk(file_path) + assert doc2[0].pos_ == "NOUN" + + def test_serialize_empty_doc(en_vocab): doc = Doc(en_vocab) data = doc.to_bytes() @@ -61,69 +216,3 @@ def test_serialize_doc_span_groups(en_vocab): doc.spans["content"] = [doc[0:2]] new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) assert len(new_doc.spans["content"]) == 1 - - -def test_serialize_doc_bin(): - doc_bin = DocBin( - attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True - ) - texts = ["Some text", "Lots of texts...", "..."] - cats = {"A": 0.5} - nlp = English() - for doc in nlp.pipe(texts): - doc.cats = cats - doc.spans["start"] = [doc[0:2]] - doc[0].norm_ = "UNUSUAL_TOKEN_NORM" - doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID" - doc_bin.add(doc) - bytes_data = doc_bin.to_bytes() - - # Deserialize later, e.g. 
in a new process - nlp = spacy.blank("en") - doc_bin = DocBin().from_bytes(bytes_data) - reloaded_docs = list(doc_bin.get_docs(nlp.vocab)) - for i, doc in enumerate(reloaded_docs): - assert doc.text == texts[i] - assert doc.cats == cats - assert len(doc.spans) == 1 - assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM" - assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID" - - -def test_serialize_doc_bin_unknown_spaces(en_vocab): - doc1 = Doc(en_vocab, words=["that", "'s"]) - assert doc1.has_unknown_spaces - assert doc1.text == "that 's " - doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False]) - assert not doc2.has_unknown_spaces - assert doc2.text == "that's" - - doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes()) - re_doc1, re_doc2 = doc_bin.get_docs(en_vocab) - assert re_doc1.has_unknown_spaces - assert re_doc1.text == "that 's " - assert not re_doc2.has_unknown_spaces - assert re_doc2.text == "that's" - - -@pytest.mark.parametrize( - "writer_flag,reader_flag,reader_value", - [ - (True, True, "bar"), - (True, False, "bar"), - (False, True, "nothing"), - (False, False, "nothing"), - ], -) -def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value): - """Test that custom extensions are correctly serialized in DocBin.""" - Doc.set_extension("foo", default="nothing") - doc = Doc(en_vocab, words=["hello", "world"]) - doc._.foo = "bar" - doc_bin_1 = DocBin(store_user_data=writer_flag) - doc_bin_1.add(doc) - doc_bin_bytes = doc_bin_1.to_bytes() - doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes) - doc_2 = list(doc_bin_2.get_docs(en_vocab))[0] - assert doc_2._.foo == reader_value - Underscore.doc_extensions = {} diff --git a/spacy/tests/serialize/test_serialize_docbin.py b/spacy/tests/serialize/test_serialize_docbin.py new file mode 100644 index 000000000..9f8e5e06b --- /dev/null +++ b/spacy/tests/serialize/test_serialize_docbin.py @@ -0,0 +1,106 @@ +import pytest + +import spacy +from spacy.lang.en import English +from spacy.tokens import Doc, DocBin +from spacy.tokens.underscore import Underscore + + +@pytest.mark.issue(4367) +def test_issue4367(): + """Test that docbin init goes well""" + DocBin() + DocBin(attrs=["LEMMA"]) + DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) + + +@pytest.mark.issue(4528) +def test_issue4528(en_vocab): + """Test that user_data is correctly serialized in DocBin.""" + doc = Doc(en_vocab, words=["hello", "world"]) + doc.user_data["foo"] = "bar" + # This is how extension attribute values are stored in the user data + doc.user_data[("._.", "foo", None, None)] = "bar" + doc_bin = DocBin(store_user_data=True) + doc_bin.add(doc) + doc_bin_bytes = doc_bin.to_bytes() + new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) + new_doc = list(new_doc_bin.get_docs(en_vocab))[0] + assert new_doc.user_data["foo"] == "bar" + assert new_doc.user_data[("._.", "foo", None, None)] == "bar" + + +@pytest.mark.issue(5141) +def test_issue5141(en_vocab): + """Ensure an empty DocBin does not crash on serialization""" + doc_bin = DocBin(attrs=["DEP", "HEAD"]) + assert list(doc_bin.get_docs(en_vocab)) == [] + doc_bin_bytes = doc_bin.to_bytes() + doc_bin_2 = DocBin().from_bytes(doc_bin_bytes) + assert list(doc_bin_2.get_docs(en_vocab)) == [] + + +def test_serialize_doc_bin(): + doc_bin = DocBin( + attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True + ) + texts = ["Some text", "Lots of texts...", "..."] + cats = {"A": 0.5} + nlp = English() + for doc in nlp.pipe(texts): + doc.cats = cats + 
doc.spans["start"] = [doc[0:2]] + doc[0].norm_ = "UNUSUAL_TOKEN_NORM" + doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID" + doc_bin.add(doc) + bytes_data = doc_bin.to_bytes() + + # Deserialize later, e.g. in a new process + nlp = spacy.blank("en") + doc_bin = DocBin().from_bytes(bytes_data) + reloaded_docs = list(doc_bin.get_docs(nlp.vocab)) + for i, doc in enumerate(reloaded_docs): + assert doc.text == texts[i] + assert doc.cats == cats + assert len(doc.spans) == 1 + assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM" + assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID" + + +def test_serialize_doc_bin_unknown_spaces(en_vocab): + doc1 = Doc(en_vocab, words=["that", "'s"]) + assert doc1.has_unknown_spaces + assert doc1.text == "that 's " + doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False]) + assert not doc2.has_unknown_spaces + assert doc2.text == "that's" + + doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes()) + re_doc1, re_doc2 = doc_bin.get_docs(en_vocab) + assert re_doc1.has_unknown_spaces + assert re_doc1.text == "that 's " + assert not re_doc2.has_unknown_spaces + assert re_doc2.text == "that's" + + +@pytest.mark.parametrize( + "writer_flag,reader_flag,reader_value", + [ + (True, True, "bar"), + (True, False, "bar"), + (False, True, "nothing"), + (False, False, "nothing"), + ], +) +def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value): + """Test that custom extensions are correctly serialized in DocBin.""" + Doc.set_extension("foo", default="nothing") + doc = Doc(en_vocab, words=["hello", "world"]) + doc._.foo = "bar" + doc_bin_1 = DocBin(store_user_data=writer_flag) + doc_bin_1.add(doc) + doc_bin_bytes = doc_bin_1.to_bytes() + doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes) + doc_2 = list(doc_bin_2.get_docs(en_vocab))[0] + assert doc_2._.foo == reader_value + Underscore.doc_extensions = {} diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py index 05529f9d1..6e7fa0e4e 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -1,8 +1,14 @@ -import pytest import re +import pickle + +import pytest from spacy.language import Language +from spacy.lang.it import Italian +from spacy.lang.en import English from spacy.tokenizer import Tokenizer +from spacy.training import Example +from spacy.util import load_config_from_str from ..util import make_tempdir @@ -21,6 +27,71 @@ def meta_data(): } +@pytest.mark.issue(2482) +def test_issue2482(): + """Test we can serialize and deserialize a blank NER or parser model.""" + nlp = Italian() + nlp.add_pipe("ner") + b = nlp.to_bytes() + Italian().from_bytes(b) + + +CONFIG_ISSUE_6950 = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.ner] +factory = "ner" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = 
"spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +@pytest.mark.issue(6950) +def test_issue6950(): + """Test that the nlp object with initialized tok2vec with listeners pickles + correctly (and doesn't have lambdas). + """ + nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950)) + nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})]) + pickle.dumps(nlp) + nlp("hello") + pickle.dumps(nlp) + + def test_serialize_language_meta_disk(meta_data): language = Language(meta=meta_data) with make_tempdir() as d: diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index eebf72638..9fcf18e2d 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,18 +1,25 @@ +import pickle + import pytest -from spacy import registry, Vocab, load -from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer -from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe +import srsly +from thinc.api import Linear + +import spacy +from spacy import Vocab, load, registry +from spacy.lang.en import English +from spacy.language import Language +from spacy.pipeline import DependencyParser, EntityRecognizer, EntityRuler +from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer +from spacy.pipeline import TrainablePipe from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL +from spacy.pipeline.senter import DEFAULT_SENTER_MODEL from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL -from spacy.pipeline.senter import DEFAULT_SENTER_MODEL -from spacy.lang.en import English -from thinc.api import Linear -import spacy +from spacy.util import ensure_path, load_model +from spacy.tokens import Span from ..util import make_tempdir - test_parsers = [DependencyParser, EntityRecognizer] @@ -58,6 +65,181 @@ def taggers(en_vocab): return tagger1, tagger2 +@pytest.mark.issue(3456) +def test_issue3456(): + # this crashed because of a padding error in layer.ops.unflatten in thinc + nlp = English() + tagger = nlp.add_pipe("tagger") + tagger.add_label("A") + nlp.initialize() + list(nlp.pipe(["hi", ""])) + + +@pytest.mark.issue(3526) +def test_issue_3526_1(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + ruler_bytes = ruler.to_bytes() + assert len(ruler) == len(patterns) + assert len(ruler.labels) == 4 + assert ruler.overwrite + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(ruler_bytes) + assert len(new_ruler) == len(ruler) + assert len(new_ruler.labels) == 4 + assert new_ruler.overwrite == ruler.overwrite + assert new_ruler.ent_id_sep == ruler.ent_id_sep + + +@pytest.mark.issue(3526) +def test_issue_3526_2(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = 
Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + bytes_old_style = srsly.msgpack_dumps(ruler.patterns) + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(bytes_old_style) + assert len(new_ruler) == len(ruler) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert new_ruler.overwrite is not ruler.overwrite + + +@pytest.mark.issue(3526) +def test_issue_3526_3(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + with make_tempdir() as tmpdir: + out_file = tmpdir / "entity_ruler" + srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) + new_ruler = EntityRuler(nlp).from_disk(out_file) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert len(new_ruler) == len(ruler) + assert new_ruler.overwrite is not ruler.overwrite + + +@pytest.mark.issue(3526) +def test_issue_3526_4(en_vocab): + nlp = Language(vocab=en_vocab) + patterns = [{"label": "ORG", "pattern": "Apple"}] + config = {"overwrite_ents": True} + ruler = nlp.add_pipe("entity_ruler", config=config) + ruler.add_patterns(patterns) + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + ruler = nlp.get_pipe("entity_ruler") + assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert ruler.overwrite is True + nlp2 = load(tmpdir) + new_ruler = nlp2.get_pipe("entity_ruler") + assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert new_ruler.overwrite is True + + +@pytest.mark.issue(4042) +def test_issue4042(): + """Test that serialization of an EntityRuler before NER works fine.""" + nlp = English() + # add ner pipe + ner = nlp.add_pipe("ner") + ner.add_label("SOME_LABEL") + nlp.initialize() + # Add entity ruler + patterns = [ + {"label": "MY_ORG", "pattern": "Apple"}, + {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, + ] + # works fine with "after" + ruler = nlp.add_pipe("entity_ruler", before="ner") + ruler.add_patterns(patterns) + doc1 = nlp("What do you think about Apple ?") + assert doc1.ents[0].label_ == "MY_ORG" + + with make_tempdir() as d: + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + nlp2 = load_model(output_dir) + doc2 = nlp2("What do you think about Apple ?") + assert doc2.ents[0].label_ == "MY_ORG" + + +@pytest.mark.issue(4042) +def test_issue4042_bug2(): + """ + Test that serialization of an NER works fine when new labels were added. + This is the second bug of two bugs underlying the issue 4042. + """ + nlp1 = English() + # add ner pipe + ner1 = nlp1.add_pipe("ner") + ner1.add_label("SOME_LABEL") + nlp1.initialize() + # add a new label to the doc + doc1 = nlp1("What do you think about Apple ?") + assert len(ner1.labels) == 1 + assert "SOME_LABEL" in ner1.labels + apple_ent = Span(doc1, 5, 6, label="MY_ORG") + doc1.ents = list(doc1.ents) + [apple_ent] + # Add the label explicitly. Previously we didn't require this. 
+ ner1.add_label("MY_ORG") + ner1(doc1) + assert len(ner1.labels) == 2 + assert "SOME_LABEL" in ner1.labels + assert "MY_ORG" in ner1.labels + with make_tempdir() as d: + # assert IO goes fine + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + ner1.to_disk(output_dir) + config = {} + ner2 = nlp1.create_pipe("ner", config=config) + ner2.from_disk(output_dir) + assert len(ner2.labels) == 2 + + +@pytest.mark.issue(4725) +def test_issue4725_1(): + """Ensure the pickling of the NER goes well""" + vocab = Vocab(vectors_name="test_vocab_add_vector") + nlp = English(vocab=vocab) + config = { + "update_with_oracle_cut_size": 111, + } + ner = nlp.create_pipe("ner", config=config) + with make_tempdir() as tmp_path: + with (tmp_path / "ner.pkl").open("wb") as file_: + pickle.dump(ner, file_) + assert ner.cfg["update_with_oracle_cut_size"] == 111 + + with (tmp_path / "ner.pkl").open("rb") as file_: + ner2 = pickle.load(file_) + assert ner2.cfg["update_with_oracle_cut_size"] == 111 + + @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): cfg = {"model": DEFAULT_PARSER_MODEL} diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index a9450cd04..e271f7707 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -1,9 +1,16 @@ -import pytest +import pickle import re -from spacy.util import get_lang_class -from spacy.tokenizer import Tokenizer -from ..util import make_tempdir, assert_packed_msg_equal +import pytest + +from spacy.attrs import ENT_IOB, ENT_TYPE +from spacy.lang.en import English +from spacy.tokenizer import Tokenizer +from spacy.tokens import Doc +from spacy.util import compile_infix_regex, compile_prefix_regex +from spacy.util import compile_suffix_regex, get_lang_class, load_model + +from ..util import assert_packed_msg_equal, make_tempdir def load_tokenizer(b): @@ -12,6 +19,79 @@ def load_tokenizer(b): return tok +@pytest.mark.issue(2833) +def test_issue2833(en_vocab): + """Test that a custom error is raised if a token or span is pickled.""" + doc = Doc(en_vocab, words=["Hello", "world"]) + with pytest.raises(NotImplementedError): + pickle.dumps(doc[0]) + with pytest.raises(NotImplementedError): + pickle.dumps(doc[0:2]) + + +@pytest.mark.issue(3012) +def test_issue3012(en_vocab): + """Test that the is_tagged attribute doesn't get overwritten when we from_array + without tag information.""" + words = ["This", "is", "10", "%", "."] + tags = ["DT", "VBZ", "CD", "NN", "."] + pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] + ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"] + doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) + assert doc.has_annotation("TAG") + expected = ("10", "NUM", "CD", "PERCENT") + assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected + header = [ENT_IOB, ENT_TYPE] + ent_array = doc.to_array(header) + doc.from_array(header, ent_array) + assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected + # Serializing then deserializing + doc_bytes = doc.to_bytes() + doc2 = Doc(en_vocab).from_bytes(doc_bytes) + assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected + + +@pytest.mark.issue(4190) +def test_issue4190(): + def customize_tokenizer(nlp): + prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) + suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) + infix_re = 
compile_infix_regex(nlp.Defaults.infixes) + # Remove all exceptions where a single letter is followed by a period (e.g. 'h.') + exceptions = { + k: v + for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() + if not (len(k) == 2 and k[1] == ".") + } + new_tokenizer = Tokenizer( + nlp.vocab, + exceptions, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=nlp.tokenizer.token_match, + ) + nlp.tokenizer = new_tokenizer + + test_string = "Test c." + # Load default language + nlp_1 = English() + doc_1a = nlp_1(test_string) + result_1a = [token.text for token in doc_1a] # noqa: F841 + # Modify tokenizer + customize_tokenizer(nlp_1) + doc_1b = nlp_1(test_string) + result_1b = [token.text for token in doc_1b] + # Save and Reload + with make_tempdir() as model_dir: + nlp_1.to_disk(model_dir) + nlp_2 = load_model(model_dir) + # This should be the modified tokenizer + doc_2 = nlp_2(test_string) + result_2 = [token.text for token in doc_2] + assert result_1b == result_2 + + def test_serialize_custom_tokenizer(en_vocab, en_tokenizer): """Test that custom tokenizer with not all functions defined or empty properties can be serialized and deserialized correctly (see #2494, diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index ab403ab54..fd80c3d8e 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -1,17 +1,71 @@ -import pytest import pickle + +import pytest from thinc.api import get_current_ops -from spacy.vocab import Vocab + +import spacy +from spacy.lang.en import English from spacy.strings import StringStore +from spacy.tokens import Doc +from spacy.util import ensure_path, load_model from spacy.vectors import Vectors +from spacy.vocab import Vocab from ..util import make_tempdir - test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] test_strings_attrs = [(["rats", "are", "cute"], "Hello")] +@pytest.mark.issue(599) +def test_issue599(en_vocab): + doc = Doc(en_vocab) + doc2 = Doc(doc.vocab) + doc2.from_bytes(doc.to_bytes()) + assert doc2.has_annotation("DEP") + + +@pytest.mark.issue(4054) +def test_issue4054(en_vocab): + """Test that a new blank model can be made with a vocab from file, + and that serialization does not drop the language at any point.""" + nlp1 = English() + vocab1 = nlp1.vocab + with make_tempdir() as d: + vocab_dir = ensure_path(d / "vocab") + if not vocab_dir.exists(): + vocab_dir.mkdir() + vocab1.to_disk(vocab_dir) + vocab2 = Vocab().from_disk(vocab_dir) + nlp2 = spacy.blank("en", vocab=vocab2) + nlp_dir = ensure_path(d / "nlp") + if not nlp_dir.exists(): + nlp_dir.mkdir() + nlp2.to_disk(nlp_dir) + nlp3 = load_model(nlp_dir) + assert nlp3.lang == "en" + + +@pytest.mark.issue(4133) +def test_issue4133(en_vocab): + nlp = English() + vocab_bytes = nlp.vocab.to_bytes() + words = ["Apple", "is", "looking", "at", "buying", "a", "startup"] + pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] + doc = Doc(en_vocab, words=words) + for i, token in enumerate(doc): + token.pos_ = pos[i] + # usually this is already True when starting from proper models instead of blank English + doc_bytes = doc.to_bytes() + vocab = Vocab() + vocab = vocab.from_bytes(vocab_bytes) + doc = Doc(vocab).from_bytes(doc_bytes) + actual = [] + for token in doc: + actual.append(token.pos_) + assert actual == pos + + @pytest.mark.parametrize("text", ["rat"]) def 
test_serialize_vocab(en_vocab, text): text_hash = en_vocab.strings.add(text) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 3243d426b..ec512b839 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,28 +1,111 @@ -import pytest -from click import NoSuchOption -from packaging.specifiers import SpecifierSet -from spacy.training import docs_to_json, offsets_to_biluo_tags -from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs -from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate -from spacy.lang.nl import Dutch -from spacy.util import ENV_VARS, load_model_from_config -from spacy.cli import info -from spacy.cli.init_config import init_config, RECOMMENDATIONS -from spacy.cli._util import validate_project_commands, parse_config_overrides -from spacy.cli._util import load_project_config, substitute_project_variables -from spacy.cli._util import is_subpath_of -from spacy.cli._util import string_to_list -from spacy import about -from spacy.util import get_minor_version -from spacy.cli.validate import get_model_pkgs -from spacy.cli.download import get_compatibility, get_version -from spacy.cli.package import get_third_party_dependencies -from thinc.api import ConfigValidationError, Config -import srsly import os -from .util import make_tempdir +import pytest +import srsly +from click import NoSuchOption +from packaging.specifiers import SpecifierSet +from thinc.api import Config, ConfigValidationError + +from spacy import about +from spacy.cli import info +from spacy.cli._util import is_subpath_of, load_project_config +from spacy.cli._util import parse_config_overrides, string_to_list +from spacy.cli._util import substitute_project_variables +from spacy.cli._util import validate_project_commands +from spacy.cli.debug_data import _compile_gold, _get_labels_from_model +from spacy.cli.debug_data import _get_labels_from_spancat +from spacy.cli.download import get_compatibility, get_version +from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config +from spacy.cli.package import get_third_party_dependencies +from spacy.cli.package import _is_permitted_package_name +from spacy.cli.validate import get_model_pkgs +from spacy.lang.en import English +from spacy.lang.nl import Dutch +from spacy.language import Language +from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate +from spacy.tokens import Doc +from spacy.training import Example, docs_to_json, offsets_to_biluo_tags +from spacy.training.converters import conll_ner_to_docs, conllu_to_docs +from spacy.training.converters import iob_to_docs +from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config + from ..cli.init_pipeline import _init_labels +from .util import make_tempdir + + +@pytest.mark.issue(4665) +def test_cli_converters_conllu_empty_heads_ner(): + """ + conllu_to_docs should not raise an exception if the HEAD column contains an + underscore + """ + input_data = """ +1 [ _ PUNCT -LRB- _ _ punct _ _ +2 This _ DET DT _ _ det _ _ +3 killing _ NOUN NN _ _ nsubj _ _ +4 of _ ADP IN _ _ case _ _ +5 a _ DET DT _ _ det _ _ +6 respected _ ADJ JJ _ _ amod _ _ +7 cleric _ NOUN NN _ _ nmod _ _ +8 will _ AUX MD _ _ aux _ _ +9 be _ AUX VB _ _ aux _ _ +10 causing _ VERB VBG _ _ root _ _ +11 us _ PRON PRP _ _ iobj _ _ +12 trouble _ NOUN NN _ _ dobj _ _ +13 for _ ADP IN _ _ case _ _ +14 years _ NOUN NNS _ _ nmod _ _ +15 to _ PART TO _ _ mark _ _ +16 come _ VERB VB _ _ acl _ _ +17 . _ PUNCT . 
_ _ punct _ _ +18 ] _ PUNCT -RRB- _ _ punct _ _ +""" + docs = list(conllu_to_docs(input_data)) + # heads are all 0 + assert not all([t.head.i for t in docs[0]]) + # NER is unset + assert not docs[0].has_annotation("ENT_IOB") + + +@pytest.mark.issue(4924) +def test_issue4924(): + nlp = Language() + example = Example.from_dict(nlp.make_doc(""), {}) + nlp.evaluate([example]) + + +@pytest.mark.issue(7055) +def test_issue7055(): + """Test that fill-config doesn't turn sourced components into factories.""" + source_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]}, + "components": { + "tok2vec": {"factory": "tok2vec"}, + "tagger": {"factory": "tagger"}, + }, + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + base_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, + "components": { + "tok2vec": {"source": str(source_path)}, + "tagger": {"source": str(source_path)}, + "ner": {"factory": "ner"}, + }, + } + base_cfg = Config(base_cfg) + base_path = dir_path / "base.cfg" + base_cfg.to_disk(base_path) + output_path = dir_path / "config.cfg" + fill_config(output_path, base_path, silent=True) + filled_cfg = load_config(output_path) + assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path) + assert filled_cfg["components"]["tagger"]["source"] == str(source_path) + assert filled_cfg["components"]["ner"]["factory"] == "ner" + assert "model" in filled_cfg["components"]["ner"] def test_cli_info(): @@ -565,7 +648,16 @@ def test_get_third_party_dependencies(): } }, ) - get_third_party_dependencies(nlp.config) == [] + assert get_third_party_dependencies(nlp.config) == [] + + # Test with lang-specific factory + @Dutch.factory("third_party_test") + def test_factory(nlp, name): + return lambda x: x + + nlp.add_pipe("third_party_test") + # Before #9674 this would throw an exception + get_third_party_dependencies(nlp.config) @pytest.mark.parametrize( @@ -581,3 +673,64 @@ def test_get_third_party_dependencies(): ) def test_is_subpath_of(parent, child, expected): assert is_subpath_of(parent, child) == expected + + +@pytest.mark.slow +@pytest.mark.parametrize( + "factory_name,pipe_name", + [ + ("ner", "ner"), + ("ner", "my_ner"), + ("spancat", "spancat"), + ("spancat", "my_spancat"), + ], +) +def test_get_labels_from_model(factory_name, pipe_name): + labels = ("A", "B") + + nlp = English() + pipe = nlp.add_pipe(factory_name, name=pipe_name) + for label in labels: + pipe.add_label(label) + nlp.initialize() + assert nlp.get_pipe(pipe_name).labels == labels + if factory_name == "spancat": + assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels) + else: + assert _get_labels_from_model(nlp, factory_name) == set(labels) + + +def test_permitted_package_names(): + # https://www.python.org/dev/peps/pep-0426/#name + assert _is_permitted_package_name("Meine_Bäume") == False + assert _is_permitted_package_name("_package") == False + assert _is_permitted_package_name("package_") == False + assert _is_permitted_package_name(".package") == False + assert _is_permitted_package_name("package.") == False + assert _is_permitted_package_name("-package") == False + assert _is_permitted_package_name("package-") == False + + +def test_debug_data_compile_gold(): + nlp = English() + pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"]) + ref = Doc( + nlp.vocab, + words=["Token", ".", "New York City"], + 
sent_starts=[True, False, True],
+        ents=["O", "O", "B-ENT"],
+    )
+    eg = Example(pred, ref)
+    data = _compile_gold([eg], ["ner"], nlp, True)
+    assert data["boundary_cross_ents"] == 0
+
+    pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
+    ref = Doc(
+        nlp.vocab,
+        words=["Token", ".", "New York City"],
+        sent_starts=[True, False, True],
+        ents=["O", "B-ENT", "I-ENT"],
+    )
+    eg = Example(pred, ref)
+    data = _compile_gold([eg], ["ner"], nlp, True)
+    assert data["boundary_cross_ents"] == 1
diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py
index 040dd657f..392c95e42 100644
--- a/spacy/tests/test_displacy.py
+++ b/spacy/tests/test_displacy.py
@@ -1,8 +1,99 @@
+import numpy
 import pytest
+
 from spacy import displacy
 from spacy.displacy.render import DependencyRenderer, EntityRenderer
-from spacy.tokens import Span, Doc
+from spacy.lang.en import English
 from spacy.lang.fa import Persian
+from spacy.tokens import Span, Doc
+
+
+@pytest.mark.issue(2361)
+def test_issue2361(de_vocab):
+    """Test if < is escaped when rendering"""
+    chars = ("&lt;", "&gt;", "&amp;", "&quot;")
+    words = ["<", ">", "&", '"']
+    doc = Doc(de_vocab, words=words, deps=["dep"] * len(words))
+    html = displacy.render(doc)
+    for char in chars:
+        assert char in html
+
+
+@pytest.mark.issue(2728)
+def test_issue2728(en_vocab):
+    """Test that displaCy ENT visualizer escapes HTML correctly."""
+    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
+    doc.ents = [Span(doc, 0, 1, label="TEST")]
+    html = displacy.render(doc, style="ent")
+    assert "&lt;RELEASE&gt;" in html
+    doc.ents = [Span(doc, 1, 2, label="TEST")]
+    html = displacy.render(doc, style="ent")
+    assert "&lt;RELEASE&gt;" in html
+
+
+@pytest.mark.issue(3288)
+def test_issue3288(en_vocab):
+    """Test that retokenization works correctly via displaCy when punctuation
+    is merged onto the preceeding token and tensor is resized."""
+    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
+    heads = [1, 1, 1, 4, 4, 6, 4, 4]
+    deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
+    displacy.render(doc)
+
+
+@pytest.mark.issue(3531)
+def test_issue3531():
+    """Test that displaCy renderer doesn't require "settings" key."""
+    example_dep = {
+        "words": [
+            {"text": "But", "tag": "CCONJ"},
+            {"text": "Google", "tag": "PROPN"},
+            {"text": "is", "tag": "VERB"},
+            {"text": "starting", "tag": "VERB"},
+            {"text": "from", "tag": "ADP"},
+            {"text": "behind.", "tag": "ADV"},
+        ],
+        "arcs": [
+            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
+            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
+            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
+            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
+            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
+        ],
+    }
+    example_ent = {
+        "text": "But Google is starting from behind.",
+        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
+    }
+    dep_html = displacy.render(example_dep, style="dep", manual=True)
+    assert dep_html
+    ent_html = displacy.render(example_ent, style="ent", manual=True)
+    assert ent_html
+
+
+@pytest.mark.issue(3882)
+def test_issue3882(en_vocab):
+    """Test that displaCy doesn't serialize the doc.user_data when making a
+    copy of the Doc.
+    """
+    doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"])
+    doc.user_data["test"] = set()
+    displacy.parse_deps(doc)
+
+
+@pytest.mark.issue(5838)
+def test_issue5838():
+    # Displacy's EntityRenderer break line
+    # not working after last entity
+    sample_text = "First line\nSecond line, with ent\nThird line\nFourth line\n"
+    nlp = English()
+    doc = nlp(sample_text)
+    doc.ents = [Span(doc, 7, 8, label="test")]
+    html = displacy.render(doc, style="ent")
+    found = html.count("</br>
") + assert found == 4 def test_displacy_parse_ents(en_vocab): @@ -12,7 +103,38 @@ def test_displacy_parse_ents(en_vocab): ents = displacy.parse_ents(doc) assert isinstance(ents, dict) assert ents["text"] == "But Google is starting from behind " - assert ents["ents"] == [{"start": 4, "end": 10, "label": "ORG"}] + assert ents["ents"] == [ + {"start": 4, "end": 10, "label": "ORG", "kb_id": "", "kb_url": "#"} + ] + + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")] + ents = displacy.parse_ents(doc) + assert isinstance(ents, dict) + assert ents["text"] == "But Google is starting from behind " + assert ents["ents"] == [ + {"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95", "kb_url": "#"} + ] + + +def test_displacy_parse_ents_with_kb_id_options(en_vocab): + """Test that named entities with kb_id on a Doc are converted into displaCy's format.""" + doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")] + + ents = displacy.parse_ents( + doc, {"kb_url_template": "https://www.wikidata.org/wiki/{}"} + ) + assert isinstance(ents, dict) + assert ents["text"] == "But Google is starting from behind " + assert ents["ents"] == [ + { + "start": 4, + "end": 10, + "label": "ORG", + "kb_id": "Q95", + "kb_url": "https://www.wikidata.org/wiki/Q95", + } + ] def test_displacy_parse_deps(en_vocab): diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index f17d5e62e..d8743d322 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -15,7 +15,8 @@ from spacy.training.batchers import minibatch_by_words from spacy.lang.en import English from spacy.lang.nl import Dutch from spacy.language import DEFAULT_CONFIG_PATH -from spacy.schemas import ConfigSchemaTraining +from spacy.schemas import ConfigSchemaTraining, TokenPattern, TokenPatternSchema +from pydantic import ValidationError from thinc.api import get_current_ops, NumpyOps, CupyOps @@ -33,6 +34,32 @@ def is_admin(): return admin +@pytest.mark.issue(6207) +def test_issue6207(en_tokenizer): + doc = en_tokenizer("zero one two three four five six") + + # Make spans + s1 = doc[:4] + s2 = doc[3:6] # overlaps with s1 + s3 = doc[5:7] # overlaps with s2, not s1 + + result = util.filter_spans((s1, s2, s3)) + assert s1 in result + assert s2 not in result + assert s3 in result + + +@pytest.mark.issue(6258) +def test_issue6258(): + """Test that the non-empty constraint pattern field is respected""" + # These one is valid + TokenPatternSchema(pattern=[TokenPattern()]) + # But an empty pattern list should fail to validate + # based on the schema's constraint + with pytest.raises(ValidationError): + TokenPatternSchema(pattern=[]) + + @pytest.mark.parametrize("text", ["hello/world", "hello world"]) def test_util_ensure_path_succeeds(text): path = util.ensure_path(text) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 452bcc079..a7270cb1e 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -1,9 +1,284 @@ -import pytest import re -from spacy.vocab import Vocab -from spacy.tokenizer import Tokenizer -from spacy.util import ensure_path, compile_prefix_regex, compile_suffix_regex + +import numpy +import pytest + from spacy.lang.en import English +from spacy.lang.de import German +from spacy.tokenizer import Tokenizer +from spacy.tokens import Doc +from spacy.training import Example +from spacy.util import compile_prefix_regex, 
compile_suffix_regex, ensure_path +from spacy.util import compile_infix_regex +from spacy.vocab import Vocab +from spacy.symbols import ORTH + + +@pytest.mark.issue(743) +def test_issue743(): + doc = Doc(Vocab(), ["hello", "world"]) + token = doc[0] + s = set([token]) + items = list(s) + assert items[0] is token + + +@pytest.mark.issue(801) +@pytest.mark.skip( + reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218" +) +@pytest.mark.parametrize( + "text,tokens", + [ + ('"deserve,"--and', ['"', "deserve", ',"--', "and"]), + ("exception;--exclusive", ["exception", ";--", "exclusive"]), + ("day.--Is", ["day", ".--", "Is"]), + ("refinement:--just", ["refinement", ":--", "just"]), + ("memories?--To", ["memories", "?--", "To"]), + ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]), + ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]), + ], +) +def test_issue801(en_tokenizer, text, tokens): + """Test that special characters + hyphens are split correctly.""" + doc = en_tokenizer(text) + assert len(doc) == len(tokens) + assert [t.text for t in doc] == tokens + + +@pytest.mark.issue(1061) +def test_issue1061(): + """Test special-case works after tokenizing. Was caching problem.""" + text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_." + tokenizer = English().tokenizer + doc = tokenizer(text) + assert "MATH" in [w.text for w in doc] + assert "_MATH_" not in [w.text for w in doc] + + tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}]) + doc = tokenizer(text) + assert "_MATH_" in [w.text for w in doc] + assert "MATH" not in [w.text for w in doc] + + # For sanity, check it works when pipeline is clean. + tokenizer = English().tokenizer + tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}]) + doc = tokenizer(text) + assert "_MATH_" in [w.text for w in doc] + assert "MATH" not in [w.text for w in doc] + + +@pytest.mark.issue(1963) +def test_issue1963(en_tokenizer): + """Test that doc.merge() resizes doc.tensor""" + doc = en_tokenizer("a b c d") + doc.tensor = numpy.ones((len(doc), 128), dtype="f") + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[0:2]) + assert len(doc) == 3 + assert doc.tensor.shape == (3, 128) + + +@pytest.mark.skip( + reason="Can not be fixed without variable-width look-behind (which we don't want)" +) +@pytest.mark.issue(1235) +def test_issue1235(): + """Test that g is not split of if preceded by a number and a letter""" + nlp = English() + testwords = "e2g 2g 52g" + doc = nlp(testwords) + assert len(doc) == 5 + assert doc[0].text == "e2g" + assert doc[1].text == "2" + assert doc[2].text == "g" + assert doc[3].text == "52" + assert doc[4].text == "g" + + +@pytest.mark.issue(1242) +def test_issue1242(): + nlp = English() + doc = nlp("") + assert len(doc) == 0 + docs = list(nlp.pipe(["", "hello"])) + assert len(docs[0]) == 0 + assert len(docs[1]) == 1 + + +@pytest.mark.issue(1257) +def test_issue1257(): + """Test that tokens compare correctly.""" + doc1 = Doc(Vocab(), words=["a", "b", "c"]) + doc2 = Doc(Vocab(), words=["a", "c", "e"]) + assert doc1[0] != doc2[0] + assert not doc1[0] == doc2[0] + + +@pytest.mark.issue(1375) +def test_issue1375(): + """Test that token.nbor() raises IndexError for out-of-bounds access.""" + doc = Doc(Vocab(), words=["0", "1", "2"]) + with pytest.raises(IndexError): + assert doc[0].nbor(-1) + assert doc[1].nbor(-1).text == "0" + with pytest.raises(IndexError): + assert doc[2].nbor(1) + assert doc[1].nbor(1).text == "2" + + +@pytest.mark.issue(1488) +def 
test_issue1488(): + """Test that tokenizer can parse DOT inside non-whitespace separators""" + prefix_re = re.compile(r"""[\[\("']""") + suffix_re = re.compile(r"""[\]\)"']""") + infix_re = re.compile(r"""[-~\.]""") + simple_url_re = re.compile(r"""^https?://""") + + def my_tokenizer(nlp): + return Tokenizer( + nlp.vocab, + {}, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=simple_url_re.match, + ) + + nlp = English() + nlp.tokenizer = my_tokenizer(nlp) + doc = nlp("This is a test.") + for token in doc: + assert token.text + + +@pytest.mark.issue(1494) +def test_issue1494(): + """Test if infix_finditer works correctly""" + infix_re = re.compile(r"""[^a-z]""") + test_cases = [ + ("token 123test", ["token", "1", "2", "3", "test"]), + ("token 1test", ["token", "1test"]), + ("hello...test", ["hello", ".", ".", ".", "test"]), + ] + + def new_tokenizer(nlp): + return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer) + + nlp = English() + nlp.tokenizer = new_tokenizer(nlp) + for text, expected in test_cases: + assert [token.text for token in nlp(text)] == expected + + +@pytest.mark.skip( + reason="Can not be fixed without iterative looping between prefix/suffix and infix" +) +@pytest.mark.issue(2070) +def test_issue2070(): + """Test that checks that a dot followed by a quote is handled + appropriately. + """ + # Problem: The dot is now properly split off, but the prefix/suffix rules + # are not applied again afterwards. This means that the quote will still be + # attached to the remaining token. + nlp = English() + doc = nlp('First sentence."A quoted sentence" he said ...') + assert len(doc) == 11 + + +@pytest.mark.issue(2926) +def test_issue2926(fr_tokenizer): + """Test that the tokenizer correctly splits tokens separated by a slash (/) + ending in a digit. 
+ """ + doc = fr_tokenizer("Learn html5/css3/javascript/jquery") + assert len(doc) == 8 + assert doc[0].text == "Learn" + assert doc[1].text == "html5" + assert doc[2].text == "/" + assert doc[3].text == "css3" + assert doc[4].text == "/" + assert doc[5].text == "javascript" + assert doc[6].text == "/" + assert doc[7].text == "jquery" + + +@pytest.mark.parametrize( + "text", + [ + "ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume", + "oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:", + ], +) +@pytest.mark.issue(2626) +def test_issue2626_2835(en_tokenizer, text): + """Check that sentence doesn't cause an infinite loop in the tokenizer.""" + doc = en_tokenizer(text) + assert doc + + +@pytest.mark.issue(2656) +def test_issue2656(en_tokenizer): + """Test that tokenizer correctly splits off punctuation after numbers with + decimal points. + """ + doc = en_tokenizer("I went for 40.3, and got home by 10.0.") + assert len(doc) == 11 + assert doc[0].text == "I" + assert doc[1].text == "went" + assert doc[2].text == "for" + assert doc[3].text == "40.3" + assert doc[4].text == "," + assert doc[5].text == "and" + assert doc[6].text == "got" + assert doc[7].text == "home" + assert doc[8].text == "by" + assert doc[9].text == "10.0" + assert doc[10].text == "." + + +@pytest.mark.issue(2754) +def test_issue2754(en_tokenizer): + """Test that words like 'a' and 'a.m.' 
don't get exceptional norm values.""" + a = en_tokenizer("a") + assert a[0].norm_ == "a" + am = en_tokenizer("am") + assert am[0].norm_ == "am" + + +@pytest.mark.issue(3002) +def test_issue3002(): + """Test that the tokenizer doesn't hang on a long list of dots""" + nlp = German() + doc = nlp( + "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl" + ) + assert len(doc) == 5 + + +@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot") +@pytest.mark.issue(3449) +def test_issue3449(): + nlp = English() + nlp.add_pipe("sentencizer") + text1 = "He gave the ball to I. Do you want to go to the movies with I?" + text2 = "He gave the ball to I. Do you want to go to the movies with I?" + text3 = "He gave the ball to I.\nDo you want to go to the movies with I?" + t1 = nlp(text1) + t2 = nlp(text2) + t3 = nlp(text3) + assert t1[5].text == "I" + assert t2[5].text == "I" + assert t3[5].text == "I" + + +@pytest.mark.parametrize( + "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] +) +def test_gold_misaligned(en_tokenizer, text, words): + doc = en_tokenizer(text) + Example.from_dict(doc, {"words": words}) def test_tokenizer_handles_no_word(tokenizer): @@ -229,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab): assert tokens == ["a", "10", "."] explain_tokens = [t[1] for t in tokenizer.explain("a10.")] assert tokens == explain_tokens + + +def test_tokenizer_infix_prefix(en_vocab): + # the prefix and suffix matches overlap in the suffix lookbehind + infixes = ["±"] + suffixes = ["%"] + infix_re = compile_infix_regex(infixes) + suffix_re = compile_suffix_regex(suffixes) + tokenizer = Tokenizer( + en_vocab, + infix_finditer=infix_re.finditer, + suffix_search=suffix_re.search, + ) + tokens = [t.text for t in tokenizer("±10%")] + assert tokens == ["±10", "%"] + explain_tokens = [t[1] for t in tokenizer.explain("±10%")] + assert tokens == explain_tokens diff --git a/spacy/tests/training/test_augmenters.py b/spacy/tests/training/test_augmenters.py index 43a78e4b0..e3639c5da 100644 --- a/spacy/tests/training/test_augmenters.py +++ b/spacy/tests/training/test_augmenters.py @@ -1,9 +1,11 @@ import pytest -from spacy.training import Corpus +from spacy.pipeline._parser_internals.nonproj import contains_cycle +from spacy.training import Corpus, Example from spacy.training.augment import create_orth_variants_augmenter from spacy.training.augment import create_lower_casing_augmenter +from spacy.training.augment import make_whitespace_variant from spacy.lang.en import English -from spacy.tokens import DocBin, Doc +from spacy.tokens import DocBin, Doc, Span from contextlib import contextmanager import random @@ -153,3 +155,84 @@ def test_custom_data_augmentation(nlp, doc): ents = [(e.start, e.end, e.label) for e in doc.ents] assert [(e.start, e.end, e.label) for e in corpus[0].reference.ents] == ents assert [(e.start, e.end, e.label) for e in corpus[1].reference.ents] == ents + + +def test_make_whitespace_variant(nlp): + # fmt: off + text = "They flew to New York City.\nThen they drove to Washington, D.C." 
+ words = ["They", "flew", "to", "New", "York", "City", ".", "\n", "Then", "they", "drove", "to", "Washington", ",", "D.C."] + spaces = [True, True, True, True, True, False, False, False, True, True, True, True, False, True, False] + tags = ["PRP", "VBD", "IN", "NNP", "NNP", "NNP", ".", "_SP", "RB", "PRP", "VBD", "IN", "NNP", ",", "NNP"] + lemmas = ["they", "fly", "to", "New", "York", "City", ".", "\n", "then", "they", "drive", "to", "Washington", ",", "D.C."] + heads = [1, 1, 1, 4, 5, 2, 1, 10, 10, 10, 10, 10, 11, 12, 12] + deps = ["nsubj", "ROOT", "prep", "compound", "compound", "pobj", "punct", "dep", "advmod", "nsubj", "ROOT", "prep", "pobj", "punct", "appos"] + ents = ["O", "O", "O", "B-GPE", "I-GPE", "I-GPE", "O", "O", "O", "O", "O", "O", "B-GPE", "O", "B-GPE"] + # fmt: on + doc = Doc( + nlp.vocab, + words=words, + spaces=spaces, + tags=tags, + lemmas=lemmas, + heads=heads, + deps=deps, + ents=ents, + ) + assert doc.text == text + example = Example(nlp.make_doc(text), doc) + # whitespace is only added internally in entity spans + mod_ex = make_whitespace_variant(nlp, example, " ", 3) + assert mod_ex.reference.ents[0].text == "New York City" + mod_ex = make_whitespace_variant(nlp, example, " ", 4) + assert mod_ex.reference.ents[0].text == "New York City" + mod_ex = make_whitespace_variant(nlp, example, " ", 5) + assert mod_ex.reference.ents[0].text == "New York City" + mod_ex = make_whitespace_variant(nlp, example, " ", 6) + assert mod_ex.reference.ents[0].text == "New York City" + # add a space at every possible position + for i in range(len(doc) + 1): + mod_ex = make_whitespace_variant(nlp, example, " ", i) + assert mod_ex.reference[i].is_space + # adds annotation when the doc contains at least partial annotation + assert [t.tag_ for t in mod_ex.reference] == tags[:i] + ["_SP"] + tags[i:] + assert [t.lemma_ for t in mod_ex.reference] == lemmas[:i] + [" "] + lemmas[i:] + assert [t.dep_ for t in mod_ex.reference] == deps[:i] + ["dep"] + deps[i:] + # does not add partial annotation if doc does not contain this feature + assert not mod_ex.reference.has_annotation("POS") + assert not mod_ex.reference.has_annotation("MORPH") + # produces well-formed trees + assert not contains_cycle([t.head.i for t in mod_ex.reference]) + assert len(list(doc.sents)) == 2 + if i == 0: + assert mod_ex.reference[i].head.i == 1 + else: + assert mod_ex.reference[i].head.i == i - 1 + # adding another space also produces well-formed trees + for j in (3, 8, 10): + mod_ex2 = make_whitespace_variant(nlp, mod_ex, "\t\t\n", j) + assert not contains_cycle([t.head.i for t in mod_ex2.reference]) + assert len(list(doc.sents)) == 2 + assert mod_ex2.reference[j].head.i == j - 1 + # entities are well-formed + assert len(doc.ents) == len(mod_ex.reference.ents) + for ent in mod_ex.reference.ents: + assert not ent[0].is_space + assert not ent[-1].is_space + + # no modifications if: + # partial dependencies + example.reference[0].dep_ = "" + mod_ex = make_whitespace_variant(nlp, example, " ", 5) + assert mod_ex.text == example.reference.text + example.reference[0].dep_ = "nsubj" # reset + + # spans + example.reference.spans["spans"] = [example.reference[0:5]] + mod_ex = make_whitespace_variant(nlp, example, " ", 5) + assert mod_ex.text == example.reference.text + del example.reference.spans["spans"] # reset + + # links + example.reference.ents = [Span(doc, 0, 2, label="ENT", kb_id="Q123")] + mod_ex = make_whitespace_variant(nlp, example, " ", 5) + assert mod_ex.text == example.reference.text diff --git 
a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py index 4dd90f416..a39d40ded 100644 --- a/spacy/tests/training/test_new_example.py +++ b/spacy/tests/training/test_new_example.py @@ -421,3 +421,13 @@ def test_Example_missing_heads(): # Ensure that the missing head doesn't create an artificial new sentence start expected = [True, False, False, False, False, False] assert example.get_aligned_sent_starts() == expected + + +def test_Example_aligned_whitespace(en_vocab): + words = ["a", " ", "b"] + tags = ["A", "SPACE", "B"] + predicted = Doc(en_vocab, words=words) + reference = Doc(en_vocab, words=words, tags=tags) + + example = Example(predicted, reference) + assert example.get_aligned("TAG", as_string=True) == tags diff --git a/spacy/tests/training/test_rehearse.py b/spacy/tests/training/test_rehearse.py new file mode 100644 index 000000000..84c507702 --- /dev/null +++ b/spacy/tests/training/test_rehearse.py @@ -0,0 +1,211 @@ +import pytest +import spacy + +from typing import List +from spacy.training import Example + + +TRAIN_DATA = [ + ( + "Who is Kofi Annan?", + { + "entities": [(7, 18, "PERSON")], + "tags": ["PRON", "AUX", "PROPN", "PRON", "PUNCT"], + "heads": [1, 1, 3, 1, 1], + "deps": ["attr", "ROOT", "compound", "nsubj", "punct"], + "morphs": [ + "", + "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "Number=Sing", + "Number=Sing", + "PunctType=Peri", + ], + "cats": {"question": 1.0}, + }, + ), + ( + "Who is Steve Jobs?", + { + "entities": [(7, 17, "PERSON")], + "tags": ["PRON", "AUX", "PROPN", "PRON", "PUNCT"], + "heads": [1, 1, 3, 1, 1], + "deps": ["attr", "ROOT", "compound", "nsubj", "punct"], + "morphs": [ + "", + "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "Number=Sing", + "Number=Sing", + "PunctType=Peri", + ], + "cats": {"question": 1.0}, + }, + ), + ( + "Bob is a nice person.", + { + "entities": [(0, 3, "PERSON")], + "tags": ["PROPN", "AUX", "DET", "ADJ", "NOUN", "PUNCT"], + "heads": [1, 1, 4, 4, 1, 1], + "deps": ["nsubj", "ROOT", "det", "amod", "attr", "punct"], + "morphs": [ + "Number=Sing", + "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "Definite=Ind|PronType=Art", + "Degree=Pos", + "Number=Sing", + "PunctType=Peri", + ], + "cats": {"statement": 1.0}, + }, + ), + ( + "Hi Anil, how are you?", + { + "entities": [(3, 7, "PERSON")], + "tags": ["INTJ", "PROPN", "PUNCT", "ADV", "AUX", "PRON", "PUNCT"], + "deps": ["intj", "npadvmod", "punct", "advmod", "ROOT", "nsubj", "punct"], + "heads": [4, 0, 4, 4, 4, 4, 4], + "morphs": [ + "", + "Number=Sing", + "PunctType=Comm", + "", + "Mood=Ind|Tense=Pres|VerbForm=Fin", + "Case=Nom|Person=2|PronType=Prs", + "PunctType=Peri", + ], + "cats": {"greeting": 1.0, "question": 1.0}, + }, + ), + ( + "I like London and Berlin.", + { + "entities": [(7, 13, "LOC"), (18, 24, "LOC")], + "tags": ["PROPN", "VERB", "PROPN", "CCONJ", "PROPN", "PUNCT"], + "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], + "heads": [1, 1, 1, 2, 2, 1], + "morphs": [ + "Case=Nom|Number=Sing|Person=1|PronType=Prs", + "Tense=Pres|VerbForm=Fin", + "Number=Sing", + "ConjType=Cmp", + "Number=Sing", + "PunctType=Peri", + ], + "cats": {"statement": 1.0}, + }, + ), +] + +REHEARSE_DATA = [ + ( + "Hi Anil", + { + "entities": [(3, 7, "PERSON")], + "tags": ["INTJ", "PROPN"], + "deps": ["ROOT", "npadvmod"], + "heads": [0, 0], + "morphs": ["", "Number=Sing"], + "cats": {"greeting": 1.0}, + }, + ), + ( + "Hi Ravish, how you doing?", + { + "entities": [(3, 9, "PERSON")], + "tags": ["INTJ", "PROPN", "PUNCT", 
"ADV", "AUX", "PRON", "PUNCT"], + "deps": ["intj", "ROOT", "punct", "advmod", "nsubj", "advcl", "punct"], + "heads": [1, 1, 1, 5, 5, 1, 1], + "morphs": [ + "", + "VerbForm=Inf", + "PunctType=Comm", + "", + "Case=Nom|Person=2|PronType=Prs", + "Aspect=Prog|Tense=Pres|VerbForm=Part", + "PunctType=Peri", + ], + "cats": {"greeting": 1.0, "question": 1.0}, + }, + ), + # UTENSIL new label + ( + "Natasha bought new forks.", + { + "entities": [(0, 7, "PERSON"), (19, 24, "UTENSIL")], + "tags": ["PROPN", "VERB", "ADJ", "NOUN", "PUNCT"], + "deps": ["nsubj", "ROOT", "amod", "dobj", "punct"], + "heads": [1, 1, 3, 1, 1], + "morphs": [ + "Number=Sing", + "Tense=Past|VerbForm=Fin", + "Degree=Pos", + "Number=Plur", + "PunctType=Peri", + ], + "cats": {"statement": 1.0}, + }, + ), +] + + +def _add_ner_label(ner, data): + for _, annotations in data: + for ent in annotations["entities"]: + ner.add_label(ent[2]) + + +def _add_tagger_label(tagger, data): + for _, annotations in data: + for tag in annotations["tags"]: + tagger.add_label(tag) + + +def _add_parser_label(parser, data): + for _, annotations in data: + for dep in annotations["deps"]: + parser.add_label(dep) + + +def _add_textcat_label(textcat, data): + for _, annotations in data: + for cat in annotations["cats"]: + textcat.add_label(cat) + + +def _optimize(nlp, component: str, data: List, rehearse: bool): + """Run either train or rehearse.""" + pipe = nlp.get_pipe(component) + if component == "ner": + _add_ner_label(pipe, data) + elif component == "tagger": + _add_tagger_label(pipe, data) + elif component == "parser": + _add_tagger_label(pipe, data) + elif component == "textcat_multilabel": + _add_textcat_label(pipe, data) + else: + raise NotImplementedError + + if rehearse: + optimizer = nlp.resume_training() + else: + optimizer = nlp.initialize() + + for _ in range(5): + for text, annotation in data: + doc = nlp.make_doc(text) + example = Example.from_dict(doc, annotation) + if rehearse: + nlp.rehearse([example], sgd=optimizer) + else: + nlp.update([example], sgd=optimizer) + return nlp + + +@pytest.mark.parametrize("component", ["ner", "tagger", "parser", "textcat_multilabel"]) +def test_rehearse(component): + nlp = spacy.blank("en") + nlp.add_pipe(component) + nlp = _optimize(nlp, component, TRAIN_DATA, False) + _optimize(nlp, component, REHEARSE_DATA, True) diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 68f86190b..0d73300d8 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -1,15 +1,18 @@ +import random + import numpy -from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment -from spacy.training import biluo_tags_to_spans, iob_to_biluo -from spacy.training import Corpus, docs_to_json, Example -from spacy.training.align import get_alignments -from spacy.training.converters import json_to_docs -from spacy.lang.en import English -from spacy.tokens import Doc, DocBin -from spacy.util import get_words_and_spaces, minibatch -from thinc.api import compounding import pytest import srsly +from spacy.lang.en import English +from spacy.tokens import Doc, DocBin +from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets +from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo +from spacy.training import offsets_to_biluo_tags +from spacy.training.align import get_alignments +from spacy.training.converters import json_to_docs +from spacy.util import get_words_and_spaces, load_model_from_path, minibatch 
+from spacy.util import load_config_from_str +from thinc.api import compounding from ..util import make_tempdir @@ -68,6 +71,207 @@ def vocab(): return nlp.vocab +@pytest.mark.issue(999) +def test_issue999(): + """Test that adding entities and resuming training works passably OK. + There are two issues here: + 1) We have to re-add labels. This isn't very nice. + 2) There's no way to set the learning rate for the weight update, so we + end up out-of-scale, causing it to learn too fast. + """ + TRAIN_DATA = [ + ["hey", []], + ["howdy", []], + ["hey there", []], + ["hello", []], + ["hi", []], + ["i'm looking for a place to eat", []], + ["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]], + ["show me chinese restaurants", [(8, 15, "CUISINE")]], + ["show me chines restaurants", [(8, 14, "CUISINE")]], + ] + nlp = English() + ner = nlp.add_pipe("ner") + for _, offsets in TRAIN_DATA: + for start, end, label in offsets: + ner.add_label(label) + nlp.initialize() + for itn in range(20): + random.shuffle(TRAIN_DATA) + for raw_text, entity_offsets in TRAIN_DATA: + example = Example.from_dict( + nlp.make_doc(raw_text), {"entities": entity_offsets} + ) + nlp.update([example]) + + with make_tempdir() as model_dir: + nlp.to_disk(model_dir) + nlp2 = load_model_from_path(model_dir) + + for raw_text, entity_offsets in TRAIN_DATA: + doc = nlp2(raw_text) + ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents} + for start, end, label in entity_offsets: + if (start, end) in ents: + assert ents[(start, end)] == label + break + else: + if entity_offsets: + raise Exception(ents) + + +@pytest.mark.issue(4402) +def test_issue4402(): + json_data = { + "id": 0, + "paragraphs": [ + { + "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "How", "ner": "O"}, + {"id": 1, "orth": "should", "ner": "O"}, + {"id": 2, "orth": "I", "ner": "O"}, + {"id": 3, "orth": "cook", "ner": "O"}, + {"id": 4, "orth": "bacon", "ner": "O"}, + {"id": 5, "orth": "in", "ner": "O"}, + {"id": 6, "orth": "an", "ner": "O"}, + {"id": 7, "orth": "oven", "ner": "O"}, + {"id": 8, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + { + "tokens": [ + {"id": 9, "orth": "\n", "ner": "O"}, + {"id": 10, "orth": "I", "ner": "O"}, + {"id": 11, "orth": "'ve", "ner": "O"}, + {"id": 12, "orth": "heard", "ner": "O"}, + {"id": 13, "orth": "of", "ner": "O"}, + {"id": 14, "orth": "people", "ner": "O"}, + {"id": 15, "orth": "cooking", "ner": "O"}, + {"id": 16, "orth": "bacon", "ner": "O"}, + {"id": 17, "orth": "in", "ner": "O"}, + {"id": 18, "orth": "an", "ner": "O"}, + {"id": 19, "orth": "oven", "ner": "O"}, + {"id": 20, "orth": ".", "ner": "O"}, + ], + "brackets": [], + }, + ], + "cats": [ + {"label": "baking", "value": 1.0}, + {"label": "not_baking", "value": 0.0}, + ], + }, + { + "raw": "What is the difference between white and brown eggs?\n", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "What", "ner": "O"}, + {"id": 1, "orth": "is", "ner": "O"}, + {"id": 2, "orth": "the", "ner": "O"}, + {"id": 3, "orth": "difference", "ner": "O"}, + {"id": 4, "orth": "between", "ner": "O"}, + {"id": 5, "orth": "white", "ner": "O"}, + {"id": 6, "orth": "and", "ner": "O"}, + {"id": 7, "orth": "brown", "ner": "O"}, + {"id": 8, "orth": "eggs", "ner": "O"}, + {"id": 9, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, + ], + "cats": [ + {"label": "baking", "value": 
0.0}, + {"label": "not_baking", "value": 1.0}, + ], + }, + ], + } + nlp = English() + attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] + with make_tempdir() as tmpdir: + output_file = tmpdir / "test4402.spacy" + docs = json_to_docs([json_data]) + data = DocBin(docs=docs, attrs=attrs).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + reader = Corpus(output_file) + train_data = list(reader(nlp)) + assert len(train_data) == 2 + + split_train_data = [] + for eg in train_data: + split_train_data.extend(eg.split_sents()) + assert len(split_train_data) == 4 + + +CONFIG_7029 = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +@pytest.mark.issue(7029) +def test_issue7029(): + """Test that an empty document doesn't mess up an entire batch.""" + TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), + ] + nlp = English.from_config(load_config_from_str(CONFIG_7029)) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] + docs1 = list(nlp.pipe(texts, batch_size=1)) + docs2 = list(nlp.pipe(texts, batch_size=4)) + assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] + + def test_gold_biluo_U(en_vocab): words = ["I", "flew", "to", "London", "."] spaces = [True, True, True, False, True] diff --git a/spacy/tests/universe/test_universe_json.py b/spacy/tests/universe/test_universe_json.py deleted file mode 100644 index 295889186..000000000 --- a/spacy/tests/universe/test_universe_json.py +++ /dev/null @@ -1,17 +0,0 @@ -import json -import re -from pathlib import Path - - -def test_universe_json(): - - root_dir = Path(__file__).parent - universe_file = root_dir / "universe.json" - - with universe_file.open() as f: - universe_data = json.load(f) - for entry in universe_data["resources"]: - if "github" in entry: - assert not re.match( - r"^(http:)|^(https:)", entry["github"] - ), "Github field should be user/repo, not a url" diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index b6fee6628..d91f41db3 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -1,7 +1,25 @@ -import pytest import numpy +import pytest from spacy.attrs import IS_ALPHA, IS_DIGIT +from spacy.lookups import Lookups +from spacy.tokens import Doc from spacy.util import OOV_RANK +from spacy.vocab import Vocab + + +@pytest.mark.issue(361) 
+@pytest.mark.parametrize("text1,text2", [("cat", "dog")]) +def test_issue361(en_vocab, text1, text2): + """Test Issue #361: Equality of lexemes""" + assert en_vocab[text1] == en_vocab[text1] + assert en_vocab[text1] != en_vocab[text2] + + +@pytest.mark.issue(600) +def test_issue600(): + vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) + doc = Doc(vocab, words=["hello"]) + doc[0].tag_ = "NN" @pytest.mark.parametrize("text1,prob1,text2,prob2", [("NOUN", -1, "opera", -2)]) diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index b5f7303b5..47cd1f060 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -16,6 +16,16 @@ def vocab(en_vocab, vectors): return en_vocab +@pytest.mark.issue(2219) +def test_issue2219(en_vocab): + """Test if indexing issue still occurs during Token-Token similarity""" + vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])] + add_vecs_to_vocab(en_vocab, vectors) + [(word1, vec1), (word2, vec2)] = vectors + doc = Doc(en_vocab, words=[word1, word2]) + assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) + + def test_vectors_similarity_LL(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors lex1 = vocab[word1] @@ -25,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors): assert lex1.vector_norm != 0 assert lex2.vector_norm != 0 assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1] + assert isinstance(lex1.similarity(lex2), float) assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2)) assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1)) @@ -37,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors): assert doc[0].vector_norm != 0 assert doc[1].vector_norm != 0 assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1] + assert isinstance(doc[0].similarity(doc[1]), float) assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2)) assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1])) +def test_vectors_similarity_SS(vocab, vectors): + [(word1, vec1), (word2, vec2)] = vectors + doc = Doc(vocab, words=[word1, word2]) + assert isinstance(doc[0:1].similarity(doc[0:2]), float) + assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1]) + + +def test_vectors_similarity_DD(vocab, vectors): + [(word1, vec1), (word2, vec2)] = vectors + doc1 = Doc(vocab, words=[word1, word2]) + doc2 = Doc(vocab, words=[word2, word1]) + assert isinstance(doc1.similarity(doc2), float) + assert doc1.similarity(doc2) == doc2.similarity(doc1) + + def test_vectors_similarity_TD(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) with pytest.warns(UserWarning): + assert isinstance(doc.similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc), float) assert doc.similarity(doc[0]) == doc[0].similarity(doc) -def test_vectors_similarity_DS(vocab, vectors): - [(word1, vec1), (word2, vec2)] = vectors - doc = Doc(vocab, words=[word1, word2]) - assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) - - def test_vectors_similarity_TS(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) with pytest.warns(UserWarning): + assert isinstance(doc[:2].similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc[-2]), float) assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) + + +def test_vectors_similarity_DS(vocab, vectors): + [(word1, vec1), (word2, 
vec2)] = vectors + doc = Doc(vocab, words=[word1, word2]) + assert isinstance(doc.similarity(doc[:2]), float) + assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index f2e74c3c9..ffd7489b2 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -1,14 +1,15 @@ -import pytest import numpy -from numpy.testing import assert_allclose, assert_equal, assert_almost_equal -from thinc.api import get_current_ops +import pytest +from numpy.testing import assert_allclose, assert_almost_equal, assert_equal +from thinc.api import NumpyOps, get_current_ops + from spacy.lang.en import English -from spacy.vocab import Vocab -from spacy.vectors import Vectors -from spacy.tokenizer import Tokenizer from spacy.strings import hash_string # type: ignore +from spacy.tokenizer import Tokenizer from spacy.tokens import Doc from spacy.training.initialize import convert_vectors +from spacy.vectors import Vectors +from spacy.vocab import Vocab from ..util import add_vecs_to_vocab, get_cosine, make_tempdir @@ -65,6 +66,79 @@ def tokenizer_v(vocab): return Tokenizer(vocab, {}, None, None, None) +@pytest.mark.issue(1518) +def test_issue1518(): + """Test vectors.resize() works.""" + vectors = Vectors(shape=(10, 10)) + vectors.add("hello", row=2) + vectors.resize((5, 9)) + + +@pytest.mark.issue(1539) +def test_issue1539(): + """Ensure vectors.resize() doesn't try to modify dictionary during iteration.""" + v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100]) + v.resize((100, 100)) + + +@pytest.mark.issue(1807) +def test_issue1807(): + """Test vocab.set_vector also adds the word to the vocab.""" + vocab = Vocab(vectors_name="test_issue1807") + assert "hello" not in vocab + vocab.set_vector("hello", numpy.ones((50,), dtype="f")) + assert "hello" in vocab + + +@pytest.mark.issue(2871) +def test_issue2871(): + """Test that vectors recover the correct key for spaCy reserved words.""" + words = ["dog", "cat", "SUFFIX"] + vocab = Vocab(vectors_name="test_issue2871") + vocab.vectors.resize(shape=(3, 10)) + vector_data = numpy.zeros((3, 10), dtype="f") + for word in words: + _ = vocab[word] # noqa: F841 + vocab.set_vector(word, vector_data[0]) + vocab.vectors.name = "dummy_vectors" + assert vocab["dog"].rank == 0 + assert vocab["cat"].rank == 1 + assert vocab["SUFFIX"].rank == 2 + assert vocab.vectors.find(key="dog") == 0 + assert vocab.vectors.find(key="cat") == 1 + assert vocab.vectors.find(key="SUFFIX") == 2 + + +@pytest.mark.issue(3412) +def test_issue3412(): + data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f") + vectors = Vectors(data=data, keys=["A", "B", "C"]) + keys, best_rows, scores = vectors.most_similar( + numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f") + ) + assert best_rows[0] == 2 + + +@pytest.mark.issue(4725) +def test_issue4725_2(): + if isinstance(get_current_ops, NumpyOps): + # ensures that this runs correctly and doesn't hang or crash because of the global vectors + # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. 
on Windows), + # or because of issues with pickling the NER (cf test_issue4725_1) + vocab = Vocab(vectors_name="test_vocab_add_vector") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + nlp = English(vocab=vocab) + nlp.add_pipe("ner") + nlp.initialize() + docs = ["Kurt is in London."] * 10 + for _ in nlp.pipe(docs, batch_size=2, n_process=2): + pass + + def test_init_vectors_with_resize_shape(strings, resize_data): v = Vectors(shape=(len(strings), 3)) v.resize(shape=resize_data.shape) @@ -347,7 +421,7 @@ def test_vector_is_oov(): def test_init_vectors_unset(): v = Vectors(shape=(10, 10)) assert v.is_full is False - assert v.data.shape == (10, 10) + assert v.shape == (10, 10) with pytest.raises(ValueError): v = Vectors(shape=(10, 10), mode="floret") @@ -440,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): # rows: 2 rows per ngram rows = OPS.xp.asarray( [ - h % nlp.vocab.vectors.data.shape[0] + h % nlp.vocab.vectors.shape[0] for ngram in ngrams for h in nlp.vocab.vectors._get_ngram_hashes(ngram) ], @@ -461,6 +535,10 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): # every word has a vector assert nlp.vocab[word * 5].has_vector + # n_keys is -1 for floret + assert nlp_plain.vocab.vectors.n_keys > 0 + assert nlp.vocab.vectors.n_keys == -1 + # check that single and batched vector lookups are identical words = [s for s in nlp_plain.vocab.vectors] single_vecs = OPS.to_numpy(OPS.asarray([nlp.vocab[word].vector for word in words])) @@ -470,17 +548,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): # an empty key returns 0s assert_equal( OPS.to_numpy(nlp.vocab[""].vector), - numpy.zeros((nlp.vocab.vectors.data.shape[0],)), + numpy.zeros((nlp.vocab.vectors.shape[0],)), ) # an empty batch returns 0s assert_equal( OPS.to_numpy(nlp.vocab.vectors.get_batch([""])), - numpy.zeros((1, nlp.vocab.vectors.data.shape[0])), + numpy.zeros((1, nlp.vocab.vectors.shape[0])), ) # an empty key within a batch returns 0s assert_equal( OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]), - numpy.zeros((nlp.vocab.vectors.data.shape[0],)), + numpy.zeros((nlp.vocab.vectors.shape[0],)), ) # the loaded ngram vector table cannot be modified diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index 56ef1d108..16cf80a08 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -1,6 +1,19 @@ import pytest -from spacy.attrs import LEMMA, ORTH, IS_ALPHA +from spacy.attrs import IS_ALPHA, LEMMA, ORTH from spacy.parts_of_speech import NOUN, VERB +from spacy.vocab import Vocab + + +@pytest.mark.issue(1868) +def test_issue1868(): + """Test Vocab.__contains__ works with int keys.""" + vocab = Vocab() + lex = vocab["hello"] + assert lex.orth in vocab + assert lex.orth_ in vocab + assert "some string" not in vocab + int_id = vocab.strings.add("some string") + assert int_id not in vocab @pytest.mark.parametrize( diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index f8df13610..91f228032 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -45,10 +45,12 @@ cdef class Tokenizer: `re.compile(string).search` to match suffixes. `infix_finditer` (callable): A function matching the signature of `re.compile(string).finditer` to find infixes. 
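The docstring change below clarifies that `token_match` and `url_match` follow the `re.compile(string).match` signature, i.e. they return a match object or `None` rather than a plain boolean. A small sketch; the ticket-ID pattern is purely hypothetical:

```python
import re
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
ticket_match = re.compile(r"^[A-Z]+-\d+$").match  # hypothetical pattern
# .match returns a match object or None, which is what the Tokenizer expects
assert ticket_match("ABC-123") is not None
assert ticket_match("hello") is None

tokenizer = Tokenizer(nlp.vocab, token_match=ticket_match)
doc = tokenizer("Ticket ABC-123 is open")
assert [t.text for t in doc] == ["Ticket", "ABC-123", "is", "open"]
```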
- token_match (callable): A boolean function matching strings to be + token_match (callable): A function matching the signature of + `re.compile(string).match`, for matching strings to be recognized as tokens. - url_match (callable): A boolean function matching strings to be - recognized as tokens after considering prefixes and suffixes. + url_match (callable): A function matching the signature of + `re.compile(string).match`, for matching strings to be + recognized as urls. EXAMPLE: >>> tokenizer = Tokenizer(nlp.vocab) @@ -681,6 +683,8 @@ cdef class Tokenizer: infixes = infix_finditer(substring) offset = 0 for match in infixes: + if offset == 0 and match.start() == 0: + continue if substring[offset : match.start()]: tokens.append(("TOKEN", substring[offset : match.start()])) if substring[match.start() : match.end()]: diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py index 470d3430f..8643243fa 100644 --- a/spacy/tokens/_dict_proxies.py +++ b/spacy/tokens/_dict_proxies.py @@ -6,6 +6,7 @@ import srsly from .span_group import SpanGroup from ..errors import Errors + if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports from .doc import Doc # noqa: F401 @@ -19,6 +20,8 @@ if TYPE_CHECKING: class SpanGroups(UserDict): """A dict-like proxy held by the Doc, to control access to span groups.""" + _EMPTY_BYTES = srsly.msgpack_dumps([]) + def __init__( self, doc: "Doc", items: Iterable[Tuple[str, SpanGroup]] = tuple() ) -> None: @@ -43,11 +46,13 @@ class SpanGroups(UserDict): def to_bytes(self) -> bytes: # We don't need to serialize this as a dict, because the groups # know their names. + if len(self) == 0: + return self._EMPTY_BYTES msg = [value.to_bytes() for value in self.values()] return srsly.msgpack_dumps(msg) def from_bytes(self, bytes_data: bytes) -> "SpanGroups": - msg = srsly.msgpack_loads(bytes_data) + msg = [] if bytes_data == self._EMPTY_BYTES else srsly.msgpack_loads(bytes_data) self.clear() doc = self._ensure_doc() for value_bytes in msg: diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index bd2bdb811..2b72adb4d 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -12,6 +12,7 @@ from ..compat import copy_reg from ..attrs import SPACY, ORTH, intify_attr, IDS from ..errors import Errors from ..util import ensure_path, SimpleFrozenList +from ._dict_proxies import SpanGroups # fmt: off ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START") @@ -146,7 +147,7 @@ class DocBin: doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) # type: ignore doc = doc.from_array(self.attrs, tokens) # type: ignore doc.cats = self.cats[i] - if self.span_groups[i]: + if self.span_groups[i] != SpanGroups._EMPTY_BYTES: doc.spans.from_bytes(self.span_groups[i]) else: doc.spans.clear() diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index f540002c9..7e9340d58 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -10,7 +10,7 @@ from ..lexeme import Lexeme from ..vocab import Vocab from .underscore import Underscore from pathlib import Path -import numpy +import numpy as np class DocMethod(Protocol): def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... 
# type: ignore[misc] @@ -26,7 +26,7 @@ class Doc: user_hooks: Dict[str, Callable[..., Any]] user_token_hooks: Dict[str, Callable[..., Any]] user_span_hooks: Dict[str, Callable[..., Any]] - tensor: numpy.ndarray + tensor: np.ndarray[Any, np.dtype[np.float_]] user_data: Dict[str, Any] has_unknown_spaces: bool _context: Any @@ -144,7 +144,7 @@ class Doc: ) -> Doc: ... def to_array( self, py_attr_ids: Union[int, str, List[Union[int, str]]] - ) -> numpy.ndarray: ... + ) -> np.ndarray[Any, np.dtype[np.float_]]: ... @staticmethod def from_docs( docs: List[Doc], diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 362a17784..d33764ac9 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -420,6 +420,8 @@ cdef class Doc: cdef int range_start = 0 if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]: attr = SENT_START + elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]: + attr = SENT_START attr = intify_attr(attr) # adjust attributes if attr == HEAD: @@ -616,7 +618,7 @@ cdef class Doc: """ if "has_vector" in self.user_hooks: return self.user_hooks["has_vector"](self) - elif self.vocab.vectors.data.size: + elif self.vocab.vectors.size: return True elif self.tensor.size: return True @@ -641,7 +643,7 @@ cdef class Doc: if not len(self): self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") return self._vector - elif self.vocab.vectors.data.size > 0: + elif self.vocab.vectors.size > 0: self._vector = sum(t.vector for t in self) / len(self) return self._vector elif self.tensor.size > 0: @@ -1183,7 +1185,7 @@ cdef class Doc: token_offset = -1 for doc in docs[:-1]: token_offset += len(doc) - if not (len(doc) > 0 and doc[-1].is_space): + if len(doc) > 0 and not doc[-1].is_space: concat_spaces[token_offset] = True concat_array = numpy.concatenate(arrays) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 96f843a33..4b0c724e5 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -126,38 +126,26 @@ cdef class Span: return False else: return True + self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.doc) + other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.doc) # < if op == 0: - return self.c.start_char < other.c.start_char + return self_tuple < other_tuple # <= elif op == 1: - return self.c.start_char <= other.c.start_char + return self_tuple <= other_tuple # == elif op == 2: - # Do the cheap comparisons first - return ( - (self.c.start_char == other.c.start_char) and \ - (self.c.end_char == other.c.end_char) and \ - (self.c.label == other.c.label) and \ - (self.c.kb_id == other.c.kb_id) and \ - (self.doc == other.doc) - ) + return self_tuple == other_tuple # != elif op == 3: - # Do the cheap comparisons first - return not ( - (self.c.start_char == other.c.start_char) and \ - (self.c.end_char == other.c.end_char) and \ - (self.c.label == other.c.label) and \ - (self.c.kb_id == other.c.kb_id) and \ - (self.doc == other.doc) - ) + return self_tuple != other_tuple # > elif op == 4: - return self.c.start_char > other.c.start_char + return self_tuple > other_tuple # >= elif op == 5: - return self.c.start_char >= other.c.start_char + return self_tuple >= other_tuple def __hash__(self): return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id)) @@ -364,8 +352,10 @@ cdef class Span: return 0.0 vector = self.vector xp = get_array_module(vector) - return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) - + 
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() + cpdef np.ndarray to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document. @@ -404,6 +394,10 @@ cdef class Span: """ if "sent" in self.doc.user_span_hooks: return self.doc.user_span_hooks["sent"](self) + elif "sents" in self.doc.user_hooks: + for sentence in self.doc.user_hooks["sents"](self.doc): + if sentence.start <= self.start < sentence.end: + return sentence # Use `sent_start` token attribute to find sentence boundaries cdef int n = 0 if self.doc.has_annotation("SENT_START"): @@ -422,10 +416,51 @@ cdef class Span: else: raise ValueError(Errors.E030) + @property + def sents(self): + """Obtain the sentences that contain this span. If the given span + crosses sentence boundaries, return all sentences it is a part of. + + RETURNS (Iterable[Span]): All sentences that the span is a part of. + + DOCS: https://spacy.io/api/span#sents + """ + cdef int start + cdef int i + + if "sents" in self.doc.user_span_hooks: + yield from self.doc.user_span_hooks["sents"](self) + elif "sents" in self.doc.user_hooks: + for sentence in self.doc.user_hooks["sents"](self.doc): + if sentence.end > self.start: + if sentence.start < self.end or sentence.start == self.start == self.end: + yield sentence + else: + break + else: + if not self.doc.has_annotation("SENT_START"): + raise ValueError(Errors.E030) + # Use `sent_start` token attribute to find sentence boundaries + # Find start of the 1st sentence of the Span + start = self.start + while self.doc.c[start].sent_start != 1 and start > 0: + start -= 1 + + # Now, find all the sentences in the span + for i in range(start + 1, self.doc.length): + if self.doc.c[i].sent_start == 1: + yield Span(self.doc, start, i) + start = i + if start >= self.end: + break + if start < self.end: + yield Span(self.doc, start, self.end) + + @property def ents(self): - """The named entities in the span. Returns a tuple of named entity - `Span` objects, if the entity recognizer has been applied. + """The named entities that fall completely within the span. Returns + a tuple of `Span` objects. RETURNS (tuple): Entities in the span, one `Span` per entity. @@ -452,7 +487,7 @@ cdef class Span: """ if "has_vector" in self.doc.user_span_hooks: return self.doc.user_span_hooks["has_vector"](self) - elif self.vocab.vectors.data.size > 0: + elif self.vocab.vectors.size > 0: return any(token.has_vector for token in self) elif self.doc.tensor.size > 0: return True diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index aa97e2b07..d14930348 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads from .. import parts_of_speech from ..errors import Errors, Warnings +from ..attrs import IOB_STRINGS from .underscore import Underscore, get_ext_args @@ -209,8 +210,10 @@ cdef class Token: return 0.0 vector = self.vector xp = get_array_module(vector) - return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) - + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() + def has_morph(self): """Check whether the token has annotated morph information. 
Return False when the morph annotation is unset/missing. @@ -484,8 +487,6 @@ cdef class Token: RETURNS (bool / None): Whether the token starts a sentence. None if unknown. - - DOCS: https://spacy.io/api/token#is_sent_start """ def __get__(self): if self.c.sent_start == 0: @@ -743,7 +744,7 @@ cdef class Token: @classmethod def iob_strings(cls): - return ("", "I", "O", "B") + return IOB_STRINGS @property def ent_iob_(self): diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index 7fa7bf095..e9a4e1862 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -1,17 +1,31 @@ -from typing import Dict, Any +from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING import functools import copy - from ..errors import Errors +if TYPE_CHECKING: + from .doc import Doc + from .span import Span + from .token import Token + class Underscore: mutable_types = (dict, list, set) doc_extensions: Dict[Any, Any] = {} span_extensions: Dict[Any, Any] = {} token_extensions: Dict[Any, Any] = {} + _extensions: Dict[str, Any] + _obj: Union["Doc", "Span", "Token"] + _start: Optional[int] + _end: Optional[int] - def __init__(self, extensions, obj, start=None, end=None): + def __init__( + self, + extensions: Dict[str, Any], + obj: Union["Doc", "Span", "Token"], + start: Optional[int] = None, + end: Optional[int] = None, + ): object.__setattr__(self, "_extensions", extensions) object.__setattr__(self, "_obj", obj) # Assumption is that for doc values, _start and _end will both be None @@ -23,12 +37,12 @@ class Underscore: object.__setattr__(self, "_start", start) object.__setattr__(self, "_end", end) - def __dir__(self): + def __dir__(self) -> List[str]: # Hack to enable autocomplete on custom extensions extensions = list(self._extensions.keys()) return ["set", "get", "has"] + extensions - def __getattr__(self, name): + def __getattr__(self, name: str) -> Any: if name not in self._extensions: raise AttributeError(Errors.E046.format(name=name)) default, method, getter, setter = self._extensions[name] @@ -56,7 +70,7 @@ class Underscore: return new_default return default - def __setattr__(self, name, value): + def __setattr__(self, name: str, value: Any): if name not in self._extensions: raise AttributeError(Errors.E047.format(name=name)) default, method, getter, setter = self._extensions[name] @@ -65,28 +79,30 @@ class Underscore: else: self._doc.user_data[self._get_key(name)] = value - def set(self, name, value): + def set(self, name: str, value: Any): return self.__setattr__(name, value) - def get(self, name): + def get(self, name: str) -> Any: return self.__getattr__(name) - def has(self, name): + def has(self, name: str) -> bool: return name in self._extensions - def _get_key(self, name): + def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]: return ("._.", name, self._start, self._end) @classmethod - def get_state(cls): + def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]: return cls.token_extensions, cls.span_extensions, cls.doc_extensions @classmethod - def load_state(cls, state): + def load_state( + cls, state: Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]] + ) -> None: cls.token_extensions, cls.span_extensions, cls.doc_extensions = state -def get_ext_args(**kwargs): +def get_ext_args(**kwargs: Any): """Validate and convert arguments. 
Reused in Doc, Token and Span.""" default = kwargs.get("default") getter = kwargs.get("getter") diff --git a/spacy/training/augment.py b/spacy/training/augment.py index 63b54034c..59a39c7ee 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -1,4 +1,5 @@ from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING +from typing import Optional import random import itertools from functools import partial @@ -11,32 +12,87 @@ if TYPE_CHECKING: from ..language import Language # noqa: F401 -class OrthVariantsSingle(BaseModel): - tags: List[StrictStr] - variants: List[StrictStr] +@registry.augmenters("spacy.combined_augmenter.v1") +def create_combined_augmenter( + lower_level: float, + orth_level: float, + orth_variants: Optional[Dict[str, List[Dict]]], + whitespace_level: float, + whitespace_per_token: float, + whitespace_variants: Optional[List[str]], +) -> Callable[["Language", Example], Iterator[Example]]: + """Create a data augmentation callback that uses orth-variant replacement. + The callback can be added to a corpus or other data iterator during training. + + lower_level (float): The percentage of texts that will be lowercased. + orth_level (float): The percentage of texts that will be augmented. + orth_variants (Optional[Dict[str, List[Dict]]]): A dictionary containing the + single and paired orth variants. Typically loaded from a JSON file. + whitespace_level (float): The percentage of texts that will have whitespace + tokens inserted. + whitespace_per_token (float): The number of whitespace tokens to insert in + the modified doc as a percentage of the doc length. + whitespace_variants (Optional[List[str]]): The whitespace token texts. + RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter. + """ + return partial( + combined_augmenter, + lower_level=lower_level, + orth_level=orth_level, + orth_variants=orth_variants, + whitespace_level=whitespace_level, + whitespace_per_token=whitespace_per_token, + whitespace_variants=whitespace_variants, + ) -class OrthVariantsPaired(BaseModel): - tags: List[StrictStr] - variants: List[List[StrictStr]] - - -class OrthVariants(BaseModel): - paired: List[OrthVariantsPaired] = [] - single: List[OrthVariantsSingle] = [] +def combined_augmenter( + nlp: "Language", + example: Example, + *, + lower_level: float = 0.0, + orth_level: float = 0.0, + orth_variants: Optional[Dict[str, List[Dict]]] = None, + whitespace_level: float = 0.0, + whitespace_per_token: float = 0.0, + whitespace_variants: Optional[List[str]] = None, +) -> Iterator[Example]: + if random.random() < lower_level: + example = make_lowercase_variant(nlp, example) + if orth_variants and random.random() < orth_level: + raw_text = example.text + orig_dict = example.to_dict() + variant_text, variant_token_annot = make_orth_variants( + nlp, + raw_text, + orig_dict["token_annotation"], + orth_variants, + lower=False, + ) + orig_dict["token_annotation"] = variant_token_annot + example = example.from_dict(nlp.make_doc(variant_text), orig_dict) + if whitespace_variants and random.random() < whitespace_level: + for _ in range(int(len(example.reference) * whitespace_per_token)): + example = make_whitespace_variant( + nlp, + example, + random.choice(whitespace_variants), + random.randrange(0, len(example.reference)), + ) + yield example @registry.augmenters("spacy.orth_variants.v1") def create_orth_variants_augmenter( - level: float, lower: float, orth_variants: OrthVariants + level: float, lower: float, orth_variants: Dict[str, List[Dict]] ) -> 
Callable[["Language", Example], Iterator[Example]]: """Create a data augmentation callback that uses orth-variant replacement. The callback can be added to a corpus or other data iterator during training. level (float): The percentage of texts that will be augmented. lower (float): The percentage of texts that will be lowercased. - orth_variants (Dict[str, dict]): A dictionary containing the single and - paired orth variants. Typically loaded from a JSON file. + orth_variants (Dict[str, List[Dict]]): A dictionary containing + the single and paired orth variants. Typically loaded from a JSON file. RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter. """ return partial( @@ -67,16 +123,20 @@ def lower_casing_augmenter( if random.random() >= level: yield example else: - example_dict = example.to_dict() - doc = nlp.make_doc(example.text.lower()) - example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in example.reference] - yield example.from_dict(doc, example_dict) + yield make_lowercase_variant(nlp, example) + + +def make_lowercase_variant(nlp: "Language", example: Example): + example_dict = example.to_dict() + doc = nlp.make_doc(example.text.lower()) + example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in example.reference] + return example.from_dict(doc, example_dict) def orth_variants_augmenter( nlp: "Language", example: Example, - orth_variants: Dict, + orth_variants: Dict[str, List[Dict]], *, level: float = 0.0, lower: float = 0.0, @@ -148,10 +208,132 @@ def make_orth_variants( pair_idx = pair.index(words[word_idx]) words[word_idx] = punct_choices[punct_idx][pair_idx] token_dict["ORTH"] = words - # construct modified raw text from words and spaces + raw = construct_modified_raw_text(token_dict) + return raw, token_dict + + +def make_whitespace_variant( + nlp: "Language", + example: Example, + whitespace: str, + position: int, +) -> Example: + """Insert the whitespace token at the specified token offset in the doc. + This is primarily intended for v2-compatible training data that doesn't + include links or spans. If the document includes links, spans, or partial + dependency annotation, it is returned without modifications. + + The augmentation follows the basics of the v2 space attachment policy, but + without a distinction between "real" and other tokens, so space tokens + may be attached to space tokens: + - at the beginning of a sentence attach the space token to the following + token + - otherwise attach the space token to the preceding token + + The augmenter does not attempt to consolidate adjacent whitespace in the + same way that the tokenizer would. + + The following annotation is used for the space token: + TAG: "_SP" + MORPH: "" + POS: "SPACE" + LEMMA: ORTH + DEP: "dep" + SENT_START: False + + The annotation for each attribute is only set for the space token if there + is already at least partial annotation for that attribute in the original + example. + + RETURNS (Example): Example with one additional space token. 
+ """ + example_dict = example.to_dict() + doc_dict = example_dict.get("doc_annotation", {}) + token_dict = example_dict.get("token_annotation", {}) + # returned unmodified if: + # - doc is empty + # - words are not defined + # - links are defined (only character-based offsets, which is more a quirk + # of Example.to_dict than a technical constraint) + # - spans are defined + # - there are partial dependencies + if ( + len(example.reference) == 0 + or "ORTH" not in token_dict + or len(doc_dict.get("links", [])) > 0 + or len(example.reference.spans) > 0 + or ( + example.reference.has_annotation("DEP") + and not example.reference.has_annotation("DEP", require_complete=True) + ) + ): + return example + words = token_dict.get("ORTH", []) + length = len(words) + assert 0 <= position <= length + if example.reference.has_annotation("ENT_TYPE"): + # I-ENTITY if between B/I-ENTITY and I/L-ENTITY otherwise O + entity = "O" + if position > 1 and position < length: + ent_prev = doc_dict["entities"][position - 1] + ent_next = doc_dict["entities"][position] + if "-" in ent_prev and "-" in ent_next: + ent_iob_prev = ent_prev.split("-")[0] + ent_type_prev = ent_prev.split("-", 1)[1] + ent_iob_next = ent_next.split("-")[0] + ent_type_next = ent_next.split("-", 1)[1] + if ( + ent_iob_prev in ("B", "I") + and ent_iob_next in ("I", "L") + and ent_type_prev == ent_type_next + ): + entity = f"I-{ent_type_prev}" + doc_dict["entities"].insert(position, entity) + else: + del doc_dict["entities"] + token_dict["ORTH"].insert(position, whitespace) + token_dict["SPACY"].insert(position, False) + if example.reference.has_annotation("TAG"): + token_dict["TAG"].insert(position, "_SP") + else: + del token_dict["TAG"] + if example.reference.has_annotation("LEMMA"): + token_dict["LEMMA"].insert(position, whitespace) + else: + del token_dict["LEMMA"] + if example.reference.has_annotation("POS"): + token_dict["POS"].insert(position, "SPACE") + else: + del token_dict["POS"] + if example.reference.has_annotation("MORPH"): + token_dict["MORPH"].insert(position, "") + else: + del token_dict["MORPH"] + if example.reference.has_annotation("DEP", require_complete=True): + if position == 0: + token_dict["HEAD"].insert(position, 0) + else: + token_dict["HEAD"].insert(position, position - 1) + for i in range(len(token_dict["HEAD"])): + if token_dict["HEAD"][i] >= position: + token_dict["HEAD"][i] += 1 + token_dict["DEP"].insert(position, "dep") + else: + del token_dict["HEAD"] + del token_dict["DEP"] + if example.reference.has_annotation("SENT_START"): + token_dict["SENT_START"].insert(position, False) + else: + del token_dict["SENT_START"] + raw = construct_modified_raw_text(token_dict) + return Example.from_dict(nlp.make_doc(raw), example_dict) + + +def construct_modified_raw_text(token_dict): + """Construct modified raw text from words and spaces.""" raw = "" for orth, spacy in zip(token_dict["ORTH"], token_dict["SPACY"]): raw += orth if spacy: raw += " " - return raw, token_dict + return raw diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index 66156b6e5..7052504cc 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -71,6 +71,7 @@ def read_conllx( ): """Yield docs, one for each sentence""" vocab = Vocab() # need vocab to make a minimal Doc + set_ents = has_ner(input_data, ner_tag_pattern) for sent in input_data.strip().split("\n\n"): lines = sent.strip().split("\n") if lines: @@ -83,6 +84,7 @@ def read_conllx( 
merge_subtokens=merge_subtokens, append_morphology=append_morphology, ner_map=ner_map, + set_ents=set_ents, ) yield doc @@ -133,6 +135,7 @@ def conllu_sentence_to_doc( merge_subtokens=False, append_morphology=False, ner_map=None, + set_ents=False, ): """Create an Example from the lines for one CoNLL-U sentence, merging subtokens and appending morphology to tags if required. @@ -188,6 +191,7 @@ def conllu_sentence_to_doc( id_ = int(id_) - 1 head = (int(head) - 1) if head not in ("0", "_") else id_ tag = pos if tag == "_" else tag + pos = pos if pos != "_" else "" morph = morph if morph != "_" else "" dep = "ROOT" if dep == "root" else dep lemmas.append(lemma) @@ -213,8 +217,10 @@ def conllu_sentence_to_doc( doc[i]._.merged_morph = morphs[i] doc[i]._.merged_lemma = lemmas[i] doc[i]._.merged_spaceafter = spaces[i] - ents = get_entities(lines, ner_tag_pattern, ner_map) - doc.ents = biluo_tags_to_spans(doc, ents) + ents = None + if set_ents: + ents = get_entities(lines, ner_tag_pattern, ner_map) + doc.ents = biluo_tags_to_spans(doc, ents) if merge_subtokens: doc = merge_conllu_subtokens(lines, doc) @@ -246,7 +252,10 @@ def conllu_sentence_to_doc( deps=deps, heads=heads, ) - doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] + if set_ents: + doc_x.ents = [ + Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents + ] return doc_x diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 732203e7b..d792c9bbf 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -159,20 +159,17 @@ cdef class Example: gold_values = self.reference.to_array([field]) output = [None] * len(self.predicted) for token in self.predicted: - if token.is_space: + values = gold_values[align[token.i].dataXd] + values = values.ravel() + if len(values) == 0: output[token.i] = None + elif len(values) == 1: + output[token.i] = values[0] + elif len(set(list(values))) == 1: + # If all aligned tokens have the same value, use it. + output[token.i] = values[0] else: - values = gold_values[align[token.i].dataXd] - values = values.ravel() - if len(values) == 0: - output[token.i] = None - elif len(values) == 1: - output[token.i] = values[0] - elif len(set(list(values))) == 1: - # If all aligned tokens have the same value, use it. - output[token.i] = values[0] - else: - output[token.i] = None + output[token.i] = None if as_string and field not in ["ENT_IOB", "SENT_START"]: output = [vocab.strings[o] if o is not None else o for o in output] return output diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 084204389..b59288e38 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -164,7 +164,7 @@ def load_vectors_into_model( len(vectors_nlp.vocab.vectors.keys()) == 0 and vectors_nlp.vocab.vectors.mode != VectorsMode.floret ) or ( - vectors_nlp.vocab.vectors.data.shape[0] == 0 + vectors_nlp.vocab.vectors.shape[0] == 0 and vectors_nlp.vocab.vectors.mode == VectorsMode.floret ): logger.warning(Warnings.W112.format(name=name)) diff --git a/spacy/util.py b/spacy/util.py index 4424f6897..2a8b9f5cc 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -63,7 +63,7 @@ OOV_RANK = numpy.iinfo(numpy.uint64).max DEFAULT_OOV_PROB = -20 LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] -# Default order of sections in the config.cfg. Not all sections needs to exist, +# Default order of sections in the config file. 
Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"] # fmt: on @@ -465,7 +465,7 @@ def load_model_from_path( """Load a model from a data directory path. Creates Language class with pipeline from config.cfg and then calls from_disk() with path. - model_path (Path): Mmodel path. + model_path (Path): Model path. meta (Dict[str, Any]): Optional model meta. vocab (Vocab / True): Optional vocab to pass in on initialization. If True, a new Vocab object will be created. @@ -642,8 +642,8 @@ def load_config( sys.stdin.read(), overrides=overrides, interpolate=interpolate ) else: - if not config_path or not config_path.exists() or not config_path.is_file(): - raise IOError(Errors.E053.format(path=config_path, name="config.cfg")) + if not config_path or not config_path.is_file(): + raise IOError(Errors.E053.format(path=config_path, name="config file")) return config.from_disk( config_path, overrides=overrides, interpolate=interpolate ) @@ -871,7 +871,6 @@ def get_package_path(name: str) -> Path: name (str): Package name. RETURNS (Path): Path to installed package. """ - name = name.lower() # use lowercase version to be safe # Here we're importing the module just to find it. This is worryingly # indirect, but it's otherwise very difficult to find the package. pkg = importlib.import_module(name) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 6d6783af4..2b1ea764b 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,5 +1,5 @@ cimport numpy as np -from libc.stdint cimport uint32_t +from libc.stdint cimport uint32_t, uint64_t from cython.operator cimport dereference as deref from libcpp.set cimport set as cppset from murmurhash.mrmr cimport hash128_x64 @@ -10,7 +10,7 @@ from typing import cast import warnings from enum import Enum import srsly -from thinc.api import get_array_module, get_current_ops +from thinc.api import Ops, get_array_module, get_current_ops from thinc.backends import get_array_ops from thinc.types import Floats2d @@ -146,7 +146,7 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#size """ - return self.data.shape[0] * self.data.shape[1] + return self.data.size @property def is_full(self): @@ -170,6 +170,8 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#n_keys """ + if self.mode == Mode.floret: + return -1 return len(self.key2row) def __reduce__(self): @@ -274,7 +276,7 @@ cdef class Vectors: self.data = resized_array self._sync_unset() removed_items = [] - for key, row in list(self.key2row.items()): + for key, row in self.key2row.copy().items(): if row >= shape[0]: self.key2row.pop(key) removed_items.append((key, row)) @@ -353,12 +355,18 @@ cdef class Vectors: key (str): The string key. RETURNS: A list of the integer hashes. """ - cdef uint32_t[4] out + # MurmurHash3_x64_128 returns an array of 2 uint64_t values. + cdef uint64_t[2] out chars = s.encode("utf8") cdef char* utf8_string = chars hash128_x64(utf8_string, len(chars), self.hash_seed, &out) - rows = [out[i] for i in range(min(self.hash_count, 4))] - return rows + rows = [ + out[0] & 0xffffffffu, + out[0] >> 32, + out[1] & 0xffffffffu, + out[1] >> 32, + ] + return rows[:min(self.hash_count, 4)] def _get_ngrams(self, unicode key): """Get all padded ngram strings using the ngram settings. 
@@ -511,6 +519,9 @@ cdef class Vectors: for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) + def to_ops(self, ops: Ops): + self.data = ops.asarray(self.data) + def _get_cfg(self): if self.mode == Mode.default: return {
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index e2e7ad1db..badd291ed 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -283,7 +283,7 @@ cdef class Vocab: @property def vectors_length(self): - return self.vectors.data.shape[1] + return self.vectors.shape[1] def reset_vectors(self, *, width=None, shape=None): """Drop the current vector table. Because all vectors must be the same @@ -294,7 +294,7 @@ cdef class Vocab: elif shape is not None: self.vectors = Vectors(strings=self.strings, shape=shape) else: - width = width if width is not None else self.vectors.data.shape[1] + width = width if width is not None else self.vectors.shape[1] self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width)) def prune_vectors(self, nr_row, batch_size=1024):
diff --git a/website/Dockerfile b/website/Dockerfile new file mode 100644 index 000000000..f71733e55 --- /dev/null +++ b/website/Dockerfile @@ -0,0 +1,16 @@ +FROM node:11.15.0 + +WORKDIR /spacy-io + +RUN npm install -g gatsby-cli@2.7.4 + +COPY package.json . +COPY package-lock.json . + +RUN npm install + +# This is so the installed node_modules will be up one directory +# from where a user mounts files, so that they don't accidentally mount +# their own node_modules from a different build +# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders +WORKDIR /spacy-io/website/
diff --git a/website/README.md b/website/README.md index 076032d92..db050cf03 100644 --- a/website/README.md +++ b/website/README.md @@ -554,6 +554,42 @@ extensions for your code editor. The [`.prettierrc`](https://github.com/explosion/spaCy/tree/master/website/.prettierrc) file in the root defines the settings used in this codebase. +## Building & developing the site with Docker {#docker} +Sometimes it's hard to get a local environment working due to rapid updates to node dependencies, +so it may be easier to use Docker to build the docs. + +If you'd like to do this, +**be sure you do *not* include your local `node_modules` folder**, +since some dependencies need to be built for the image's system. +Rename it before running the commands below. + +```bash +docker run -it \ + -v $(pwd):/spacy-io/website \ + -p 8000:8000 \ + ghcr.io/explosion/spacy-io \ + gatsby develop -H 0.0.0.0 +``` + +This will allow you to access the built website at http://0.0.0.0:8000/ +in your browser, and still edit code in your editor while having the site +reflect those changes. + +**Note**: If you're working on a Mac with an M1 processor, +you might see segfault errors from `qemu` if you use the default image. +To fix this, use the `arm64`-tagged image in the `docker run` command +(ghcr.io/explosion/spacy-io:arm64). + +### Building the Docker image {#docker-build} + +If you'd like to build the image locally, you can do so like this: + +```bash +docker build -t spacy-io . +``` + +This will take some time, so using the prebuilt image will save you a bit of time. + ## Markdown reference {#markdown} All page content and page meta lives in the `.md` files in the `/docs`
diff --git a/website/UNIVERSE.md b/website/UNIVERSE.md index d37c4561a..770bbde13 100644 --- a/website/UNIVERSE.md +++ b/website/UNIVERSE.md @@ -44,7 +44,7 @@ markup is correct.
"id": "unique-project-id", "title": "Project title", "slogan": "A short summary", - "description": "A longer description – *Mardown allowed!*", + "description": "A longer description – *Markdown allowed!*", "github": "user/repo", "pip": "package-name", "code_example": [ diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 44ba94d9e..07b76393f 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -158,7 +158,7 @@ be configured with the `attrs` argument. The suggested attributes are `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some subword information, without construction a fully character-based representation. If pretrained vectors are available, they can be included in the -representation as well, with the vectors table will be kept static (i.e. it's +representation as well, with the vectors table kept static (i.e. it's not updated). | Name | Description | @@ -296,7 +296,7 @@ learned linear projection to control the dimensionality. Unknown tokens are mapped to a zero vector. See the documentation on [static vectors](/usage/embeddings-transformers#static-vectors) for details. -| Name |  Description | +| Name | Description | | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `nO` | The output width of the layer, after the linear projection. ~~Optional[int]~~ | | `nM` | The width of the static vectors. ~~Optional[int]~~ | @@ -318,7 +318,7 @@ mapped to a zero vector. See the documentation on Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list of feature names to extract, which should refer to token attributes. -| Name |  Description | +| Name | Description | | ----------- | ------------------------------------------------------------------------ | | `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ | | **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ | diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index b872181f9..89e2e87d9 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -148,8 +148,8 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [ ### init fill-config {#init-fill-config new="3"} -Auto-fill a partial [`config.cfg` file](/usage/training#config) file with **all -default values**, e.g. a config generated with the +Auto-fill a partial [.cfg file](/usage/training#config) with **all default +values**, e.g. a config generated with the [quickstart widget](/usage/training#quickstart). Config files used for training should always be complete and not contain any hidden defaults or missing values, so this command helps you create your final training config. In order to find @@ -175,7 +175,7 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff] | Name | Description | | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | -| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. 
~~Path (positional)~~ | +| `output_file` | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | | `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | @@ -208,7 +208,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr | `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ | | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | -| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ | +| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ | | `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ | | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 986c6f458..35afc8fea 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -79,6 +79,7 @@ train/test skew. | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | | `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ | +| `shuffle` | Whether to shuffle the examples. Defaults to `False`. ~~bool~~ | ## Corpus.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index c51a6dbca..b7aedc511 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -535,7 +535,7 @@ As of spaCy v3.0, the `meta.json` **isn't** used to construct the language class and pipeline anymore and only contains meta information for reference and for creating a Python package with [`spacy package`](/api/cli#package). How to set up the `nlp` object is now defined in the -[`config.cfg`](/api/data-formats#config), which includes detailed information +[config file](/api/data-formats#config), which includes detailed information about the pipeline components and their model architectures, and all other settings and hyperparameters used to train the pipeline. It's the **single source of truth** used for loading a pipeline. diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 9836b8c21..c21328caf 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -304,7 +304,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. 
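The new `shuffle` option documented in the `Corpus` table above can be passed when constructing the reader. A small usage sketch, assuming a hypothetical `./corpus/train.spacy` file on disk and an installed spaCy version that already exposes the option:

```python
import spacy
from spacy.training import Corpus

nlp = spacy.blank("en")

# Hypothetical path to a serialized DocBin; shuffle=True reorders the
# examples each time the corpus is iterated.
train_corpus = Corpus("./corpus/train.spacy", shuffle=True)

# Calling the corpus with an nlp object yields Example objects.
examples = list(train_corpus(nlp))
print(len(examples))
```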
## Doc.has_annotation {#has_annotation tag="method"} -Check whether the doc contains annotation on a token attribute. +Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes). diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 2f7a88fbf..14b6fece4 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -65,7 +65,7 @@ architectures and their arguments and hyperparameters. | `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | -| `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER wiill learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ | +| `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER will learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ | | `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | ```python diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index fb33642f8..1ef283870 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -99,9 +99,9 @@ be a token pattern (list) or a phrase pattern (string). For example: ## EntityRuler.initialize {#initialize tag="method" new="3"} Initialize the component with data and used before training to load in rules -from a file. This method is typically called by -[`Language.initialize`](/api/language#initialize) and lets you customize -arguments it receives via the +from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method +is typically called by [`Language.initialize`](/api/language#initialize) and +lets you customize arguments it receives via the [`[initialize.components]`](/api/data-formats#config-initialize) block in the config. @@ -210,6 +210,24 @@ of dicts) or a phrase pattern (string). For more details, see the usage guide on | ---------- | ---------------------------------------------------------------- | | `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ | + +## EntityRuler.remove {#remove tag="method" new="3.2.1"} + +Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if the ID does not exist. + +> #### Example +> +> ```python +> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}] +> ruler = nlp.add_pipe("entity_ruler") +> ruler.add_patterns(patterns) +> ruler.remove("apple") +> ``` + +| Name | Description | +| ---------- | ---------------------------------------------------------------- | +| `id` | The ID of the pattern rule. ~~str~~ | + ## EntityRuler.to_disk {#to_disk tag="method"} Save the entity ruler patterns to a directory. 
The patterns will be saved as diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md index 916a5bf7f..e24c37d77 100644 --- a/website/docs/api/legacy.md +++ b/website/docs/api/legacy.md @@ -248,23 +248,6 @@ the others, but may not be as accurate, especially if texts are short. ## Loggers {#loggers} -These functions are available from `@spacy.registry.loggers`. +Logging utilities for spaCy are implemented in the [`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the functions are typically available from `@spacy.registry.loggers`. -### spacy.WandbLogger.v1 {#WandbLogger_v1} - -The first version of the [`WandbLogger`](/api/top-level#WandbLogger) did not yet -support the `log_dataset_dir` and `model_log_interval` arguments. - -> #### Example config -> -> ```ini -> [training.logger] -> @loggers = "spacy.WandbLogger.v1" -> project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] -> ``` -> -> | Name | Description | -> | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -> | `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | -> | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | +More documentation can be found in that repo's [readme](https://github.com/explosion/spacy-loggers/blob/main/README.md) file. diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 803105ba2..3e7f9dc04 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -44,6 +44,7 @@ rule-based matching are: | `SPACY` | Token has a trailing space. ~~bool~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ | +| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ | | `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | | `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ | | `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 2938b4253..ff7905bc0 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -257,8 +257,8 @@ shape `(N, M)`, where `N` is the length of the document. The values will be ## Span.ents {#ents tag="property" new="2.0.13" model="ner"} -The named entities in the span. Returns a tuple of named entity `Span` objects, -if the entity recognizer has been applied. +The named entities that fall completely within the span. Returns a tuple of +`Span` objects. > #### Example > @@ -518,6 +518,27 @@ sent = doc[sent.start : max(sent.end, span.end)] | ----------- | ------------------------------------------------------- | | **RETURNS** | The sentence span that this span is a part of. ~~Span~~ | +## Span.sents {#sents tag="property" model="sentences" new="3.2.1"} + +Returns a generator over the sentences the span belongs to. This property is only available +when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the +document by the `parser`, `senter`, `sentencizer` or some custom function. 
It +will raise an error otherwise. + +If the span happens to cross sentence boundaries, all sentences the span overlaps with will be returned. + +> #### Example +> +> ```python +> doc = nlp("Give it back! He pleaded.") +> span = doc[2:4] +> assert len(span.sents) == 2 +> ``` + +| Name | Description | +| ----------- | -------------------------------------------------------------------------- | +| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ | + ## Attributes {#attributes} | Name | Description | diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 93b6bc88b..b51864d3a 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -40,11 +40,12 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tagger", config=config) > ``` -| Setting | Description | -| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | +| Setting | Description | +| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | +| `neg_prefix` 3.2.1 | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~ | ```python %%GITHUB_SPACY/spacy/pipeline/tagger.pyx diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 47f868637..2ff569bad 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -34,7 +34,11 @@ only. Predictions will be saved to `doc.cats` as a dictionary, where the key is the name of the category and the value is a score between 0 and 1 (inclusive). For `textcat` (exclusive categories), the scores will sum to 1, while for -`textcat_multilabel` there is no particular guarantee about their sum. +`textcat_multilabel` there is no particular guarantee about their sum. 
This also +means that for `textcat`, missing values are equated to a value of 0 (i.e. +`False`) and are counted as such towards the loss and scoring metrics. This is +not the case for `textcat_multilabel`, where missing values in the gold standard +data do not influence the loss or accuracy calculations. Note that when assigning values to create training data, the score of each category must be 0 or 1. Using other values, for example to create a document diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 44a2ea9e8..3c3d12d54 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -349,23 +349,6 @@ A sequence containing the token and all the token's syntactic descendants. | ---------- | ------------------------------------------------------------------------------------ | | **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ | -## Token.is_sent_start {#is_sent_start tag="property" new="2"} - -A boolean value indicating whether the token starts a sentence. `None` if -unknown. Defaults to `True` for the first token in the `Doc`. - -> #### Example -> -> ```python -> doc = nlp("Give it back! He pleaded.") -> assert doc[4].is_sent_start -> assert not doc[5].is_sent_start -> ``` - -| Name | Description | -| ----------- | ------------------------------------------------------- | -| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ | - ## Token.has_vector {#has_vector tag="property" model="vectors"} A boolean value indicating whether a word vector is associated with the token. @@ -465,6 +448,8 @@ The L2 norm of the token's vector representation. | `is_punct` | Is the token punctuation? ~~bool~~ | | `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ | | `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ | +| `is_sent_start` | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`. | +| `is_sent_end` | Does the token end a sentence? ~~bool~~ or `None` if unknown. | | `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ | | `is_bracket` | Is the token a bracket? ~~bool~~ | | `is_quote` | Is the token a quotation mark? ~~bool~~ | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 4361db4c0..1a3e9da46 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -313,11 +313,12 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="ent", options=options) > ``` -| Name | Description | -| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | -| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | -| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. 
~~Optional[str]~~ | +| Name | Description | +| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | +| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | +| `kb_url_template` 3.2.1 | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in. ~~Optional[str]~~ | By default, displaCy comes with colors for all entity types used by [spaCy's trained pipelines](/models). If you're using custom entity types, you @@ -326,6 +327,14 @@ or pipeline package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically. +By default, displaCy links to `#` for entities without a `kb_id` set on their +span. If you wish to link an entity to their URL then consider using the +`kb_url_template` option from above. For example if the `kb_id` on a span is +`Q95` and this is a Wikidata identifier then this option can be set to +`https://www.wikidata.org/wiki/{}`. Clicking on your entity in the rendered HTML +should redirect you to their Wikidata page, in this case +`https://www.wikidata.org/wiki/Q95`. + ## registry {#registry source="spacy/util.py" new="3"} spaCy's function registry extends @@ -412,10 +421,10 @@ finished. To log each training step, a and the accuracy scores on the development set. The built-in, default logger is the ConsoleLogger, which prints results to the -console in tabular format. The +console in tabular format. The [spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as -a dependency of spaCy, enables other loggers: currently it provides one that sends -results to a [Weights & Biases](https://www.wandb.com/) dashboard. +a dependency of spaCy, enables other loggers, such as one that +sends results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers, you can [implement your own](/usage/training#custom-logging). @@ -466,7 +475,6 @@ start decreasing across epochs. - ## Readers {#readers} ### File readers {#file-readers source="github.com/explosion/srsly" new="3"} diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 84d2c00ad..a651c23b0 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -327,9 +327,9 @@ will be counted individually. In `floret` mode, the keys table is not used. > assert vectors.n_keys == 0 > ``` -| Name | Description | -| ----------- | -------------------------------------------- | -| **RETURNS** | The number of all keys in the table. ~~int~~ | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------- | +| **RETURNS** | The number of all keys in the table. Returns `-1` for floret vectors. 
~~int~~ | ## Vectors.most_similar {#most_similar tag="method"} @@ -348,7 +348,7 @@ supported for `floret` mode. > ``` | Name | Description | -| -------------- | --------------------------------------------------------------------------- | +| -------------- | --------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | | `queries` | An array with one or more vectors. ~~numpy.ndarray~~ | | _keyword-only_ | | | `batch_size` | The batch size to use. Default to `1024`. ~~int~~ | @@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch. | ------ | --------------------------------------- | | `keys` | The keys. ~~Iterable[Union[int, str]]~~ | +## Vectors.to_ops {#to_ops tag="method"} + +Change the embedding matrix to use different Thinc ops. + +> #### Example +> +> ```python +> from thinc.api import NumpyOps +> +> vectors.to_ops(NumpyOps()) +> +> ``` + +| Name | Description | +| ----- | -------------------------------------------------------- | +| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ | + ## Vectors.to_disk {#to_disk tag="method"} Save the current state to a directory. diff --git a/website/docs/images/spacy-tailored-pipelines_wide.png b/website/docs/images/spacy-tailored-pipelines_wide.png new file mode 100644 index 000000000..d1a762ebe Binary files /dev/null and b/website/docs/images/spacy-tailored-pipelines_wide.png differ diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index f748fa8d6..f8baf5588 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -831,6 +831,8 @@ def tokenizer_pseudo_code( infixes = infix_finditer(substring) offset = 0 for match in infixes: + if offset == 0 and match.start() == 0: + continue tokens.append(substring[offset : match.start()]) tokens.append(substring[match.start() : match.end()]) offset = match.end() diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 0264a2825..11fd1459d 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1479,7 +1479,7 @@ especially useful it you want to pass in a string instead of calling ### Example: Pipeline component for GPE entities and country meta data via a REST API {#component-example3} This example shows the implementation of a pipeline component that fetches -country meta data via the [REST Countries API](https://restcountries.eu), sets +country meta data via the [REST Countries API](https://restcountries.com), sets entity annotations for countries and sets custom attributes on the `Doc` and `Span` – for example, the capital, latitude/longitude coordinates and even the country flag. 
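Since the example component above now points at restcountries.com, a quick standalone check of the updated v2 endpoint can be run outside the pipeline. This is only a sketch; the exact fields returned depend on the live REST Countries API:

```python
import requests

# Fetch all countries from the updated endpoint (restcountries.com v2 API).
r = requests.get("https://restcountries.com/v2/all")
r.raise_for_status()  # raise immediately if the request failed

# Key the response by country name for easy lookup, as the component does.
countries = {c["name"]: c for c in r.json()}
print(len(countries), "countries fetched")
```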
@@ -1495,7 +1495,7 @@ from spacy.tokens import Doc, Span, Token @Language.factory("rest_countries") class RESTCountriesComponent: def __init__(self, nlp, name, label="GPE"): - r = requests.get("https://restcountries.eu/rest/v2/all") + r = requests.get("https://restcountries.com/v2/all") r.raise_for_status() # make sure requests raises an error if it fails countries = r.json() # Convert API response to dict keyed by country name for easy lookup diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index e0e787a1d..57d226913 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -213,6 +213,12 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up a quick web demo. It looks pretty similar to a config file used to define CI pipelines. +> #### Tip: Multi-line YAML syntax for long values +> +> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be +> helpful for readability with longer values such as project descriptions or +> commands that take several arguments. + ```yaml %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml ``` diff --git a/website/docs/usage/v3-2.md b/website/docs/usage/v3-2.md index 766d1c0a9..d1d45c7ba 100644 --- a/website/docs/usage/v3-2.md +++ b/website/docs/usage/v3-2.md @@ -159,7 +159,7 @@ their contributions! - All Universal Dependencies training data has been updated to v2.8. - The Catalan data, tokenizer and lemmatizer have been updated, thanks to Carlos - Rodriguez and the Barcelona Supercomputing Center! + Rodriguez, Carme Armentano and the Barcelona Supercomputing Center! - The transformer pipelines are trained using spacy-transformers v1.1, with improved IO and more options for [model config and output](/api/architectures#TransformerModel). 
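The multi-line YAML tip above is easy to verify outside of a project: a folded block scalar (`>-`) lets a long command wrap across several lines in `project.yml` while still parsing to a single string. A small sketch, using PyYAML (assumed installed, not part of spaCy itself) and an illustrative training command:

```python
import yaml  # PyYAML, assumed installed

# A folded block scalar (">-") joins the wrapped lines with spaces.
yaml_text = """
commands:
  - name: train
    script:
      - >-
        python -m spacy train configs/config.cfg
        --output training/
        --paths.train corpus/train.spacy
"""

data = yaml.safe_load(yaml_text)
print(data["commands"][0]["script"][0])
# python -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy
```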
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 1054f7626..c49b49c73 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -40,7 +40,11 @@ "label": "Resources", "items": [ { "text": "Project Templates", "url": "https://github.com/explosion/projects" }, - { "text": "v2.x Documentation", "url": "https://v2.spacy.io" } + { "text": "v2.x Documentation", "url": "https://v2.spacy.io" }, + { + "text": "Custom Solutions", + "url": "https://explosion.ai/spacy-tailored-pipelines" + } ] } ]
diff --git a/website/meta/site.json b/website/meta/site.json index b8f1a58ef..9ecaef74c 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -22,7 +22,8 @@ "list": "89ad33e698" }, "docSearch": { - "apiKey": "371e26ed49d29a27bd36273dfdaf89af", + "appId": "Y1LB128RON", + "apiKey": "bb601a1daab73e2dc66faf2b79564807", "indexName": "spacy" }, "binderUrl": "explosion/spacy-io-binder", @@ -47,7 +48,11 @@ { "text": "Usage", "url": "/usage" }, { "text": "Models", "url": "/models" }, { "text": "API Reference", "url": "/api" }, - { "text": "Online Course", "url": "https://course.spacy.io" } + { "text": "Online Course", "url": "https://course.spacy.io" }, + { + "text": "Custom Solutions", + "url": "https://explosion.ai/spacy-tailored-pipelines" + } ] }, {
diff --git a/website/meta/universe.json b/website/meta/universe.json index 7f3813a95..6374600f2 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,43 @@ { "resources": [ + { + "id": "spacypdfreader", + "title": "spacypdfreader", + "category": ["pipeline"], + "tags": ["PDF"], + "slogan": "Easy PDF to text to spaCy text extraction in Python.", + "description": "*spacypdfreader* is a Python library that allows you to convert PDF files directly into *spaCy* `Doc` objects. The library provides several built-in parsers, or you can bring your own parser.
`Doc` objects are annotated with several custom attributes including: `token._.page_number`, `doc._.page_range`, `doc._.first_page`, `doc._.last_page`, `doc._.pdf_file_name`, and `doc._.page(int)`.", + "github": "SamEdwardes/spacypdfreader", + "pip": "spacypdfreader", + "url": "https://samedwardes.github.io/spacypdfreader/", + "code_language": "python", + "author": "Sam Edwardes", + "author_links": { + "twitter": "TheReaLSamlam", + "github": "SamEdwardes", + "website": "https://samedwardes.com" + }, + "code_example": [ + "import spacy", + "from spacypdfreader import pdf_reader", + "", + "nlp = spacy.load('en_core_web_sm')", + "doc = pdf_reader('tests/data/test_pdf_01.pdf', nlp)", + "", + "# Get the page number of any token.", + "print(doc[0]._.page_number) # 1", + "print(doc[-1]._.page_number) # 4", + "", + "# Get page meta data about the PDF document.", + "print(doc._.pdf_file_name) # 'tests/data/test_pdf_01.pdf'", + "print(doc._.page_range) # (1, 4)", + "print(doc._.first_page) # 1", + "print(doc._.last_page) # 4", + "", + "# Get all of the text from a specific PDF page.", + "print(doc._.page(4)) # 'able to display the destination page (unless...'" + ] + }, { "id": "nlpcloud", "title": "NLPCloud.io", @@ -26,32 +64,6 @@ "category": ["apis", "nonpython", "standalone"], "tags": ["api", "deploy", "production"] }, - { - "id": "denomme", - "title": "denomme : Multilingual Name Detector", - "slogan": "Multilingual Name Detection", - "description": "A SpaCy extension for Spans to extract multilingual names out of documents trained on XLM-roberta backbone", - "github": "meghanabhange/denomme", - "pip": "denomme https://denomme.s3.us-east-2.amazonaws.com/xx_denomme-0.3.1/dist/xx_denomme-0.3.1.tar.gz", - "code_example": [ - "from spacy.lang.xx import MultiLanguage", - "from denomme.name import person_name_component", - "nlp = MultiLanguage()", - "nlp.add_pipe('denomme')", - "doc = nlp('Hi my name is Meghana S.R Bhange and I want to talk Asha')", - "print(doc._.person_name)", - "# ['Meghana S.R Bhange', 'Asha']" - ], - "thumb": "https://i.ibb.co/jwGVWPZ/rainbow-bohemian-logo-removebg-preview.png", - "code_language": "python", - "author": "Meghana Bhange", - "author_links": { - "github": "meghanabhange", - "twitter": "_aspiringcat" - }, - "category": ["standalone"], - "tags": ["person-name-detection"] - }, { "id": "eMFDscore", "title": "eMFDscore : Extended Moral Foundation Dictionary Scoring for Python", @@ -129,7 +141,8 @@ "website": "https://www.nr.no/~plison" }, "category": ["pipeline", "standalone", "research", "training"], - "tags": [] + "tags": [], + "spacy_version": 3 }, { "id": "numerizer", @@ -214,11 +227,11 @@ }, { "id": "spacy-textblob", - "title": "spaCyTextBlob", - "slogan": "Easy sentiment analysis for spaCy using TextBlob. Now supports spaCy 3.0!", - "thumb": "https://github.com/SamEdwardes/spaCyTextBlob/raw/main/website/static/img/logo-thumb-square-250x250.png", - "description": "spaCyTextBlob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extensions `._.polarity`, `._.subjectivity`, and `._.assessments` to `Doc`, `Span`, and `Token` objects. 
For spaCy 2 please use `pip install pip install spacytextblob==0.1.7`", - "github": "SamEdwardes/spaCyTextBlob", + "title": "spacytextblob", + "slogan": "A TextBlob sentiment analysis pipeline component for spaCy.", + "thumb": "https://github.com/SamEdwardes/spacytextblob/raw/main/docs/static/img/logo-thumb-square-250x250.png", + "description": "spacytextblob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extension `._.blob` to `Doc`, `Span`, and `Token` objects.", + "github": "SamEdwardes/spacytextblob", "pip": "spacytextblob", "code_example": [ "import spacy", @@ -228,9 +241,10 @@ "nlp.add_pipe('spacytextblob')", "text = 'I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.'", "doc = nlp(text)", - "doc._.polarity # Polarity: -0.125", - "doc._.subjectivity # Sujectivity: 0.9", - "doc._.assessments # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]" + "doc._.blob.polarity # Polarity: -0.125", + "doc._.blob.subjectivity # Subjectivity: 0.9", + "doc._.blob.sentiment_assessments.assessments # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]", + "doc._.blob.ngrams() # [WordList(['I', 'had', 'a']), WordList(['had', 'a', 'really']), WordList(['a', 'really', 'horrible']), WordList(['really', 'horrible', 'day']), WordList(['horrible', 'day', 'It']), WordList(['day', 'It', 'was']), WordList(['It', 'was', 'the']), WordList(['was', 'the', 'worst']), WordList(['the', 'worst', 'day']), WordList(['worst', 'day', 'ever']), WordList(['day', 'ever', 'But']), WordList(['ever', 'But', 'every']), WordList(['But', 'every', 'now']), WordList(['every', 'now', 'and']), WordList(['now', 'and', 'then']), WordList(['and', 'then', 'I']), WordList(['then', 'I', 'have']), WordList(['I', 'have', 'a']), WordList(['have', 'a', 'really']), WordList(['a', 'really', 'good']), WordList(['really', 'good', 'day']), WordList(['good', 'day', 'that']), WordList(['day', 'that', 'makes']), WordList(['that', 'makes', 'me']), WordList(['makes', 'me', 'happy'])]" ], "code_language": "python", "url": "https://spacytextblob.netlify.app/", @@ -241,7 +255,8 @@ "website": "https://samedwardes.com" }, "category": ["pipeline"], - "tags": ["sentiment", "textblob"] + "tags": ["sentiment", "textblob"], + "spacy_version": 3 }, { "id": "spacy-ray", @@ -940,6 +955,37 @@ "category": ["pipeline"], "tags": ["lemmatizer", "danish"] }, + { + "id": "augmenty", + "title": "Augmenty", + "slogan": "The cherry on top of your NLP pipeline", + "description": "Augmenty is an augmentation library based on spaCy for augmenting texts. 
Augmenty differs from other augmentation libraries in that it corrects (as far as possible) the token, sentence and document labels under the augmentation.", + "github": "kennethenevoldsen/augmenty", + "pip": "augmenty", + "code_example": [ + "import spacy", + "import augmenty", + "", + "nlp = spacy.load('en_core_web_md')", + "", + "docs = nlp.pipe(['Augmenty is a great tool for text augmentation'])", + "", + "ent_dict = {'ORG': [['spaCy'], ['spaCy', 'Universe']]}", + "entity_augmenter = augmenty.load('ents_replace.v1',", + " ent_dict = ent_dict, level=1)", + "", + "for doc in augmenty.docs(docs, augmenter=entity_augmenter, nlp=nlp):", + " print(doc)" + ], + "thumb": "https://github.com/KennethEnevoldsen/augmenty/blob/master/img/icon.png?raw=true", + "author": "Kenneth Enevoldsen", + "author_links": { + "github": "kennethenevoldsen", + "website": "https://www.kennethenevoldsen.com" + }, + "category": ["training", "research"], + "tags": ["training", "research", "augmentation"] + }, { "id": "dacy", "title": "DaCy", @@ -965,6 +1011,48 @@ "category": ["pipeline"], "tags": ["pipeline", "danish"] }, + { + "id": "spacy-wrap", + "title": "spaCy-wrap", + "slogan": "For Wrapping fine-tuned transformers in spaCy pipelines", + "description": "spaCy-wrap is a wrapper library for spaCy for including fine-tuned transformers from Huggingface in your spaCy pipeline allowing inclusion of existing models within existing workflows.", + "github": "kennethenevoldsen/spacy-wrap", + "pip": "spacy_wrap", + "code_example": [ + "import spacy", + "import spacy_wrap", + "", + "nlp = spacy.blank('en')", + "config = {", + " 'doc_extension_trf_data': 'clf_trf_data', # document extention for the forward pass", + " 'doc_extension_prediction': 'sentiment', # document extention for the prediction", + " 'labels': ['negative', 'neutral', 'positive'],", + " 'model': {", + " 'name': 'cardiffnlp/twitter-roberta-base-sentiment', # the model name or path of huggingface model", + "},", + "}", + "", + "transformer = nlp.add_pipe('classification_transformer', config=config)", + "transformer.model.initialize()", + "", + "doc = nlp('spaCy is a wonderful tool')", + "", + "print(doc._.clf_trf_data)", + "# TransformerData(wordpieces=...", + "print(doc._.sentiment)", + "# 'positive'", + "print(doc._.sentiment_prob)", + "# {'prob': array([0.004, 0.028, 0.969], dtype=float32), 'labels': ['negative', 'neutral', 'positive']}" + ], + "thumb": "https://raw.githubusercontent.com/KennethEnevoldsen/spacy-wrap/main/docs/_static/icon.png", + "author": "Kenneth Enevoldsen", + "author_links": { + "github": "KennethEnevoldsen", + "website": "https://www.kennethenevoldsen.com" + }, + "category": ["pipeline", "models", "training"], + "tags": ["pipeline", "models", "transformers"] + }, { "id": "textdescriptives", "title": "TextDescriptives", @@ -1752,6 +1840,23 @@ }, "category": ["courses"] }, + { + "type": "education", + "id": "applt-course", + "title": "Applied Language Technology", + "slogan": "NLP for newcomers using spaCy and Stanza", + "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. 
The learning materials assume no previous knowledge of the Python programming language.", + "url": "https://applied-language-technology.mooc.fi", + "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg", + "thumb": "https://www.mv.helsinki.fi/home/thiippal/images/applt-logo.png", + "author": "Tuomo Hiippala", + "author_links": { + "twitter": "tuomo_h", + "github": "thiippal", + "website": "https://www.mv.helsinki.fi/home/thiippal/" + }, + "category": ["courses"] + }, { "type": "education", "id": "video-spacys-ner-model", @@ -2757,6 +2862,54 @@ "website": "https://yanaiela.github.io" } }, + { + "id": "Healthsea", + "title": "Healthsea", + "slogan": "Healthsea: an end-to-end spaCy pipeline for exploring health supplement effects", + "description": "This spaCy project trains an NER model and a custom Text Classification model with Clause Segmentation and Blinding capabilities to analyze supplement reviews and their potential effects on health.", + "github": "explosion/healthsea", + "thumb": "https://github.com/explosion/healthsea/blob/main/img/Jellyfish.png", + "category": ["pipeline", "research"], + "code_example": [ + "import spacy", + "", + "nlp = spacy.load(\"en_healthsea\")", + "doc = nlp(\"This is great for joint pain.\")", + "", + "# Clause Segmentation & Blinding", + "print(doc._.clauses)", + "", + "> {", + "> \"split_indices\": [0, 7],", + "> \"has_ent\": true,", + "> \"ent_indices\": [4, 6],", + "> \"blinder\": \"_CONDITION_\",", + "> \"ent_name\": \"joint pain\",", + "> \"cats\": {", + "> \"POSITIVE\": 0.9824668169021606,", + "> \"NEUTRAL\": 0.017364952713251114,", + "> \"NEGATIVE\": 0.00002889777533710003,", + "> \"ANAMNESIS\": 0.0001394189748680219", + "> \"prediction_text\": [\"This\", \"is\", \"great\", \"for\", \"_CONDITION_\", \"!\"]", + "> }", + "", + "# Aggregated results", + "> {", + "> \"joint_pain\": {", + "> \"effects\": [\"POSITIVE\"],", + "> \"effect\": \"POSITIVE\",", + "> \"label\": \"CONDITION\",", + "> \"text\": \"joint pain\"", + "> }", + "> }" + ], + "author": "Edward Schmuhl", + "author_links": { + "github": "thomashacker", + "twitter": "aestheticedwar1", + "website": "https://explosion.ai/" + } + }, { "id": "presidio", "title": "Presidio", @@ -3618,6 +3771,65 @@ }, "category": ["pipeline"], "tags": ["pipeline", "nlp", "sentiment"] + }, + { + "id": "textnets", + "slogan": "Text analysis with networks", + "description": "textnets represents collections of texts as networks of documents and words. This provides novel possibilities for the visualization and analysis of texts.", + "github": "jboynyc/textnets", + "image": "https://user-images.githubusercontent.com/2187261/152641425-6c0fb41c-b8e0-44fb-a52a-7c1ba24eba1e.png", + "code_example": [ + "import textnets as tn", + "", + "corpus = tn.Corpus(tn.examples.moon_landing)", + "t = tn.Textnet(corpus.tokenized(), min_docs=1)", + "t.plot(label_nodes=True,", + " show_clusters=True,", + " scale_nodes_by=\"birank\",", + " scale_edges_by=\"weight\")" + ], + "author": "John Boy", + "author_links": { + "github": "jboynyc", + "twitter": "jboy" + }, + "category": ["visualizers", "standalone"] + }, + { + "id": "tmtoolkit", + "slogan": "Text mining and topic modeling toolkit", + "description": "tmtoolkit is a set of tools for text mining and topic modeling with Python developed especially for the use in the social sciences, in journalism or related disciplines. 
It aims for easy installation, extensive documentation and a clear programming interface while offering good performance on large datasets by the means of vectorized operations (via NumPy) and parallel computation (using Python’s multiprocessing module and the loky package).", + "github": "WZBSocialScienceCenter/tmtoolkit", + "code_example": [ + "# Note: This requires these setup steps:", + "# pip install tmtoolkit[recommended]", + "# python -m tmtoolkit setup en", + "from tmtoolkit.corpus import Corpus, tokens_table, lemmatize, to_lowercase, dtm", + "from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table", + "# load built-in sample dataset and use 4 worker processes", + "corp = Corpus.from_builtin_corpus('en-News100', max_workers=4)", + "# investigate corpus as dataframe", + "toktbl = tokens_table(corp)", + "print(toktbl)", + "# apply some text normalization", + "lemmatize(corp)", + "to_lowercase(corp)", + "# build sparse document-token matrix (DTM)", + "# document labels identify rows, vocabulary tokens identify columns", + "mat, doc_labels, vocab = dtm(corp, return_doc_labels=True, return_vocab=True)", + "# apply tf-idf transformation to DTM", + "# operation is applied on sparse matrix and uses few memory", + "tfidf_mat = tfidf(mat)", + "# show top 5 tokens per document ranked by tf-idf", + "top_tokens = sorted_terms_table(tfidf_mat, vocab, doc_labels, top_n=5)", + "print(top_tokens)" + ], + "author": "Markus Konrad / WZB Social Science Center", + "author_links": { + "github": "internaut", + "twitter": "_knrd" + }, + "category": ["scientific", "standalone"] } ], diff --git a/website/src/components/embed.js b/website/src/components/embed.js index 8d82bfaae..9f959bc99 100644 --- a/website/src/components/embed.js +++ b/website/src/components/embed.js @@ -3,6 +3,7 @@ import PropTypes from 'prop-types' import classNames from 'classnames' import Link from './link' +import Button from './button' import { InlineCode } from './code' import { markdownToReact } from './util' @@ -104,4 +105,23 @@ const Image = ({ src, alt, title, ...props }) => { ) } -export { YouTube, SoundCloud, Iframe, Image } +const GoogleSheet = ({ id, link, height, button = 'View full table' }) => { + return ( +
+