diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 7c3c3e0a6..0b9b0731f 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -1,6 +1,6 @@ parameters: - python_version: '' - architecture: 'x64' + python_version: "" + architecture: "x64" num_build_jobs: 2 steps: @@ -12,7 +12,7 @@ steps: - bash: | echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}" - displayName: 'Set variables' + displayName: "Set variables" - script: | python -m pip install -U build pip setuptools @@ -25,7 +25,7 @@ steps: - script: | python -m mypy spacy - displayName: 'Run mypy' + displayName: "Run mypy" condition: ne(variables['python_version'], '3.6') - task: DeleteFiles@1 @@ -52,56 +52,56 @@ steps: python -W error -c "import spacy" displayName: "Test import" -# - script: | -# python -m spacy download ca_core_news_sm -# python -m spacy download ca_core_news_md -# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" -# displayName: 'Test download CLI' -# condition: eq(variables['python_version'], '3.8') -# -# - script: | -# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" -# displayName: 'Test no warnings on load (#11713)' -# condition: eq(variables['python_version'], '3.8') -# -# - script: | -# python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping -# displayName: 'Test skip re-download (#12188)' -# condition: eq(variables['python_version'], '3.8') + - script: | + python -m spacy download ca_core_news_sm + python -m spacy download ca_core_news_md + python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" + displayName: "Test download CLI" + condition: eq(variables['python_version'], '3.9') + + - script: | + python -W error -m spacy info ca_core_news_sm | grep -q download_url + displayName: "Test download_url in info CLI" + condition: eq(variables['python_version'], '3.9') + + - script: | + python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" + displayName: "Test no warnings on load (#11713)" + condition: eq(variables['python_version'], '3.9') - script: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . - displayName: 'Test convert CLI' - condition: eq(variables['python_version'], '3.8') + displayName: "Test convert CLI" + condition: eq(variables['python_version'], '3.9') - script: | python -m spacy init config -p ner -l ca ner.cfg python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy - displayName: 'Test debug config CLI' - condition: eq(variables['python_version'], '3.8') + displayName: "Test debug config CLI" + condition: eq(variables['python_version'], '3.9') - script: | # will have errors due to sparse data, check for summary in output python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary - displayName: 'Test debug data CLI' - condition: eq(variables['python_version'], '3.8') + displayName: "Test debug data CLI" + condition: eq(variables['python_version'], '3.9') - script: | python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 - displayName: 'Test train CLI' - condition: eq(variables['python_version'], '3.8') + displayName: "Test train CLI" + condition: eq(variables['python_version'], '3.9') -# - script: | -# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" -# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir -# displayName: 'Test assemble CLI' -# condition: eq(variables['python_version'], '3.8') -# -# - script: | -# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" -# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 -# displayName: 'Test assemble CLI vectors warning' -# condition: eq(variables['python_version'], '3.8') + - script: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" + PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + displayName: "Test assemble CLI" + condition: eq(variables['python_version'], '3.9') + + - script: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" + python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 + displayName: "Test assemble CLI vectors warning" + condition: eq(variables['python_version'], '3.9') - script: | python -m pip install -U -r requirements.txt @@ -116,9 +116,3 @@ steps: python -m pytest --pyargs spacy displayName: "Run CPU tests with thinc-apple-ops" condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) - - - script: | - python .github/validate_universe_json.py website/meta/universe.json - displayName: 'Test website/meta/universe.json' - condition: eq(variables['python_version'], '3.8') - diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml deleted file mode 100644 index 555322782..000000000 --- a/.github/workflows/autoblack.yml +++ /dev/null @@ -1,45 +0,0 @@ -# GitHub Action that uses Black to reformat all Python code and submits a PR -# in regular intervals. Inspired by: https://github.com/cclauss/autoblack - -name: autoblack -on: - workflow_dispatch: # allow manual trigger - schedule: - - cron: '0 8 * * 5' # every Friday at 8am UTC - -jobs: - autoblack: - if: github.repository_owner == 'explosion' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - ref: ${{ github.head_ref }} - - uses: actions/setup-python@v4 - - run: pip install black -c requirements.txt - - name: Auto-format code if needed - run: black spacy - # We can't run black --check here because that returns a non-zero excit - # code and makes GitHub think the action failed - - name: Check for modified files - id: git-check - run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT - - - name: Create Pull Request - if: steps.git-check.outputs.modified == 'true' - uses: peter-evans/create-pull-request@v4 - with: - title: Auto-format code with black - labels: meta - commit-message: Auto-format code with black - committer: GitHub - author: explosion-bot - body: _This PR is auto-generated._ - branch: autoblack - delete-branch: true - draft: false - - name: Check outputs - if: steps.git-check.outputs.modified == 'true' - run: | - echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" - echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml index 6b472cd12..910cfdc40 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -8,6 +8,7 @@ on: jobs: explosion-bot: + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - name: Dump GitHub context diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index 8f3a151ea..6c7d7d5a6 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -13,6 +13,7 @@ on: jobs: issue-manager: + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - uses: tiangolo/issue-manager@0.4.0 diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml index 794adee85..6c3985a93 100644 --- a/.github/workflows/lock.yml +++ b/.github/workflows/lock.yml @@ -13,6 +13,7 @@ concurrency: jobs: action: + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - uses: dessant/lock-threads@v4 diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml index 837aaeb33..33851fbcc 100644 --- a/.github/workflows/spacy_universe_alert.yml +++ b/.github/workflows/spacy_universe_alert.yml @@ -7,6 +7,7 @@ on: jobs: build: + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 000000000..f226057c9 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,173 @@ +name: tests + +on: + push: + branches-ignore: + - "spacy.io" + - "nightly.spacy.io" + - "v2.spacy.io" + paths-ignore: + - "*.md" + - "*.mdx" + - "website/**" + - ".github/workflows/**" + pull_request: + types: [opened, synchronize, reopened, edited] + paths-ignore: + - "*.md" + - "*.mdx" + - "website/**" + +jobs: + validate: + name: Validate + if: github.repository_owner == 'explosion' + runs-on: ubuntu-latest + steps: + - name: Check out repo + uses: actions/checkout@v3 + + - name: Configure Python version + uses: actions/setup-python@v4 + with: + python-version: "3.7" + architecture: x64 + + - name: black + run: | + python -m pip install black -c requirements.txt + python -m black spacy --check + - name: flake8 + run: | + python -m pip install flake8==5.0.4 + python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics + tests: + name: Test + needs: Validate + strategy: + fail-fast: true + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python_version: ["3.11"] + include: + - os: ubuntu-20.04 + python_version: "3.6" + - os: windows-latest + python_version: "3.7" + - os: macos-latest + python_version: "3.8" + - os: ubuntu-latest + python_version: "3.9" + - os: windows-latest + python_version: "3.10" + + runs-on: ${{ matrix.os }} + + steps: + - name: Check out repo + uses: actions/checkout@v3 + + - name: Configure Python version + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python_version }} + architecture: x64 + + - name: Install dependencies + run: | + python -m pip install -U build pip setuptools + python -m pip install -U -r requirements.txt + + - name: Build sdist + run: | + python -m build --sdist + + - name: Run mypy + run: | + python -m mypy spacy + if: matrix.python_version != '3.6' + + - name: Delete source directory and .egg-info + run: | + rm -rf spacy *.egg-info + shell: bash + + - name: Uninstall all packages + run: | + python -m pip freeze + python -m pip freeze --exclude pywin32 > installed.txt + python -m pip uninstall -y -r installed.txt + + - name: Install from sdist + run: | + SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) + SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST + shell: bash + + - name: Test import + run: python -W error -c "import spacy" + + - name: "Test download CLI" + run: | + python -m spacy download ca_core_news_sm + python -m spacy download ca_core_news_md + python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" + if: matrix.python_version == '3.9' + + - name: "Test download_url in info CLI" + run: | + python -W error -m spacy info ca_core_news_sm | grep -q download_url + if: matrix.python_version == '3.9' + + - name: "Test no warnings on load (#11713)" + run: | + python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" + if: matrix.python_version == '3.9' + + - name: "Test convert CLI" + run: | + python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . + if: matrix.python_version == '3.9' + + - name: "Test debug config CLI" + run: | + python -m spacy init config -p ner -l ca ner.cfg + python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy + if: matrix.python_version == '3.9' + + - name: "Test debug data CLI" + run: | + # will have errors due to sparse data, check for summary in output + python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary + if: matrix.python_version == '3.9' + + - name: "Test train CLI" + run: | + python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 + if: matrix.python_version == '3.9' + + - name: "Test assemble CLI" + run: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" + PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + if: matrix.python_version == '3.9' + + - name: "Test assemble CLI vectors warning" + run: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" + python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 + if: matrix.python_version == '3.9' + + - name: "Install test requirements" + run: | + python -m pip install -U -r requirements.txt + + - name: "Run CPU tests" + run: | + python -m pytest --pyargs spacy -W error + + - name: "Run CPU tests with thinc-apple-ops" + run: | + python -m pip install 'spacy[apple]' + python -m pytest --pyargs spacy + if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11' diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml new file mode 100644 index 000000000..a1e3253a9 --- /dev/null +++ b/.github/workflows/universe_validation.yml @@ -0,0 +1,33 @@ +name: universe validation + +on: + push: + branches-ignore: + - "spacy.io" + - "nightly.spacy.io" + - "v2.spacy.io" + paths: + - "website/meta/universe.json" + pull_request: + types: [opened, synchronize, reopened, edited] + paths: + - "website/meta/universe.json" + +jobs: + validate: + name: Validate + if: github.repository_owner == 'explosion' + runs-on: ubuntu-latest + steps: + - name: Check out repo + uses: actions/checkout@v3 + + - name: Configure Python version + uses: actions/setup-python@v4 + with: + python-version: "3.7" + architecture: x64 + + - name: Validate website/meta/universe.json + run: | + python .github/validate_universe_json.py website/meta/universe.json diff --git a/README.md b/README.md index bf8083e0e..3216a8890 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE). +💥 **We'd love to hear more about your experience with spaCy!** +[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f) + 💫 **Version 3.5 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9b7ebbe01..9ac5fcf1b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -48,6 +48,9 @@ jobs: pip install flake8==5.0.4 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics displayName: "flake8" + - script: | + python .github/validate_universe_json.py website/meta/universe.json + displayName: 'Validate website/meta/universe.json' - job: "Test" dependsOn: "Validate" diff --git a/pyproject.toml b/pyproject.toml index 837cf1fd8..9cd96ac2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=9.0.0.dev2,<9.1.0", + "thinc>=8.1.8,<8.2.0", "numpy>=1.15.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 6f4b61918..30a87c4cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev0,<4.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=9.0.0.dev2,<9.1.0 +thinc>=8.1.8,<8.2.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.2.0 diff --git a/setup.cfg b/setup.cfg index 975ec03ce..859df09a9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -29,7 +29,15 @@ project_urls = [options] zip_safe = false include_package_data = true -python_requires = >=3.8 +python_requires = >=3.6 +setup_requires = + cython>=0.25,<3.0 + numpy>=1.15.0 + # We also need our Cython packages here to compile against + cymem>=2.0.2,<2.1.0 + preshed>=3.0.2,<3.1.0 + murmurhash>=0.28.0,<1.1.0 + thinc>=8.1.8,<8.2.0 install_requires = # Our libraries spacy-legacy>=4.0.0.dev0,<4.1.0 @@ -37,7 +45,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=9.0.0.dev2,<9.1.0 + thinc>=8.1.8,<8.2.0 wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 1c242cec8..61d563cb9 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -7,6 +7,7 @@ import srsly from wasabi import Printer, MESSAGES, msg import typer import math +import numpy from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides from ._util import import_code, debug_cli, _format_number @@ -520,9 +521,13 @@ def debug_data( if "tagger" in factory_names: msg.divider("Part-of-speech Tagging") - label_list = [label for label in gold_train_data["tags"]] - model_labels = _get_labels_from_model(nlp, "tagger") + label_list, counts = zip(*gold_train_data["tags"].items()) msg.info(f"{len(label_list)} label(s) in train data") + p = numpy.array(counts) + p = p / p.sum() + norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list)) + msg.info(f"{norm_entropy} is the normalised label entropy") + model_labels = _get_labels_from_model(nlp, "tagger") labels = set(label_list) missing_labels = model_labels - labels if missing_labels: diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index efa664832..6d591053d 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -35,7 +35,7 @@ def find_threshold_cli( code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"), gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"), - verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"), + verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), # fmt: on ): """ diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 974bc0f4e..d82bf3fbc 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,6 +1,5 @@ from typing import Optional, Dict, Any, Union, List import platform -import pkg_resources import json from pathlib import Path from wasabi import Printer, MarkdownRenderer @@ -10,6 +9,7 @@ from ._util import app, Arg, Opt, string_to_list from .download import get_model_filename, get_latest_version from .. import util from .. import about +from ..compat import importlib_metadata @app.command("info") @@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]: dist-info available. """ try: - dist = pkg_resources.get_distribution(model) - data = json.loads(dist.get_metadata("direct_url.json")) - return data["url"] - except pkg_resources.DistributionNotFound: - # no such package - return None + dist = importlib_metadata.distribution(model) + text = dist.read_text("direct_url.json") + if isinstance(text, str): + data = json.loads(text) + return data["url"] except Exception: - # something else, like no file or invalid JSON - return None + pass + return None def info_model_url(model: str) -> Dict[str, Any]: diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 6dd174902..0f4858a99 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple import os.path from pathlib import Path -import pkg_resources from wasabi import msg from wasabi.util import locale_escape import sys @@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts exist. """ + import pkg_resources failed_pkgs_msgs: List[str] = [] conflicting_pkgs_msgs: List[str] = [] diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index eb48d1de5..09df6a05b 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and can help generate the best possible configuration, given a user's requirements. #} {%- set use_transformer = hardware != "cpu" and transformer_data -%} {%- set transformer = transformer_data[optimize] if use_transformer else {} -%} -{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%} +{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%} [paths] train = null dev = null @@ -24,8 +24,11 @@ gpu_allocator = null lang = "{{ lang }}" {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%} {%- set with_accuracy = optimize == "accuracy" -%} -{%- set has_accurate_textcat = has_textcat and with_accuracy -%} -{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%} +{# The BOW textcat doesn't need a source of features, so it can omit the +tok2vec/transformer. #} +{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%} +{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%} +{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%} {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%} {%- else -%} {%- set full_pipeline = components -%} @@ -154,6 +157,36 @@ grad_factor = 1.0 sizes = [1,2,3] {% endif -%} +{% if "spancat_singlelabel" in components %} +[components.spancat_singlelabel] +factory = "spancat_singlelabel" +negative_weight = 1.0 +allow_overlap = true +scorer = {"@scorers":"spacy.spancat_scorer.v1"} +spans_key = "sc" + +[components.spancat_singlelabel.model] +@architectures = "spacy.SpanCategorizer.v1" + +[components.spancat_singlelabel.model.reducer] +@layers = "spacy.mean_max_reducer.v1" +hidden_size = 128 + +[components.spancat_singlelabel.model.scorer] +@layers = "Softmax.v2" + +[components.spancat_singlelabel.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.spancat_singlelabel.model.tok2vec.pooling] +@layers = "reduce_mean.v1" + +[components.spancat_singlelabel.suggester] +@misc = "spacy.ngram_suggester.v1" +sizes = [1,2,3] +{% endif %} + {% if "trainable_lemmatizer" in components -%} [components.trainable_lemmatizer] factory = "trainable_lemmatizer" @@ -219,10 +252,16 @@ no_output_layer = false {% else -%} [components.textcat.model] -@architectures = "spacy.TextCatBOW.v2" +@architectures = "spacy.TextCatCNN.v2" exclusive_classes = true -ngram_size = 1 -no_output_layer = false +nO = null + +[components.textcat.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.textcat.model.tok2vec.pooling] +@layers = "reduce_mean.v1" {%- endif %} {%- endif %} @@ -250,10 +289,16 @@ no_output_layer = false {% else -%} [components.textcat_multilabel.model] -@architectures = "spacy.TextCatBOW.v2" +@architectures = "spacy.TextCatCNN.v2" exclusive_classes = false -ngram_size = 1 -no_output_layer = false +nO = null + +[components.textcat_multilabel.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.textcat_multilabel.model.tok2vec.pooling] +@layers = "reduce_mean.v1" {%- endif %} {%- endif %} @@ -284,6 +329,7 @@ maxout_pieces = 3 {% if "morphologizer" in components %} [components.morphologizer] factory = "morphologizer" +label_smoothing = 0.05 [components.morphologizer.model] @architectures = "spacy.Tagger.v2" @@ -297,6 +343,7 @@ width = ${components.tok2vec.model.encode.width} {% if "tagger" in components %} [components.tagger] factory = "tagger" +label_smoothing = 0.05 [components.tagger.model] @architectures = "spacy.Tagger.v2" @@ -370,6 +417,33 @@ width = ${components.tok2vec.model.encode.width} sizes = [1,2,3] {% endif %} +{% if "spancat_singlelabel" in components %} +[components.spancat_singlelabel] +factory = "spancat_singlelabel" +negative_weight = 1.0 +allow_overlap = true +scorer = {"@scorers":"spacy.spancat_scorer.v1"} +spans_key = "sc" + +[components.spancat_singlelabel.model] +@architectures = "spacy.SpanCategorizer.v1" + +[components.spancat_singlelabel.model.reducer] +@layers = "spacy.mean_max_reducer.v1" +hidden_size = 128 + +[components.spancat_singlelabel.model.scorer] +@layers = "Softmax.v2" + +[components.spancat_singlelabel.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} + +[components.spancat_singlelabel.suggester] +@misc = "spacy.ngram_suggester.v1" +sizes = [1,2,3] +{% endif %} + {% if "trainable_lemmatizer" in components -%} [components.trainable_lemmatizer] factory = "trainable_lemmatizer" diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index ea6bba2c9..f42dad0c9 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -125,13 +125,17 @@ def app(environ, start_response): return [res] -def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: +def parse_deps( + orig_doc: Union[Doc, Span], options: Dict[str, Any] = {} +) -> Dict[str, Any]: """Generate dependency parse in {'words': [], 'arcs': []} format. - orig_doc (Doc): Document to parse. + orig_doc (Union[Doc, Span]): Document to parse. options (Dict[str, Any]): Dependency parse specific visualisation options. RETURNS (dict): Generated dependency parse keyed by words and arcs. """ + if isinstance(orig_doc, Span): + orig_doc = orig_doc.as_doc() doc = Doc(orig_doc.vocab).from_bytes( orig_doc.to_bytes(exclude=["user_data", "user_hooks"]) ) diff --git a/spacy/errors.py b/spacy/errors.py index 5049100d8..26774aac3 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -542,6 +542,8 @@ class Errors(metaclass=ErrorsWithCodes): "during training, make sure to include it in 'annotating components'") # New errors added in v3.x + E850 = ("The PretrainVectors objective currently only supports default or " + "floret vectors, not {mode} vectors.") E851 = ("The 'textcat' component labels should only have values of 0 or 1, " "but found value of '{val}'.") E852 = ("The tar file pulled from the remote attempted an unsafe path " @@ -951,6 +953,7 @@ class Errors(metaclass=ErrorsWithCodes): "with `displacy.serve(doc, port=port)`") E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " "or use `auto_select_port=True` to pick an available port automatically.") + E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.") # v4 error strings E4000 = ("Expected a Doc as input, but got: '{type}'") diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py index fd0c8c832..b99ce96ec 100644 --- a/spacy/lang/sr/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -1,11 +1,14 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ...language import Language, BaseDefaults class SerbianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/sr/punctuation.py b/spacy/lang/sr/punctuation.py new file mode 100644 index 000000000..793a20ec2 --- /dev/null +++ b/spacy/lang/sr/punctuation.py @@ -0,0 +1,36 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES +from ..char_classes import CURRENCY, UNITS, PUNCT +from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER + + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + +_suffixes = ( + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_ICONS + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[{a}{e}{p}(?:{q})])\.".format( + a=ALPHA, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT + ), + ] +) + +TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi index 4942b18aa..a83be974e 100644 --- a/spacy/lexeme.pyi +++ b/spacy/lexeme.pyi @@ -24,7 +24,8 @@ class Lexeme: def orth_(self) -> str: ... @property def text(self) -> str: ... - lower: str + orth: int + lower: int norm: int shape: int prefix: int diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index e57098f17..3d94542cf 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -186,7 +186,7 @@ cdef class Lexeme: return self.orth_ property lower: - """RETURNS (str): Lowercase form of the lexeme.""" + """RETURNS (uint64): Lowercase form of the lexeme.""" def __get__(self): return self.c.lower diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index a7d67c6dd..7eb13b608 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -1,5 +1,5 @@ from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast -from thinc.types import Floats2d +from thinc.types import Floats2d, Ints1d from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model from thinc.api import MultiSoftmax, list2array from thinc.api import to_categorical, CosineDistance, L2Distance @@ -7,7 +7,8 @@ from thinc.loss import Loss from ...util import registry, OOV_RANK from ...errors import Errors -from ...attrs import ID +from ...attrs import ID, ORTH +from ...vectors import Mode as VectorsMode import numpy from functools import partial @@ -67,14 +68,23 @@ def get_vectors_loss(ops, docs, prediction, distance): """Compute a loss based on a distance between the documents' vectors and the prediction. """ - # The simplest way to implement this would be to vstack the - # token.vector values, but that's a bit inefficient, especially on GPU. - # Instead we fetch the index into the vectors table for each of our tokens, - # and look them up all at once. This prevents data copying. - ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) - target = docs[0].vocab.vectors.data[ids] - target[ids == OOV_RANK] = 0 - d_target, loss = distance(prediction, target) + vocab = docs[0].vocab + if vocab.vectors.mode == VectorsMode.default: + # The simplest way to implement this would be to vstack the + # token.vector values, but that's a bit inefficient, especially on GPU. + # Instead we fetch the index into the vectors table for each of our + # tokens, and look them up all at once. This prevents data copying. + ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) + target = docs[0].vocab.vectors.data[ids] + target[ids == OOV_RANK] = 0 + d_target, loss = distance(prediction, target) + elif vocab.vectors.mode == VectorsMode.floret: + keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs]) + target = vocab.vectors.get_batch(keys) + target = ops.as_contig(target) + d_target, loss = distance(prediction, target) + else: + raise ValueError(Errors.E850.format(mode=vocab.vectors.mode)) return loss, d_target diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index cd13a4b21..c3e4e8957 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -502,18 +502,24 @@ class EntityLinker(TrainablePipe): # Looping through each entity in batch (TODO: rewrite) for j, ent in enumerate(ent_batch): - sent_index = sentences.index(ent.sent) - assert sent_index >= 0 + assert hasattr(ent, "sents") + sents = list(ent.sents) + sent_indices = ( + sentences.index(sents[0]), + sentences.index(sents[-1]), + ) + assert sent_indices[1] >= sent_indices[0] >= 0 if self.incl_context: # get n_neighbour sentences, clipped to the length of the document - start_sentence = max(0, sent_index - self.n_sents) + start_sentence = max(0, sent_indices[0] - self.n_sents) end_sentence = min( - len(sentences) - 1, sent_index + self.n_sents + len(sentences) - 1, sent_indices[1] + self.n_sents ) start_token = sentences[start_sentence].start end_token = sentences[end_sentence].end sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) sentence_encoding = self.model.predict([sent_doc])[0] sentence_encoding_t = sentence_encoding.T diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index fabc51fee..3146c7907 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -50,13 +50,8 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "morphologizer", assigns=["token.morph", "token.pos"], - default_config={ - "model": DEFAULT_MORPH_MODEL, - "overwrite": True, - "extend": False, - "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, - "save_activations": False, - }, + default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, + "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0}, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( @@ -65,11 +60,11 @@ def make_morphologizer( name: str, overwrite: bool, extend: bool, + label_smoothing: float, scorer: Optional[Callable], save_activations: bool, ): - return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer, - save_activations=save_activations) + return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer) def morphologizer_score(examples, **kwargs): @@ -98,8 +93,9 @@ class Morphologizer(Tagger): model: Model, name: str = "morphologizer", *, - overwrite: bool = False, - extend: bool = False, + overwrite: bool = BACKWARD_OVERWRITE, + extend: bool = BACKWARD_EXTEND, + label_smoothing: float = 0.0, scorer: Optional[Callable] = morphologizer_score, save_activations: bool = False, ): @@ -131,6 +127,7 @@ class Morphologizer(Tagger): "labels_pos": {}, "overwrite": overwrite, "extend": extend, + "label_smoothing": label_smoothing, } self.cfg = dict(sorted(cfg.items())) self.scorer = scorer @@ -289,7 +286,8 @@ class Morphologizer(Tagger): DOCS: https://spacy.io/api/morphologizer#get_loss """ validate_examples(examples, "Morphologizer.get_loss") - loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False) + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, + label_smoothing=self.cfg["label_smoothing"]) truths = [] for eg in examples: eg_truths = [] diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 33e1c87dc..cca1a0bee 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,5 +1,5 @@ -from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast -from typing import Union, Protocol, runtime_checkable +from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union +from dataclasses import dataclass from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops from thinc.api import Optimizer from thinc.types import Ragged, Ints2d, Floats2d @@ -46,7 +46,36 @@ maxout_pieces = 3 depth = 4 """ +spancat_singlelabel_default_config = """ +[model] +@architectures = "spacy.SpanCategorizer.v1" +scorer = {"@layers": "Softmax.v2"} + +[model.reducer] +@layers = spacy.mean_max_reducer.v1 +hidden_size = 128 + +[model.tok2vec] +@architectures = "spacy.Tok2Vec.v2" +[model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = 96 +rows = [5000, 1000, 2500, 1000] +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] +include_static_vectors = false + +[model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = ${model.tok2vec.embed.width} +window_size = 1 +maxout_pieces = 3 +depth = 4 +""" + DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"] +DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str( + spancat_singlelabel_default_config +)["model"] @runtime_checkable @@ -124,10 +153,14 @@ def make_spancat( max_positive: Optional[int], save_activations: bool, ) -> "SpanCategorizer": - """Create a SpanCategorizer component. The span categorizer consists of two + """Create a SpanCategorizer component and configure it for multi-label + classification to be able to assign multiple labels for each span. + The span categorizer consists of two parts: a suggester function that proposes candidate spans, and a labeller model that predicts one or more labels for each span. + name (str): The component instance name, used to add entries to the + losses during training. suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans. Spans are returned as a ragged array with two integer columns, for the start and end positions. @@ -150,12 +183,80 @@ def make_spancat( """ return SpanCategorizer( nlp.vocab, - suggester=suggester, model=model, - spans_key=spans_key, - threshold=threshold, - max_positive=max_positive, + suggester=suggester, name=name, + spans_key=spans_key, + negative_weight=None, + allow_overlap=True, + max_positive=max_positive, + threshold=threshold, + scorer=scorer, + add_negative_label=False, + ) + + +@Language.factory( + "spancat_singlelabel", + assigns=["doc.spans"], + default_config={ + "spans_key": "sc", + "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL, + "negative_weight": 1.0, + "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, + "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, + "allow_overlap": True, + }, + default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, +) +def make_spancat_singlelabel( + nlp: Language, + name: str, + suggester: Suggester, + model: Model[Tuple[List[Doc], Ragged], Floats2d], + spans_key: str, + negative_weight: float, + allow_overlap: bool, + scorer: Optional[Callable], +) -> "SpanCategorizer": + """Create a SpanCategorizer component and configure it for multi-class + classification. With this configuration each span can get at most one + label. The span categorizer consists of two + parts: a suggester function that proposes candidate spans, and a labeller + model that predicts one or more labels for each span. + + name (str): The component instance name, used to add entries to the + losses during training. + suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans. + Spans are returned as a ragged array with two integer columns, for the + start and end positions. + model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that + is given a list of documents and (start, end) indices representing + candidate span offsets. The model predicts a probability for each category + for each span. + spans_key (str): Key of the doc.spans dict to save the spans under. During + initialization and training, the component will look for spans on the + reference document under the same key. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_spans for the Doc.spans[spans_key] with overlapping + spans allowed. + negative_weight (float): Multiplier for the loss terms. + Can be used to downweight the negative samples if there are too many. + allow_overlap (bool): If True the data is assumed to contain overlapping spans. + Otherwise it produces non-overlapping spans greedily prioritizing + higher assigned label scores. + """ + return SpanCategorizer( + nlp.vocab, + model=model, + suggester=suggester, + name=name, + spans_key=spans_key, + negative_weight=negative_weight, + allow_overlap=allow_overlap, + max_positive=1, + add_negative_label=True, + threshold=None, scorer=scorer, save_activations=save_activations, ) @@ -179,6 +280,27 @@ def make_spancat_scorer(): return spancat_score +@dataclass +class _Intervals: + """ + Helper class to avoid storing overlapping spans. + """ + + def __init__(self): + self.ranges = set() + + def add(self, i, j): + for e in range(i, j): + self.ranges.add(e) + + def __contains__(self, rang): + i, j = rang + for e in range(i, j): + if e in self.ranges: + return True + return False + + class SpanCategorizer(TrainablePipe): """Pipeline component to label spans of text. @@ -192,26 +314,44 @@ class SpanCategorizer(TrainablePipe): suggester: Suggester, name: str = "spancat", *, + add_negative_label: bool = False, spans_key: str = "spans", - threshold: float = 0.5, + negative_weight: Optional[float] = 1.0, + allow_overlap: Optional[bool] = True, max_positive: Optional[int] = None, + threshold: Optional[float] = 0.5, scorer: Optional[Callable] = spancat_score, save_activations: bool = False, ) -> None: - """Initialize the span categorizer. + """Initialize the multi-label or multi-class span categorizer. + vocab (Vocab): The shared vocabulary. model (thinc.api.Model): The Thinc Model powering the pipeline component. + For multi-class classification (single label per span) we recommend + using a Softmax classifier as a the final layer, while for multi-label + classification (multiple possible labels per span) we recommend Logistic. + suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans. + Spans are returned as a ragged array with two integer columns, for the + start and end positions. name (str): The component instance name, used to add entries to the losses during training. spans_key (str): Key of the Doc.spans dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. - threshold (float): Minimum probability to consider a prediction - positive. Spans with a positive prediction will be saved on the Doc. - Defaults to 0.5. + add_negative_label (bool): Learn to predict a special 'negative_label' + when a Span is not annotated. + threshold (Optional[float]): Minimum probability to consider a prediction + positive. Defaults to 0.5. Spans with a positive prediction will be saved + on the Doc. max_positive (Optional[int]): Maximum number of labels to consider positive per span. Defaults to None, indicating no limit. + negative_weight (float): Multiplier for the loss terms. + Can be used to downweight the negative samples if there are too many + when add_negative_label is True. Otherwise its unused. + allow_overlap (bool): If True the data is assumed to contain overlapping spans. + Otherwise it produces non-overlapping spans greedily prioritizing + higher assigned label scores. Only used when max_positive is 1. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_spans for the Doc.spans[spans_key] with overlapping spans allowed. @@ -223,13 +363,17 @@ class SpanCategorizer(TrainablePipe): "spans_key": spans_key, "threshold": threshold, "max_positive": max_positive, + "negative_weight": negative_weight, + "allow_overlap": allow_overlap, } self.vocab = vocab self.suggester = suggester self.model = model self.name = name self.scorer = scorer - self.save_activations = save_activations + self.add_negative_label = add_negative_label + if not allow_overlap and max_positive is not None and max_positive > 1: + raise ValueError(Errors.E1051.format(max_positive=max_positive)) @property def key(self) -> str: @@ -239,6 +383,21 @@ class SpanCategorizer(TrainablePipe): """ return str(self.cfg["spans_key"]) + def _allow_extra_label(self) -> None: + """Raise an error if the component can not add any more labels.""" + nO = None + if self.model.has_dim("nO"): + nO = self.model.get_dim("nO") + elif self.model.has_ref("output_layer") and self.model.get_ref( + "output_layer" + ).has_dim("nO"): + nO = self.model.get_ref("output_layer").get_dim("nO") + if nO is not None and nO == self._n_labels: + if not self.is_resizable: + raise ValueError( + Errors.E922.format(name=self.name, nO=self.model.get_dim("nO")) + ) + def add_label(self, label: str) -> int: """Add a new label to the pipe. @@ -272,7 +431,28 @@ class SpanCategorizer(TrainablePipe): """ return list(self.labels) - def predict(self, docs: Iterable[Doc]) -> ActivationsT: + @property + def _label_map(self) -> Dict[str, int]: + """RETURNS (Dict[str, int]): The label map.""" + return {label: i for i, label in enumerate(self.labels)} + + @property + def _n_labels(self) -> int: + """RETURNS (int): Number of labels.""" + if self.add_negative_label: + return len(self.labels) + 1 + else: + return len(self.labels) + + @property + def _negative_label_i(self) -> Union[int, None]: + """RETURNS (Union[int, None]): Index of the negative label.""" + if self.add_negative_label: + return len(self.label_data) + else: + return None + + def predict(self, docs: Iterable[Doc]): """Apply the pipeline's model to a batch of docs, without modifying them. docs (Iterable[Doc]): The documents to predict. @@ -313,24 +493,24 @@ class SpanCategorizer(TrainablePipe): DOCS: https://spacy.io/api/spancategorizer#set_annotations """ - labels = self.labels - - indices = activations["indices"] - assert isinstance(indices, Ragged) - scores = cast(Floats2d, activations["scores"]) - + indices, scores = indices_scores offset = 0 for i, doc in enumerate(docs): indices_i = indices[i].dataXd - if self.save_activations: - doc.activations[self.name] = {} - doc.activations[self.name]["indices"] = indices_i - doc.activations[self.name]["scores"] = scores[ - offset : offset + indices.lengths[i] - ] - doc.spans[self.key] = self._make_span_group( - doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type] - ) + allow_overlap = cast(bool, self.cfg["allow_overlap"]) + if self.cfg["max_positive"] == 1: + doc.spans[self.key] = self._make_span_group_singlelabel( + doc, + indices_i, + scores[offset : offset + indices.lengths[i]], + allow_overlap, + ) + else: + doc.spans[self.key] = self._make_span_group_multilabel( + doc, + indices_i, + scores[offset : offset + indices.lengths[i]], + ) offset += indices.lengths[i] def update( @@ -390,9 +570,11 @@ class SpanCategorizer(TrainablePipe): spans = Ragged( self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths) ) - label_map = {label: i for i, label in enumerate(self.labels)} target = numpy.zeros(scores.shape, dtype=scores.dtype) + if self.add_negative_label: + negative_spans = numpy.ones((scores.shape[0])) offset = 0 + label_map = self._label_map for i, eg in enumerate(examples): # Map (start, end) offset of spans to the row in the d_scores array, # so that we can adjust the gradient for predictions that were @@ -409,10 +591,16 @@ class SpanCategorizer(TrainablePipe): row = spans_index[key] k = label_map[gold_span.label_] target[row, k] = 1.0 + if self.add_negative_label: + # delete negative label target. + negative_spans[row] = 0.0 # The target is a flat array for all docs. Track the position # we're at within the flat array. offset += spans.lengths[i] target = self.model.ops.asarray(target, dtype="f") # type: ignore + if self.add_negative_label: + negative_samples = numpy.nonzero(negative_spans)[0] + target[negative_samples, self._negative_label_i] = 1.0 # type: ignore # The target will have the values 0 (for untrue predictions) or 1 # (for true predictions). # The scores should be in the range [0, 1]. @@ -421,6 +609,10 @@ class SpanCategorizer(TrainablePipe): # If the prediction is 0.9 and it's false, the gradient will be # 0.9 (0.9 - 0.0) d_scores = scores - target + if self.add_negative_label: + neg_weight = cast(float, self.cfg["negative_weight"]) + if neg_weight != 1.0: + d_scores[negative_samples] *= neg_weight loss = float((d_scores**2).sum()) return loss, d_scores @@ -457,7 +649,7 @@ class SpanCategorizer(TrainablePipe): if subbatch: docs = [eg.x for eg in subbatch] spans = build_ngram_suggester(sizes=[1])(docs) - Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels)) + Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels) self.model.initialize(X=(docs, spans), Y=Y) else: self.model.initialize() @@ -471,31 +663,98 @@ class SpanCategorizer(TrainablePipe): eg.reference.spans.get(self.key, []), allow_overlap=True ) - def _make_span_group( - self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str] + def _make_span_group_multilabel( + self, + doc: Doc, + indices: Ints2d, + scores: Floats2d, ) -> SpanGroup: + """Find the top-k labels for each span (k=max_positive).""" spans = SpanGroup(doc, name=self.key) - max_positive = self.cfg["max_positive"] + if scores.size == 0: + return spans + scores = self.model.ops.to_numpy(scores) + indices = self.model.ops.to_numpy(indices) threshold = self.cfg["threshold"] + max_positive = self.cfg["max_positive"] keeps = scores >= threshold - ranked = (scores * -1).argsort() # type: ignore if max_positive is not None: assert isinstance(max_positive, int) + if self.add_negative_label: + negative_scores = numpy.copy(scores[:, self._negative_label_i]) + scores[:, self._negative_label_i] = -numpy.inf + ranked = (scores * -1).argsort() # type: ignore + scores[:, self._negative_label_i] = negative_scores + else: + ranked = (scores * -1).argsort() # type: ignore span_filter = ranked[:, max_positive:] for i, row in enumerate(span_filter): keeps[i, row] = False - spans.attrs["scores"] = scores[keeps].flatten() - - indices = self.model.ops.to_numpy(indices) - keeps = self.model.ops.to_numpy(keeps) + attrs_scores = [] for i in range(indices.shape[0]): start = indices[i, 0] end = indices[i, 1] - for j, keep in enumerate(keeps[i]): if keep: - spans.append(Span(doc, start, end, label=labels[j])) - + if j != self._negative_label_i: + spans.append(Span(doc, start, end, label=self.labels[j])) + attrs_scores.append(scores[i, j]) + spans.attrs["scores"] = numpy.array(attrs_scores) + return spans + + def _make_span_group_singlelabel( + self, + doc: Doc, + indices: Ints2d, + scores: Floats2d, + allow_overlap: bool = True, + ) -> SpanGroup: + """Find the argmax label for each span.""" + # Handle cases when there are zero suggestions + if scores.size == 0: + return SpanGroup(doc, name=self.key) + scores = self.model.ops.to_numpy(scores) + indices = self.model.ops.to_numpy(indices) + predicted = scores.argmax(axis=1) + argmax_scores = numpy.take_along_axis( + scores, numpy.expand_dims(predicted, 1), axis=1 + ) + keeps = numpy.ones(predicted.shape, dtype=bool) + # Remove samples where the negative label is the argmax. + if self.add_negative_label: + keeps = numpy.logical_and(keeps, predicted != self._negative_label_i) + # Filter samples according to threshold. + threshold = self.cfg["threshold"] + if threshold is not None: + keeps = numpy.logical_and(keeps, (argmax_scores >= threshold).squeeze()) + # Sort spans according to argmax probability + if not allow_overlap: + # Get the probabilities + sort_idx = (argmax_scores.squeeze() * -1).argsort() + argmax_scores = argmax_scores[sort_idx] + predicted = predicted[sort_idx] + indices = indices[sort_idx] + keeps = keeps[sort_idx] + seen = _Intervals() + spans = SpanGroup(doc, name=self.key) + attrs_scores = [] + for i in range(indices.shape[0]): + if not keeps[i]: + continue + + label = predicted[i] + start = indices[i, 0] + end = indices[i, 1] + + if not allow_overlap: + if (start, end) in seen: + continue + else: + seen.add(start, end) + attrs_scores.append(argmax_scores[i]) + spans.append(Span(doc, start, end, label=self.labels[label])) + + spans.attrs["scores"] = numpy.array(attrs_scores) return spans diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 101d8bcea..2dc64c30b 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -47,13 +47,7 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={ - "model": DEFAULT_TAGGER_MODEL, - "overwrite": False, - "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, - "neg_prefix": "!", - "save_activations": False, - }, + default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0}, default_score_weights={"tag_acc": 1.0}, ) def make_tagger( @@ -63,7 +57,7 @@ def make_tagger( overwrite: bool, scorer: Optional[Callable], neg_prefix: str, - save_activations: bool, + label_smoothing: float, ): """Construct a part-of-speech tagger component. @@ -72,8 +66,7 @@ def make_tagger( in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to 1). """ - return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, - save_activations=save_activations) + return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing) def tagger_score(examples, **kwargs): @@ -99,7 +92,7 @@ class Tagger(TrainablePipe): overwrite=False, scorer=tagger_score, neg_prefix="!", - save_activations: bool = False, + label_smoothing=0.0, ): """Initialize a part-of-speech tagger. @@ -118,7 +111,7 @@ class Tagger(TrainablePipe): self.model = model self.name = name self._rehearsal_model = None - cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix} + cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix, "label_smoothing": label_smoothing} self.cfg = dict(sorted(cfg.items())) self.scorer = scorer self.save_activations = save_activations @@ -294,7 +287,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#get_loss """ validate_examples(examples, "Tagger.get_loss") - loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"]) + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"], label_smoothing=self.cfg["label_smoothing"]) # Convert empty tag "" to missing value None so that both misaligned # tokens and tokens with missing annotation have the default missing # value None. diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index a99f8b561..df650eaa9 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -689,21 +689,32 @@ def test_span_group_copy(doc): assert len(doc_copy.spans["test"]) == 2 -@pytest.mark.issue(11113) -def test_span_ent_id(en_tokenizer): - doc = en_tokenizer("a b c d") - doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")] - span = doc.ents[0] - assert doc[1].ent_id_ == "ID0" +def test_for_partial_ent_sents(): + """Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences, + which this tests for. + """ + doc = Doc( + English().vocab, + words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."], + sent_starts=[1, 0, 0, 1, 0, 0], + ) + doc.set_ents([Span(doc, 1, 4, "WORK")]) + # The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be + # equal to the sentences referenced in ent.sents. + for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents): + assert doc_sent == ent_sent - # setting Span.id sets Token.ent_id - span.id_ = "ID1" - doc.ents = [span] - assert doc.ents[0].ent_id_ == "ID1" - assert doc[1].ent_id_ == "ID1" - # Span.ent_id is an alias of Span.id - span.ent_id_ = "ID2" - doc.ents = [span] - assert doc.ents[0].ent_id_ == "ID2" - assert doc[1].ent_id_ == "ID2" +def test_for_no_ent_sents(): + """Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full + sentence. + """ + doc = Doc( + English().vocab, + words=["This", "is", "a", "test.", "ENTITY"], + sent_starts=[1, 0, 0, 0, 1], + ) + doc.set_ents([Span(doc, 4, 5, "WORK")]) + sents = list(doc.ents[0].sents) + assert len(sents) == 1 + assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY" diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 62b8f9704..403c3fed0 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -9,6 +9,8 @@ from spacy.lang.en import English from spacy.lang.it import Italian from spacy.language import Language from spacy.lookups import Lookups +from spacy.pipeline import EntityRecognizer +from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.training import Example, iob_to_biluo, split_bilu_label from spacy.tokens import Doc, Span @@ -17,8 +19,6 @@ from thinc.api import fix_random_seed import logging from ..util import make_tempdir -from ...pipeline import EntityRecognizer -from ...pipeline.ner import DEFAULT_NER_MODEL TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 2f2fa397e..c964166f5 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -10,13 +10,11 @@ from spacy.lang.en import English from spacy.training import Example from spacy.tokens import Doc from spacy.vocab import Vocab -from spacy import util, registry -from thinc.api import fix_random_seed +from spacy.pipeline import DependencyParser +from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL +from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL -from ...pipeline import DependencyParser -from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL from ..util import apply_transition_sequence, make_tempdir -from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL TRAIN_DATA = [ ( diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index ed84ce674..7afb7c804 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,10 +1,10 @@ -from typing import Callable, Iterable, Dict, Any, cast +from typing import Callable, Iterable, Dict, Any, Tuple import pytest from numpy.testing import assert_equal from thinc.types import Ragged -from spacy import registry, util +from spacy import registry, util, Language from spacy.attrs import ENT_KB_ID from spacy.compat import pickle from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase @@ -108,18 +108,23 @@ def test_issue7065(): @pytest.mark.issue(7065) -def test_issue7065_b(): +@pytest.mark.parametrize("entity_in_first_sentence", [True, False]) +def test_sentence_crossing_ents(entity_in_first_sentence: bool): + """Tests if NEL crashes if entities cross sentence boundaries and the first associated sentence doesn't have an + entity. + entity_in_prior_sentence (bool): Whether to include an entity in the first sentence associated with the + sentence-crossing entity. + """ # Test that the NEL doesn't crash when an entity crosses a sentence boundary nlp = English() vector_length = 3 - nlp.add_pipe("sentencizer") text = "Mahler 's Symphony No. 8 was beautiful." - entities = [(0, 6, "PERSON"), (10, 24, "WORK")] - links = { - (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, - (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, - } - sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] + entities = [(10, 24, "WORK")] + links = {(10, 24): {"Q7304": 0.0, "Q270853": 1.0}} + if entity_in_first_sentence: + entities.append((0, 6, "PERSON")) + links[(0, 6)] = {"Q7304": 1.0, "Q270853": 0.0} + sent_starts = [1, -1, 0, 0, 0, 1, 0, 0, 0] doc = nlp(text) example = Example.from_dict( doc, {"entities": entities, "links": links, "sent_starts": sent_starts} @@ -145,31 +150,14 @@ def test_issue7065_b(): # Create the Entity Linker component and add it to the pipeline entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) + entity_linker.set_kb(create_kb) # type: ignore # train the NEL pipe optimizer = nlp.initialize(get_examples=lambda: train_examples) for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) + nlp.update(train_examples, sgd=optimizer) - # Add a custom rule-based component to mimick NER - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, - { - "label": "WORK", - "pattern": [ - {"LOWER": "symphony"}, - {"LOWER": "no"}, - {"LOWER": "."}, - {"LOWER": "8"}, - ], - }, - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - # test the trained model - this should not throw E148 - doc = nlp(text) - assert doc + # This shouldn't crash. + entity_linker.predict([example.reference]) # type: ignore def test_no_entities(): diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 5b9b17c01..fcbe62cf3 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -1,6 +1,6 @@ from typing import cast import pytest -from numpy.testing import assert_equal +from numpy.testing import assert_equal, assert_almost_equal from spacy import util from spacy.training import Example @@ -21,6 +21,8 @@ def test_label_types(): morphologizer.add_label(9) +TAGS = ["Feat=N", "Feat=V", "Feat=J"] + TRAIN_DATA = [ ( "I like green eggs", @@ -34,6 +36,29 @@ TRAIN_DATA = [ ] +def test_label_smoothing(): + nlp = Language() + morph_no_ls = nlp.add_pipe("morphologizer", "no_label_smoothing") + morph_ls = nlp.add_pipe( + "morphologizer", "label_smoothing", config=dict(label_smoothing=0.05) + ) + train_examples = [] + losses = {} + for tag in TAGS: + morph_no_ls.add_label(tag) + morph_ls.add_label(tag) + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + nlp.initialize(get_examples=lambda: train_examples) + tag_scores, bp_tag_scores = morph_ls.model.begin_update( + [eg.predicted for eg in train_examples] + ) + no_ls_grads = morph_no_ls.get_loss(train_examples, tag_scores)[1][0] + ls_grads = morph_ls.get_loss(train_examples, tag_scores)[1][0] + assert_almost_equal(ls_grads / no_ls_grads, 0.94285715) + + def test_no_label(): nlp = Language() nlp.add_pipe("morphologizer") diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index da9bffbc8..b216f9f6c 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -15,6 +15,8 @@ OPS = get_current_ops() SPAN_KEY = "labeled_spans" +SPANCAT_COMPONENTS = ["spancat", "spancat_singlelabel"] + TRAIN_DATA = [ ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}), ( @@ -41,38 +43,42 @@ def make_examples(nlp, data=TRAIN_DATA): return train_examples -def test_no_label(): +@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) +def test_no_label(name): nlp = Language() - nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + nlp.add_pipe(name, config={"spans_key": SPAN_KEY}) with pytest.raises(ValueError): nlp.initialize() -def test_no_resize(): +@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) +def test_no_resize(name): nlp = Language() - spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY}) spancat.add_label("Thing") spancat.add_label("Phrase") assert spancat.labels == ("Thing", "Phrase") nlp.initialize() - assert spancat.model.get_dim("nO") == 2 + assert spancat.model.get_dim("nO") == spancat._n_labels # this throws an error because the spancat can't be resized after initialization with pytest.raises(ValueError): spancat.add_label("Stuff") -def test_implicit_labels(): +@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) +def test_implicit_labels(name): nlp = Language() - spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY}) assert len(spancat.labels) == 0 train_examples = make_examples(nlp) nlp.initialize(get_examples=lambda: train_examples) assert spancat.labels == ("PERSON", "LOC") -def test_explicit_labels(): +@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) +def test_explicit_labels(name): nlp = Language() - spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY}) assert len(spancat.labels) == 0 spancat.add_label("PERSON") spancat.add_label("LOC") @@ -102,13 +108,13 @@ def test_doc_gc(): # XXX This fails with length 0 sometimes assert len(spangroup) > 0 with pytest.raises(RuntimeError): - span = spangroup[0] + spangroup[0] @pytest.mark.parametrize( "max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)] ) -def test_make_spangroup(max_positive, nr_results): +def test_make_spangroup_multilabel(max_positive, nr_results): fix_random_seed(0) nlp = Language() spancat = nlp.add_pipe( @@ -120,10 +126,12 @@ def test_make_spangroup(max_positive, nr_results): indices = ngram_suggester([doc])[0].dataXd assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]])) labels = ["Thing", "City", "Person", "GreatCity"] + for label in labels: + spancat.add_label(label) scores = numpy.asarray( [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f" ) - spangroup = spancat._make_span_group(doc, indices, scores, labels) + spangroup = spancat._make_span_group_multilabel(doc, indices, scores) assert len(spangroup) == nr_results # first span is always the second token "London" @@ -154,6 +162,130 @@ def test_make_spangroup(max_positive, nr_results): assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5) +@pytest.mark.parametrize( + "threshold,allow_overlap,nr_results", + [(0.05, True, 3), (0.05, False, 1), (0.5, True, 2), (0.5, False, 1)], +) +def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results): + fix_random_seed(0) + nlp = Language() + spancat = nlp.add_pipe( + "spancat", + config={ + "spans_key": SPAN_KEY, + "threshold": threshold, + "max_positive": 1, + }, + ) + doc = nlp.make_doc("Greater London") + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2]) + indices = ngram_suggester([doc])[0].dataXd + assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]])) + labels = ["Thing", "City", "Person", "GreatCity"] + for label in labels: + spancat.add_label(label) + scores = numpy.asarray( + [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f" + ) + spangroup = spancat._make_span_group_singlelabel( + doc, indices, scores, allow_overlap + ) + if threshold > 0.4: + if allow_overlap: + assert spangroup[0].text == "London" + assert spangroup[0].label_ == "City" + assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5) + assert spangroup[1].text == "Greater London" + assert spangroup[1].label_ == "GreatCity" + assert spangroup.attrs["scores"][1] == 0.9 + assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5) + else: + assert spangroup[0].text == "Greater London" + assert spangroup[0].label_ == "GreatCity" + assert spangroup.attrs["scores"][0] == 0.9 + else: + if allow_overlap: + assert spangroup[0].text == "Greater" + assert spangroup[0].label_ == "City" + assert spangroup[1].text == "London" + assert spangroup[1].label_ == "City" + assert spangroup[2].text == "Greater London" + assert spangroup[2].label_ == "GreatCity" + else: + assert spangroup[0].text == "Greater London" + + +def test_make_spangroup_negative_label(): + fix_random_seed(0) + nlp_single = Language() + nlp_multi = Language() + spancat_single = nlp_single.add_pipe( + "spancat", + config={ + "spans_key": SPAN_KEY, + "threshold": 0.1, + "max_positive": 1, + }, + ) + spancat_multi = nlp_multi.add_pipe( + "spancat", + config={ + "spans_key": SPAN_KEY, + "threshold": 0.1, + "max_positive": 2, + }, + ) + spancat_single.add_negative_label = True + spancat_multi.add_negative_label = True + doc = nlp_single.make_doc("Greater London") + labels = ["Thing", "City", "Person", "GreatCity"] + for label in labels: + spancat_multi.add_label(label) + spancat_single.add_label(label) + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2]) + indices = ngram_suggester([doc])[0].dataXd + assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]])) + scores = numpy.asarray( + [ + [0.2, 0.4, 0.3, 0.1, 0.1], + [0.1, 0.6, 0.2, 0.4, 0.9], + [0.8, 0.7, 0.3, 0.9, 0.1], + ], + dtype="f", + ) + spangroup_multi = spancat_multi._make_span_group_multilabel(doc, indices, scores) + spangroup_single = spancat_single._make_span_group_singlelabel(doc, indices, scores) + assert len(spangroup_single) == 2 + assert spangroup_single[0].text == "Greater" + assert spangroup_single[0].label_ == "City" + assert_almost_equal(0.4, spangroup_single.attrs["scores"][0], 5) + assert spangroup_single[1].text == "Greater London" + assert spangroup_single[1].label_ == "GreatCity" + assert spangroup_single.attrs["scores"][1] == 0.9 + assert_almost_equal(0.9, spangroup_single.attrs["scores"][1], 5) + + assert len(spangroup_multi) == 6 + assert spangroup_multi[0].text == "Greater" + assert spangroup_multi[0].label_ == "City" + assert_almost_equal(0.4, spangroup_multi.attrs["scores"][0], 5) + assert spangroup_multi[1].text == "Greater" + assert spangroup_multi[1].label_ == "Person" + assert_almost_equal(0.3, spangroup_multi.attrs["scores"][1], 5) + assert spangroup_multi[2].text == "London" + assert spangroup_multi[2].label_ == "City" + assert_almost_equal(0.6, spangroup_multi.attrs["scores"][2], 5) + assert spangroup_multi[3].text == "London" + assert spangroup_multi[3].label_ == "GreatCity" + assert_almost_equal(0.4, spangroup_multi.attrs["scores"][3], 5) + assert spangroup_multi[4].text == "Greater London" + assert spangroup_multi[4].label_ == "Thing" + assert spangroup_multi[4].text == "Greater London" + assert_almost_equal(0.8, spangroup_multi.attrs["scores"][4], 5) + assert spangroup_multi[5].text == "Greater London" + assert spangroup_multi[5].label_ == "GreatCity" + assert_almost_equal(0.9, spangroup_multi.attrs["scores"][5], 5) + + def test_ngram_suggester(en_tokenizer): # test different n-gram lengths for size in [1, 2, 3]: @@ -371,9 +503,9 @@ def test_overfitting_IO_overlapping(): assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"} -def test_zero_suggestions(): +@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) +def test_zero_suggestions(name): # Test with a suggester that can return 0 suggestions - @registry.misc("test_mixed_zero_suggester") def make_mixed_zero_suggester(): def mixed_zero_suggester(docs, *, ops=None): @@ -400,7 +532,7 @@ def test_zero_suggestions(): fix_random_seed(0) nlp = English() spancat = nlp.add_pipe( - "spancat", + name, config={ "suggester": {"@misc": "test_mixed_zero_suggester"}, "spans_key": SPAN_KEY, @@ -408,7 +540,7 @@ def test_zero_suggestions(): ) train_examples = make_examples(nlp) optimizer = nlp.initialize(get_examples=lambda: train_examples) - assert spancat.model.get_dim("nO") == 2 + assert spancat.model.get_dim("nO") == spancat._n_labels assert set(spancat.labels) == {"LOC", "PERSON"} nlp.update(train_examples, sgd=optimizer) @@ -424,9 +556,10 @@ def test_zero_suggestions(): list(nlp.pipe(["", "one", "three three three"])) -def test_set_candidates(): +@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) +def test_set_candidates(name): nlp = Language() - spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY}) train_examples = make_examples(nlp) nlp.initialize(get_examples=lambda: train_examples) texts = [ diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 505b41f8c..defb12365 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,6 +1,6 @@ from typing import cast import pytest -from numpy.testing import assert_equal +from numpy.testing import assert_equal, assert_almost_equal from spacy.attrs import TAG from spacy import util @@ -71,6 +71,29 @@ PARTIAL_DATA = [ ] +def test_label_smoothing(): + nlp = Language() + tagger_no_ls = nlp.add_pipe("tagger", "no_label_smoothing") + tagger_ls = nlp.add_pipe( + "tagger", "label_smoothing", config=dict(label_smoothing=0.05) + ) + train_examples = [] + losses = {} + for tag in TAGS: + tagger_no_ls.add_label(tag) + tagger_ls.add_label(tag) + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + nlp.initialize(get_examples=lambda: train_examples) + tag_scores, bp_tag_scores = tagger_ls.model.begin_update( + [eg.predicted for eg in train_examples] + ) + no_ls_grads = tagger_no_ls.get_loss(train_examples, tag_scores)[1][0] + ls_grads = tagger_ls.get_loss(train_examples, tag_scores)[1][0] + assert_almost_equal(ls_grads / no_ls_grads, 0.925) + + def test_no_label(): nlp = Language() nlp.add_pipe("tagger") diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index dc7ce46fe..1fdf059b3 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -2,7 +2,6 @@ import os import math from collections import Counter from typing import Tuple, List, Dict, Any -import pkg_resources import time from pathlib import Path @@ -29,6 +28,7 @@ from spacy.cli.debug_data import _print_span_characteristics from spacy.cli.debug_data import _get_spans_length_freq_dist from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config +from spacy.cli.init_pipeline import _init_labels from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import _is_permitted_package_name from spacy.cli.project.remote_storage import RemoteStorage @@ -47,7 +47,6 @@ from spacy.training.converters import conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config -from ..cli.init_pipeline import _init_labels from .util import make_tempdir @@ -553,7 +552,14 @@ def test_parse_cli_overrides(): @pytest.mark.parametrize("lang", ["en", "nl"]) @pytest.mark.parametrize( - "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]] + "pipeline", + [ + ["tagger", "parser", "ner"], + [], + ["ner", "textcat", "sentencizer"], + ["morphologizer", "spancat", "entity_linker"], + ["spancat_singlelabel", "textcat_multilabel"], + ], ) @pytest.mark.parametrize("optimize", ["efficiency", "accuracy"]) @pytest.mark.parametrize("pretraining", [True, False]) @@ -1126,6 +1132,7 @@ def test_cli_find_threshold(capsys): ) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") @pytest.mark.parametrize( "reqs,output", [ @@ -1158,6 +1165,8 @@ def test_cli_find_threshold(capsys): ], ) def test_project_check_requirements(reqs, output): + import pkg_resources + # excessive guard against unlikely package name try: pkg_resources.require("spacyunknowndoesnotexist12345") diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 8aaadf686..9ba4f0e5c 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -5,10 +5,18 @@ import srsly from typer.testing import CliRunner from spacy.tokens import DocBin, Doc -from spacy.cli._util import app +from spacy.cli._util import app, get_git_version from .util import make_tempdir, normalize_whitespace +def has_git(): + try: + get_git_version() + return True + except RuntimeError: + return False + + def test_convert_auto(): with make_tempdir() as d_in, make_tempdir() as d_out: for f in ["data1.iob", "data2.iob", "data3.iob"]: @@ -181,6 +189,7 @@ def test_project_run(project_dir): assert "okokok" in result.stdout +@pytest.mark.skipif(not has_git(), reason="git not installed") @pytest.mark.parametrize( "options", [ diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index f298b38e0..837a92e02 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -275,6 +275,20 @@ def test_displacy_parse_deps(en_vocab): {"start": 2, "end": 3, "label": "det", "dir": "left"}, {"start": 1, "end": 3, "label": "attr", "dir": "right"}, ] + # Test that displacy.parse_deps converts Span to Doc + deps = displacy.parse_deps(doc[:]) + assert isinstance(deps, dict) + assert deps["words"] == [ + {"lemma": None, "text": words[0], "tag": pos[0]}, + {"lemma": None, "text": words[1], "tag": pos[1]}, + {"lemma": None, "text": words[2], "tag": pos[2]}, + {"lemma": None, "text": words[3], "tag": pos[3]}, + ] + assert deps["arcs"] == [ + {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, + {"start": 2, "end": 3, "label": "det", "dir": "left"}, + {"start": 1, "end": 3, "label": "attr", "dir": "right"}, + ] def test_displacy_invalid_arcs(): diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py index 9359c8485..d1db92de5 100644 --- a/spacy/tests/training/test_pretraining.py +++ b/spacy/tests/training/test_pretraining.py @@ -2,17 +2,19 @@ from pathlib import Path import numpy as np import pytest import srsly -from spacy.vocab import Vocab -from thinc.api import Config +from thinc.api import Config, get_current_ops +from spacy import util +from spacy.lang.en import English +from spacy.training.initialize import init_nlp +from spacy.training.loop import train +from spacy.training.pretrain import pretrain +from spacy.tokens import Doc, DocBin +from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH +from spacy.ml.models.multi_task import create_pretrain_vectors +from spacy.vectors import Vectors +from spacy.vocab import Vocab from ..util import make_tempdir -from ... import util -from ...lang.en import English -from ...training.initialize import init_nlp -from ...training.loop import train -from ...training.pretrain import pretrain -from ...tokens import Doc, DocBin -from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH pretrain_string_listener = """ [nlp] @@ -346,3 +348,26 @@ def write_vectors_model(tmp_dir): nlp = English(vocab) nlp.to_disk(nlp_path) return str(nlp_path) + + +def test_pretrain_default_vectors(): + nlp = English() + nlp.add_pipe("tok2vec") + nlp.initialize() + + # default vectors are supported + nlp.vocab.vectors = Vectors(shape=(10, 10)) + create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model) + + # floret vectors are supported + nlp.vocab.vectors = Vectors( + data=get_current_ops().xp.zeros((10, 10)), mode="floret", hash_count=1 + ) + create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model) + + # error for no vectors + with pytest.raises(ValueError, match="E875"): + nlp.vocab.vectors = Vectors() + create_pretrain_vectors(1, 1, "cosine")( + nlp.vocab, nlp.get_pipe("tok2vec").model + ) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 75f7db7ca..8227cf453 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -494,10 +494,12 @@ cdef class Span: start = i if start >= self.end: break - if start < self.end: - spans.append(Span(self.doc, start, self.end)) - return tuple(spans) + elif i == self.doc.length - 1: + yield Span(self.doc, start, self.doc.length) + # Ensure that trailing parts of the Span instance are included in last element of .sents. + if start == self.doc.length - 1: + yield Span(self.doc, start, self.doc.length) @property def ents(self): diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 1a3f15e48..cbce62dad 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1253,19 +1253,19 @@ be provided. > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f > ``` -| Name | Description | -| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ | -| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ | -| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ | -| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ | -| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | -| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| Name | Description | +| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ | +| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ | +| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ | +| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ | +| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | ## assemble {id="assemble",tag="command"} diff --git a/website/docs/api/coref.mdx b/website/docs/api/coref.mdx index 8647f35d1..0b9ebb888 100644 --- a/website/docs/api/coref.mdx +++ b/website/docs/api/coref.mdx @@ -64,7 +64,7 @@ details on the architectures and their arguments and hyperparameters. > config={ > "model": DEFAULT_COREF_MODEL, > "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX, -> }, +> } > nlp.add_pipe("experimental_coref", config=config) > ``` diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx index 9514bc773..d509d30d5 100644 --- a/website/docs/api/morphologizer.mdx +++ b/website/docs/api/morphologizer.mdx @@ -42,13 +42,13 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("morphologizer", config=config) > ``` -| Setting | Description | -| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ | -| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | -| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ | +| Setting | Description | +| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | +| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | +| `label_smoothing` 3.6 | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~ | ```python %%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx index c51b32671..f54a8687b 100644 --- a/website/docs/api/spancategorizer.mdx +++ b/website/docs/api/spancategorizer.mdx @@ -13,8 +13,16 @@ A span categorizer consists of two parts: a [suggester function](#suggesters) that proposes candidate spans, which may or may not overlap, and a labeler model that predicts zero or more labels for each candidate. -Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc. -Individual span scores can be found in `spangroup.attrs["scores"]`. +This component comes in two forms: `spancat` and `spancat_singlelabel` (added in +spaCy v3.5.1). When you need to perform multi-label classification on your +spans, use `spancat`. The `spancat` component uses a `Logistic` layer where the +output class probabilities are independent for each class. However, if you need +to predict at most one true class for a span, then use `spancat_singlelabel`. It +uses a `Softmax` layer and treats the task as a multi-class problem. + +Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc +under `doc.spans[spans_key]`, where `spans_key` is a component config setting. +Individual span scores are stored in `doc.spans[spans_key].attrs["scores"]`. ## Assigned Attributes {id="assigned-attributes"} @@ -22,7 +30,9 @@ Predictions will be saved to `Doc.spans[spans_key]` as a [`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will be saved in `SpanGroup.attrs["scores"]`. -`spans_key` defaults to `"sc"`, but can be passed as a parameter. +`spans_key` defaults to `"sc"`, but can be passed as a parameter. The `spancat` +component will overwrite any existing spans under the spans key +`doc.spans[spans_key]`. | Location | Value | | -------------------------------------- | -------------------------------------------------------- | @@ -38,7 +48,7 @@ how the component should be configured. You can override its settings via the [model architectures](/api/architectures) documentation for details on the architectures and their arguments and hyperparameters. -> #### Example +> #### Example (spancat) > > ```python > from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL @@ -52,15 +62,33 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("spancat", config=config) > ``` -| Setting | Description | -| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | -| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | -| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | -| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | -| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | -| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | -| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"indices"` and `"scores"`. ~~Union[bool, list[str]]~~ | +> #### Example (spancat_singlelabel) +> +> ```python +> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL +> config = { +> "threshold": 0.5, +> "spans_key": "labeled_spans", +> "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL, +> "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, +> # Additional spancat_singlelabel parameters +> "negative_weight": 0.8, +> "allow_overlap": True, +> } +> nlp.add_pipe("spancat_singlelabel", config=config) +> ``` + +| Setting | Description | +| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | +| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | +| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | +| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-class `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~ | +| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to 0 with `spancat_singlelabel`. ~~Optional[int]~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | +| `add_negative_label` 3.5.1 | Whether to learn to predict a special negative label for each unannotated `Span` . This should be `True` when using a `Softmax` classifier layer and so its `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ | +| `negative_weight` 3.5.1 | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ | +| `allow_overlap` 3.5.1 | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ | ```python %%GITHUB_SPACY/spacy/pipeline/spancat.py @@ -72,6 +100,7 @@ architectures and their arguments and hyperparameters. > > ```python > # Construction via add_pipe with default model +> # Replace 'spancat' with 'spancat_singlelabel' for exclusive classes > spancat = nlp.add_pipe("spancat") > > # Construction via add_pipe with custom model @@ -87,16 +116,19 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | -| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `spans_key` | Key of the [`Doc.spans`](/api/doc#sans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | -| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | -| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | +| Name | Description | +| --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | +| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `spans_key` | Key of the [`Doc.spans`](/api/doc#sans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | +| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | +| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | +| `allow_overlap` 3.5.1 | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ | +| `add_negative_label` 3.5.1 | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer and so its `True` by default for `spancat_singlelabel` . Spans with negative labels and their scores are not stored as annotations. ~~bool~~ | +| `negative_weight` 3.5.1 | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many . It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ | ## SpanCategorizer.\_\_call\_\_ {id="call",tag="method"} diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx index 7e380f5f8..ea7e985e3 100644 --- a/website/docs/api/stringstore.mdx +++ b/website/docs/api/stringstore.mdx @@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of integer IDs. This ensures that strings always map to the same ID, even from different `StringStores`. + + +Note that a `StringStore` instance is not static. It increases in size as texts +with new tokens are processed. + + + ## StringStore.\_\_init\_\_ {id="init",tag="method"} Create the `StringStore`. diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx index 35e7a23b1..7548b309a 100644 --- a/website/docs/api/tagger.mdx +++ b/website/docs/api/tagger.mdx @@ -40,13 +40,13 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tagger", config=config) > ``` -| Setting | Description | -| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | -| `neg_prefix` 3.2.1 | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~ | -| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ | +| Setting | Description | +| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | +| `neg_prefix` 3.2.1 | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~ | +| `label_smoothing` 3.6 | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~ | ```python %%GITHUB_SPACY/spacy/pipeline/tagger.pyx diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index b5f561cae..aa7c51750 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -294,7 +294,7 @@ the `manual=True` argument in `displacy.render`. | Name | Description | | ----------- | ------------------------------------------------------------------- | -| `orig_doc` | Doc to parse dependencies. ~~Doc~~ | +| `orig_doc` | Doc or span to parse dependencies. ~~Union[Doc, Span]~~ | | `options` | Dependency parse specific visualisation options. ~~Dict[str, Any]~~ | | **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~ | diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index 3faf1f1a0..d03e34785 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared between `Doc` objects. + + +Note that a `Vocab` instance is not static. It increases in size as texts with +new tokens are processed. + + + ## Vocab.\_\_init\_\_ {id="init",tag="method"} Create the vocabulary. diff --git a/website/meta/universe.json b/website/meta/universe.json index e35a4f045..039d42b33 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3219,6 +3219,51 @@ "category": ["pipeline"], "tags": ["syllables", "multilingual"] }, + { + "id": "sentimental-onix", + "title": "Sentimental Onix", + "slogan": "Use onnx for sentiment models", + "description": "spaCy pipeline component for sentiment analysis using onnx", + "github": "sloev/sentimental-onix", + "pip": "sentimental-onix", + "code_example": [ + "# Download model:", + "# python -m sentimental_onix download en", + "import spacy", + "from sentimental_onix import pipeline", + "", + "nlp = spacy.load(\"en_core_web_sm\")", + "nlp.add_pipe(\"sentencizer\")", + "nlp.add_pipe(\"sentimental_onix\", after=\"sentencizer\")", + "", + "sentences = [", + " (sent.text, sent._.sentiment)", + " for doc in nlp.pipe(", + " [", + " \"i hate pasta on tuesdays\",", + " \"i like movies on wednesdays\",", + " \"i find your argument ridiculous\",", + " \"soda with straws are my favorite\",", + " ]", + " )", + " for sent in doc.sents", + "]", + "", + "assert sentences == [", + " (\"i hate pasta on tuesdays\", \"Negative\"),", + " (\"i like movies on wednesdays\", \"Positive\"),", + " (\"i find your argument ridiculous\", \"Negative\"),", + " (\"soda with straws are my favorite\", \"Positive\"),", + "]" + ], + "thumb": "https://raw.githubusercontent.com/sloev/sentimental-onix/master/.github/onix.webp", + "author": "Johannes Valbjørn", + "author_links": { + "github": "sloev" + }, + "category": ["pipeline"], + "tags": ["sentiment", "english"] + }, { "id": "gobbli", "title": "gobbli", diff --git a/website/package.json b/website/package.json index eeefe32df..5f8bae47e 100644 --- a/website/package.json +++ b/website/package.json @@ -6,6 +6,7 @@ "dev": "next dev", "build": "next build && npm run sitemap && next export", "prebuild": "pip install -r setup/requirements.txt && sh setup/setup.sh", + "predev": "npm run prebuild", "sitemap": "next-sitemap --config next-sitemap.config.mjs", "start": "next start", "lint": "next lint", diff --git a/website/src/styles/navigation.module.sass b/website/src/styles/navigation.module.sass index da5c18b6f..3adc5cd03 100644 --- a/website/src/styles/navigation.module.sass +++ b/website/src/styles/navigation.module.sass @@ -111,11 +111,12 @@ line-height: var(--line-height-xs) text-align: center -@include breakpoint(max, xs) - .list +@include breakpoint(max, md) + .alert display: none - .alert +@include breakpoint(max, xs) + .list display: none .has-alert diff --git a/website/src/templates/index.js b/website/src/templates/index.js index 227b25be8..4c10e09c5 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -57,9 +57,15 @@ const AlertSpace = ({ nightly, legacy }) => { ) } +// const navAlert = ( +// +// 💥 Out now: spaCy v3.5 +// +// ) + const navAlert = ( - - 💥 Out now: spaCy v3.5 + + 💥 Take the user survey! )