From 9da333cbfa613fa49b9fab764bdb8f96105d059e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 13 Mar 2023 13:13:47 +0100 Subject: [PATCH 01/29] Add GHA for CI tests (#12403) * Add GHA for CI tests * Reorder paths --- .github/workflows/tests.yml | 195 ++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 000000000..ad380d39a --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,195 @@ +name: tests + +on: + push: + branches-ignore: + - "spacy.io" + - "nightly.spacy.io" + - "v2.spacy.io" + paths-ignore: + - "*.md" + - "*.mdx" + - "website/docs/**" + - "website/src/**" + - "website/meta/*.tsx" + - "website/meta/*.mjs" + - "website/meta/languages.json" + - "website/meta/site.json" + - "website/meta/sidebars.json" + - "website/meta/type-annotations.json" + - "website/pages/**" + - ".github/workflows/**" + pull_request: + types: [opened, synchronize, reopened, edited] + paths: + - "!*.md" + - "!*.mdx" + - "!website/docs/**" + - "!website/src/**" + - "!website/meta/*.tsx" + - "!website/meta/*.mjs" + - "!website/meta/languages.json" + - "!website/meta/site.json" + - "!website/meta/sidebars.json" + - "!website/meta/type-annotations.json" + - "!website/pages/**" + - "!.github/workflows/**" + - ".github/workflows/tests.yml" + +jobs: + validate: + name: Validate + runs-on: ubuntu-latest + steps: + - name: Check out repo + uses: actions/checkout@v3 + + - name: Configure Python version + uses: actions/setup-python@v4 + with: + python-version: "3.7" + architecture: x64 + + - name: black + run: | + python -m pip install black -c requirements.txt + python -m black spacy --check + - name: flake8 + run: | + python -m pip install flake8==5.0.4 + python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics + tests: + name: Test + needs: Validate + strategy: + fail-fast: true + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python_version: ["3.11"] + include: + - os: ubuntu-20.04 + python_version: "3.6" + - os: windows-latest + python_version: "3.7" + - os: macos-latest + python_version: "3.8" + - os: ubuntu-latest + python_version: "3.9" + - os: windows-latest + python_version: "3.10" + + runs-on: ${{ matrix.os }} + + steps: + - name: Check out repo + uses: actions/checkout@v3 + + - name: Configure Python version + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python_version }} + architecture: x64 + + - name: Install dependencies + run: | + python -m pip install -U build pip setuptools + python -m pip install -U -r requirements.txt + + - name: Build sdist + run: | + python -m build --sdist + + - name: Run mypy + run: | + python -m mypy spacy + if: matrix.python_version != '3.6' + + - name: Delete source directory and .egg-info + run: | + rm -rf spacy *.egg-info + shell: bash + + - name: Uninstall all packages + run: | + python -m pip freeze + python -m pip freeze --exclude pywin32 > installed.txt + python -m pip uninstall -y -r installed.txt + + - name: Install from sdist + run: | + SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) + SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST + shell: bash + + - name: Test import + run: python -W error -c "import spacy" + + - name: "Test download CLI" + run: | + python -m spacy download ca_core_news_sm + python -m spacy download ca_core_news_md + python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); 
doc=nlp('test')" + if: matrix.python_version == '3.8' + + - name: "Test download_url in info CLI" + run: | + python -W error -m spacy info ca_core_news_sm | grep -q download_url + if: matrix.python_version == '3.8' + + - name: "Test no warnings on load (#11713)" + run: | + python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" + if: matrix.python_version == '3.8' + + - name: "Test convert CLI" + run: | + python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . + if: matrix.python_version == '3.8' + + - name: "Test debug config CLI" + run: | + python -m spacy init config -p ner -l ca ner.cfg + python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy + if: matrix.python_version == '3.8' + + - name: "Test debug data CLI" + run: | + # will have errors due to sparse data, check for summary in output + python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary + if: matrix.python_version == '3.8' + + - name: "Test train CLI" + run: | + python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 + if: matrix.python_version == '3.8' + + - name: "Test assemble CLI" + run: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" + PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + if: matrix.python_version == '3.8' + + - name: "Test assemble CLI vectors warning" + run: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" + python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 + if: matrix.python_version == '3.8' + + - name: "Install test requirements" + run: | + python -m pip install -U -r requirements.txt + + - name: "Run CPU tests" + run: | + python -m pytest --pyargs spacy -W error + + - name: "Run CPU tests with thinc-apple-ops" + run: | + python -m pip install 'spacy[apple]' + python -m pytest --pyargs spacy + if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.8' + + - run: | + python .github/validate_universe_json.py website/meta/universe.json + name: "Test website/meta/universe.json" + if: matrix.python_version == '3.8' From ed83cafe46d973ca42d3798348d750a1156feab9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 13 Mar 2023 14:21:17 +0100 Subject: [PATCH 02/29] CI: Move universe validation to validate job (#12406) * CI: Move universe validation to validate job * Fix indentation * Update step name --- .github/azure-steps.yml | 6 ------ .github/workflows/tests.yml | 8 +++----- azure-pipelines.yml | 3 +++ 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index b2ccf3d81..1b8d81521 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -116,9 +116,3 @@ steps: python -m pytest --pyargs spacy displayName: "Run CPU tests with thinc-apple-ops" condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) - - - script: | - python .github/validate_universe_json.py website/meta/universe.json - displayName: 'Test 
website/meta/universe.json' - condition: eq(variables['python_version'], '3.8') - diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ad380d39a..b04e2a8c0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -58,6 +58,9 @@ jobs: run: | python -m pip install flake8==5.0.4 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics + - name: Validate website/meta/universe.json + run: | + python .github/validate_universe_json.py website/meta/universe.json tests: name: Test needs: Validate @@ -188,8 +191,3 @@ jobs: python -m pip install 'spacy[apple]' python -m pytest --pyargs spacy if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.8' - - - run: | - python .github/validate_universe_json.py website/meta/universe.json - name: "Test website/meta/universe.json" - if: matrix.python_version == '3.8' diff --git a/azure-pipelines.yml b/azure-pipelines.yml index dba11bd1a..83c57a164 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -48,6 +48,9 @@ jobs: pip install flake8==5.0.4 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics displayName: "flake8" + - script: | + python .github/validate_universe_json.py website/meta/universe.json + displayName: 'Validate website/meta/universe.json' - job: "Test" dependsOn: "Validate" From 9ca67dc5394a9401fe293b60ddce23372116a270 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 13 Mar 2023 15:10:04 +0100 Subject: [PATCH 03/29] Fix thinc-apple-ops test to run for python 3.11 (#12408) --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b04e2a8c0..880c09128 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -190,4 +190,4 @@ jobs: run: | python -m pip install 'spacy[apple]' python -m pytest --pyargs spacy - if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.8' + if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11' \ No newline at end of file From d00e58d1ac7507c15d5524bb273f2b537baba1b6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 13 Mar 2023 15:14:46 +0100 Subject: [PATCH 04/29] CI: Move CLI tests to ubuntu for speed (#12409) --- .github/azure-steps.yml | 18 +++++++++--------- .github/workflows/tests.yml | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 1b8d81521..20d4582cb 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -57,51 +57,51 @@ steps: python -m spacy download ca_core_news_md python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" displayName: 'Test download CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -W error -m spacy info ca_core_news_sm | grep -q download_url displayName: 'Test download_url in info CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" displayName: 'Test no warnings on load (#11713)' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . 
displayName: 'Test convert CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -m spacy init config -p ner -l ca ner.cfg python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy displayName: 'Test debug config CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | # will have errors due to sparse data, check for summary in output python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary displayName: 'Test debug data CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 displayName: 'Test train CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir displayName: 'Test assemble CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 displayName: 'Test assemble CLI vectors warning' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -m pip install -U -r requirements.txt diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 880c09128..e51bb6c17 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -132,51 +132,51 @@ jobs: python -m spacy download ca_core_news_sm python -m spacy download ca_core_news_md python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test download_url in info CLI" run: | python -W error -m spacy info ca_core_news_sm | grep -q download_url - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test no warnings on load (#11713)" run: | python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test convert CLI" run: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . 
- if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test debug config CLI" run: | python -m spacy init config -p ner -l ca ner.cfg python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test debug data CLI" run: | # will have errors due to sparse data, check for summary in output python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test train CLI" run: | python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test assemble CLI" run: | python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test assemble CLI vectors warning" run: | python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Install test requirements" run: | From 545218a7d9763df60e300e16a489a4169242cf9c Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 14 Mar 2023 10:21:53 +0100 Subject: [PATCH 05/29] Fix sentence indexing bug in `Span.sents` (#12405) * Add test for partial sentences in ent.sents. * Removed unneeded import. * Format. Simplify code. --- spacy/tests/doc/test_span.py | 16 ++++++++++++++++ spacy/tokens/span.pyx | 5 ++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index b4631037a..adef5922f 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -700,3 +700,19 @@ def test_span_group_copy(doc): assert len(doc.spans["test"]) == 3 # check that the copy spans were not modified and this is an isolated doc assert len(doc_copy.spans["test"]) == 2 + + +def test_for_partial_ent_sents(): + """Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences, + which this tests for. + """ + doc = Doc( + English().vocab, + words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."], + sent_starts=[1, 0, 0, 1, 0, 0], + ) + doc.set_ents([Span(doc, 1, 4, "WORK")]) + # The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be + # equal to the sentences referenced in ent.sents. 
+ for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents): + assert doc_sent == ent_sent diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index cfe1236df..7750b16ed 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -460,9 +460,8 @@ cdef class Span: start = i if start >= self.end: break - if start < self.end: - yield Span(self.doc, start, self.end) - + elif i == self.doc.length - 1: + yield Span(self.doc, start, self.doc.length) @property def ents(self): From 7880da952bbe459a2fdcecd74fd899dd05da2fe3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 14 Mar 2023 16:06:08 +0100 Subject: [PATCH 06/29] CI: Add all paths before excluding patterns (#12419) --- .github/workflows/tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e51bb6c17..c18f9cd23 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -22,6 +22,7 @@ on: pull_request: types: [opened, synchronize, reopened, edited] paths: + - "**" - "!*.md" - "!*.mdx" - "!website/docs/**" @@ -190,4 +191,4 @@ jobs: run: | python -m pip install 'spacy[apple]' python -m pytest --pyargs spacy - if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11' \ No newline at end of file + if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11' From be644caa135c49a19a41431305545ac4e4decb3d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 14 Mar 2023 17:16:49 +0100 Subject: [PATCH 07/29] Fix --verbose for spacy find-threshold (#12418) --- spacy/cli/find_threshold.py | 2 +- website/docs/api/cli.mdx | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index efa664832..6d591053d 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -35,7 +35,7 @@ def find_threshold_cli( code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"), gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"), - verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"), + verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), # fmt: on ): """ diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 3f31bef95..2bb0199fc 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1254,19 +1254,19 @@ be provided. > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f > ``` -| Name | Description | -| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ | -| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ | -| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ | -| `scores_key` | Name of score to metric to optimize. 
~~str (positional)~~ | -| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | -| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| Name | Description | +| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ | +| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ | +| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ | +| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ | +| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | ## assemble {id="assemble",tag="command"} From bd0768c05c3b91b82b596eab4b46155e37944516 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 14 Mar 2023 22:02:49 +0100 Subject: [PATCH 08/29] Fix EL failure with sentence-crossing entities (#12398) * Add test reproducing EL failure in sentence-crossing entities. * Format. * Draft fix. * Format. * Fix case for len(ent.sents) == 1. * Format. * Format. * Format. * Fix mypy error. * Merge EL sentence crossing tests. * Remove unneeded sentencizer component. * Fix or ignore mypy issues in test. * Simplify ent.sents handling. * Format. Update assert in ent.sents handling. 
* Small rewrite --------- Co-authored-by: Sofie Van Landeghem --- spacy/pipeline/entity_linker.py | 14 ++++-- spacy/tests/pipeline/test_entity_linker.py | 50 ++++++++-------------- 2 files changed, 29 insertions(+), 35 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index f2dae0529..76ccc3247 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -474,18 +474,24 @@ class EntityLinker(TrainablePipe): # Looping through each entity in batch (TODO: rewrite) for j, ent in enumerate(ent_batch): - sent_index = sentences.index(ent.sent) - assert sent_index >= 0 + assert hasattr(ent, "sents") + sents = list(ent.sents) + sent_indices = ( + sentences.index(sents[0]), + sentences.index(sents[-1]), + ) + assert sent_indices[1] >= sent_indices[0] >= 0 if self.incl_context: # get n_neighbour sentences, clipped to the length of the document - start_sentence = max(0, sent_index - self.n_sents) + start_sentence = max(0, sent_indices[0] - self.n_sents) end_sentence = min( - len(sentences) - 1, sent_index + self.n_sents + len(sentences) - 1, sent_indices[1] + self.n_sents ) start_token = sentences[start_sentence].start end_token = sentences[end_sentence].end sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) sentence_encoding = self.model.predict([sent_doc])[0] sentence_encoding_t = sentence_encoding.T diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 2a6258386..fc960cb01 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,9 +1,9 @@ -from typing import Callable, Iterable, Dict, Any +from typing import Callable, Iterable, Dict, Any, Tuple import pytest from numpy.testing import assert_equal -from spacy import registry, util +from spacy import registry, util, Language from spacy.attrs import ENT_KB_ID from spacy.compat import pickle from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase @@ -108,18 +108,23 @@ def test_issue7065(): @pytest.mark.issue(7065) -def test_issue7065_b(): +@pytest.mark.parametrize("entity_in_first_sentence", [True, False]) +def test_sentence_crossing_ents(entity_in_first_sentence: bool): + """Tests if NEL crashes if entities cross sentence boundaries and the first associated sentence doesn't have an + entity. + entity_in_prior_sentence (bool): Whether to include an entity in the first sentence associated with the + sentence-crossing entity. + """ # Test that the NEL doesn't crash when an entity crosses a sentence boundary nlp = English() vector_length = 3 - nlp.add_pipe("sentencizer") text = "Mahler 's Symphony No. 8 was beautiful." 
- entities = [(0, 6, "PERSON"), (10, 24, "WORK")] - links = { - (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, - (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, - } - sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] + entities = [(10, 24, "WORK")] + links = {(10, 24): {"Q7304": 0.0, "Q270853": 1.0}} + if entity_in_first_sentence: + entities.append((0, 6, "PERSON")) + links[(0, 6)] = {"Q7304": 1.0, "Q270853": 0.0} + sent_starts = [1, -1, 0, 0, 0, 1, 0, 0, 0] doc = nlp(text) example = Example.from_dict( doc, {"entities": entities, "links": links, "sent_starts": sent_starts} @@ -145,31 +150,14 @@ def test_issue7065_b(): # Create the Entity Linker component and add it to the pipeline entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) + entity_linker.set_kb(create_kb) # type: ignore # train the NEL pipe optimizer = nlp.initialize(get_examples=lambda: train_examples) for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) + nlp.update(train_examples, sgd=optimizer) - # Add a custom rule-based component to mimick NER - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, - { - "label": "WORK", - "pattern": [ - {"LOWER": "symphony"}, - {"LOWER": "no"}, - {"LOWER": "."}, - {"LOWER": "8"}, - ], - }, - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - # test the trained model - this should not throw E148 - doc = nlp(text) - assert doc + # This shouldn't crash. + entity_linker.predict([example.reference]) # type: ignore def test_no_entities(): From 6183906a0bfc07852c33a1e1928c6491f8e4e462 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 17 Mar 2023 09:35:00 +0100 Subject: [PATCH 09/29] Remove autoblack workflow (#12437) Now that all PRs have `black` formatting validation, we no longer need the autoblack workflow. --- .github/workflows/autoblack.yml | 45 --------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 .github/workflows/autoblack.yml diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml deleted file mode 100644 index 555322782..000000000 --- a/.github/workflows/autoblack.yml +++ /dev/null @@ -1,45 +0,0 @@ -# GitHub Action that uses Black to reformat all Python code and submits a PR -# in regular intervals. 
Inspired by: https://github.com/cclauss/autoblack - -name: autoblack -on: - workflow_dispatch: # allow manual trigger - schedule: - - cron: '0 8 * * 5' # every Friday at 8am UTC - -jobs: - autoblack: - if: github.repository_owner == 'explosion' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - ref: ${{ github.head_ref }} - - uses: actions/setup-python@v4 - - run: pip install black -c requirements.txt - - name: Auto-format code if needed - run: black spacy - # We can't run black --check here because that returns a non-zero excit - # code and makes GitHub think the action failed - - name: Check for modified files - id: git-check - run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT - - - name: Create Pull Request - if: steps.git-check.outputs.modified == 'true' - uses: peter-evans/create-pull-request@v4 - with: - title: Auto-format code with black - labels: meta - commit-message: Auto-format code with black - committer: GitHub - author: explosion-bot - body: _This PR is auto-generated._ - branch: autoblack - delete-branch: true - draft: false - - name: Check outputs - if: steps.git-check.outputs.modified == 'true' - run: | - echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" - echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" From f9c0220ea567b4f4415a71deefc467f21bb0d9dd Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 17 Mar 2023 10:01:49 +0100 Subject: [PATCH 10/29] CI: Switch PR back to paths-ignore (#12438) Switch PR tests back to paths-ignore but include changes to `.github` for all PRs rather than trying to figure out complicated includes+excludes. Changes to `.github` are relatively rare and should not be a huge burden for the CI. 
--- .github/workflows/tests.yml | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c18f9cd23..eef24ff33 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,21 +21,18 @@ on: - ".github/workflows/**" pull_request: types: [opened, synchronize, reopened, edited] - paths: - - "**" - - "!*.md" - - "!*.mdx" - - "!website/docs/**" - - "!website/src/**" - - "!website/meta/*.tsx" - - "!website/meta/*.mjs" - - "!website/meta/languages.json" - - "!website/meta/site.json" - - "!website/meta/sidebars.json" - - "!website/meta/type-annotations.json" - - "!website/pages/**" - - "!.github/workflows/**" - - ".github/workflows/tests.yml" + paths-ignore: + - "*.md" + - "*.mdx" + - "website/docs/**" + - "website/src/**" + - "website/meta/*.tsx" + - "website/meta/*.mjs" + - "website/meta/languages.json" + - "website/meta/site.json" + - "website/meta/sidebars.json" + - "website/meta/type-annotations.json" + - "website/pages/**" jobs: validate: From f1a42b6fcc3b331a09140633194eacd537e2b458 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 17 Mar 2023 10:59:53 +0100 Subject: [PATCH 11/29] CI: Separate spacy universe validation into a separate workflow (#12440) * Separate spacy universe validation into a separate workflow * Fix new workflow name --- .github/workflows/tests.yml | 23 ++-------------- .github/workflows/universe_validation.yml | 32 +++++++++++++++++++++++ 2 files changed, 34 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/universe_validation.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index eef24ff33..41ea6ce50 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -9,30 +9,14 @@ on: paths-ignore: - "*.md" - "*.mdx" - - "website/docs/**" - - "website/src/**" - - "website/meta/*.tsx" - - "website/meta/*.mjs" - - "website/meta/languages.json" - - "website/meta/site.json" - - "website/meta/sidebars.json" - - "website/meta/type-annotations.json" - - "website/pages/**" + - "website/**" - ".github/workflows/**" pull_request: types: [opened, synchronize, reopened, edited] paths-ignore: - "*.md" - "*.mdx" - - "website/docs/**" - - "website/src/**" - - "website/meta/*.tsx" - - "website/meta/*.mjs" - - "website/meta/languages.json" - - "website/meta/site.json" - - "website/meta/sidebars.json" - - "website/meta/type-annotations.json" - - "website/pages/**" + - "website/**" jobs: validate: @@ -56,9 +40,6 @@ jobs: run: | python -m pip install flake8==5.0.4 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics - - name: Validate website/meta/universe.json - run: | - python .github/validate_universe_json.py website/meta/universe.json tests: name: Test needs: Validate diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml new file mode 100644 index 000000000..f9e317aaa --- /dev/null +++ b/.github/workflows/universe_validation.yml @@ -0,0 +1,32 @@ +name: universe validation + +on: + push: + branches-ignore: + - "spacy.io" + - "nightly.spacy.io" + - "v2.spacy.io" + paths: + - "website/meta/universe.json" + pull_request: + types: [opened, synchronize, reopened, edited] + paths: + - "website/meta/universe.json" + +jobs: + validate: + name: Validate + runs-on: ubuntu-latest + steps: + - name: Check out repo + uses: actions/checkout@v3 + + - name: Configure Python version + uses: actions/setup-python@v4 + with: + 
python-version: "3.7" + architecture: x64 + + - name: Validate website/meta/universe.json + run: | + python .github/validate_universe_json.py website/meta/universe.json From d2d9e9e139cca82edb07685d60c02e9c1cf728bf Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 22 Mar 2023 11:09:37 +0100 Subject: [PATCH 12/29] Add user survey alert to the top (#12452) * Add user survey alert to the top * Shorter --------- Co-authored-by: Sofie Van Landeghem --- website/src/styles/navigation.module.sass | 7 ++++--- website/src/templates/index.js | 15 ++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/website/src/styles/navigation.module.sass b/website/src/styles/navigation.module.sass index da5c18b6f..3adc5cd03 100644 --- a/website/src/styles/navigation.module.sass +++ b/website/src/styles/navigation.module.sass @@ -111,11 +111,12 @@ line-height: var(--line-height-xs) text-align: center -@include breakpoint(max, xs) - .list +@include breakpoint(max, md) + .alert display: none - .alert +@include breakpoint(max, xs) + .list display: none .has-alert diff --git a/website/src/templates/index.js b/website/src/templates/index.js index 2ee29a9e9..4c10e09c5 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -25,11 +25,6 @@ const AlertSpace = ({ nightly, legacy }) => { const isOnline = useOnlineStatus() return ( <> - {isOnline && ( - - Take our survey here. - - )} {nightly && ( { ) } +// const navAlert = ( +// +// 💥 Out now: spaCy v3.5 +// +// ) + const navAlert = ( - - 💥 Out now: spaCy v3.5 + + 💥 Take the user survey! ) From 2953e7b7ce74b3451f099eb918eb12459976cb27 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 24 Mar 2023 16:28:51 +0100 Subject: [PATCH 13/29] Support floret for PretrainVectors (#12435) * Support floret for PretrainVectors * Format --- spacy/errors.py | 4 +-- spacy/ml/models/multi_task.py | 31 +++++++++++++++--------- spacy/tests/training/test_pretraining.py | 16 +++++------- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index c897c29ff..40cfa8d92 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -549,8 +549,8 @@ class Errors(metaclass=ErrorsWithCodes): "during training, make sure to include it in 'annotating components'") # New errors added in v3.x - E850 = ("The PretrainVectors objective currently only supports default " - "vectors, not {mode} vectors.") + E850 = ("The PretrainVectors objective currently only supports default or " + "floret vectors, not {mode} vectors.") E851 = ("The 'textcat' component labels should only have values of 0 or 1, " "but found value of '{val}'.") E852 = ("The tar file pulled from the remote attempted an unsafe path " diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 826fddd4f..7eb13b608 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -1,5 +1,5 @@ from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast -from thinc.types import Floats2d +from thinc.types import Floats2d, Ints1d from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model from thinc.api import MultiSoftmax, list2array from thinc.api import to_categorical, CosineDistance, L2Distance @@ -7,7 +7,7 @@ from thinc.loss import Loss from ...util import registry, OOV_RANK from ...errors import Errors -from ...attrs import ID +from ...attrs import ID, ORTH from ...vectors import Mode as VectorsMode import numpy @@ -24,8 +24,6 @@ def create_pretrain_vectors( 
maxout_pieces: int, hidden_size: int, loss: str ) -> Callable[["Vocab", Model], Model]: def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model: - if vocab.vectors.mode != VectorsMode.default: - raise ValueError(Errors.E850.format(mode=vocab.vectors.mode)) if vocab.vectors.shape[1] == 0: raise ValueError(Errors.E875) model = build_cloze_multi_task_model( @@ -70,14 +68,23 @@ def get_vectors_loss(ops, docs, prediction, distance): """Compute a loss based on a distance between the documents' vectors and the prediction. """ - # The simplest way to implement this would be to vstack the - # token.vector values, but that's a bit inefficient, especially on GPU. - # Instead we fetch the index into the vectors table for each of our tokens, - # and look them up all at once. This prevents data copying. - ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) - target = docs[0].vocab.vectors.data[ids] - target[ids == OOV_RANK] = 0 - d_target, loss = distance(prediction, target) + vocab = docs[0].vocab + if vocab.vectors.mode == VectorsMode.default: + # The simplest way to implement this would be to vstack the + # token.vector values, but that's a bit inefficient, especially on GPU. + # Instead we fetch the index into the vectors table for each of our + # tokens, and look them up all at once. This prevents data copying. + ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) + target = docs[0].vocab.vectors.data[ids] + target[ids == OOV_RANK] = 0 + d_target, loss = distance(prediction, target) + elif vocab.vectors.mode == VectorsMode.floret: + keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs]) + target = vocab.vectors.get_batch(keys) + target = ops.as_contig(target) + d_target, loss = distance(prediction, target) + else: + raise ValueError(Errors.E850.format(mode=vocab.vectors.mode)) return loss, d_target diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py index c0d64f1e7..d1db92de5 100644 --- a/spacy/tests/training/test_pretraining.py +++ b/spacy/tests/training/test_pretraining.py @@ -359,19 +359,15 @@ def test_pretrain_default_vectors(): nlp.vocab.vectors = Vectors(shape=(10, 10)) create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model) + # floret vectors are supported + nlp.vocab.vectors = Vectors( + data=get_current_ops().xp.zeros((10, 10)), mode="floret", hash_count=1 + ) + create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model) + # error for no vectors with pytest.raises(ValueError, match="E875"): nlp.vocab.vectors = Vectors() create_pretrain_vectors(1, 1, "cosine")( nlp.vocab, nlp.get_pipe("tok2vec").model ) - - # error for floret vectors - with pytest.raises(ValueError, match="E850"): - ops = get_current_ops() - nlp.vocab.vectors = Vectors( - data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1 - ) - create_pretrain_vectors(1, 1, "cosine")( - nlp.vocab, nlp.get_pipe("tok2vec").model - ) From 4380d750f96a4c9d29a62e5b872c597ebdb09462 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Mon, 27 Mar 2023 10:27:11 +0200 Subject: [PATCH 14/29] add explanation about overwriting behaviour (#12464) * add explanation about overwriting behaviour * Update website/docs/api/spancategorizer.mdx Co-authored-by: Adriane Boyd * Update website/docs/api/spancategorizer.mdx Co-authored-by: Adriane Boyd * Update website/docs/api/spancategorizer.mdx Co-authored-by: Adriane Boyd * format --------- Co-authored-by: Adriane Boyd --- website/docs/api/spancategorizer.mdx | 9 ++++++--- 
1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx index c7de2324b..f54a8687b 100644 --- a/website/docs/api/spancategorizer.mdx +++ b/website/docs/api/spancategorizer.mdx @@ -20,8 +20,9 @@ output class probabilities are independent for each class. However, if you need to predict at most one true class for a span, then use `spancat_singlelabel`. It uses a `Softmax` layer and treats the task as a multi-class problem. -Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc. -Individual span scores can be found in `spangroup.attrs["scores"]`. +Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc +under `doc.spans[spans_key]`, where `spans_key` is a component config setting. +Individual span scores are stored in `doc.spans[spans_key].attrs["scores"]`. ## Assigned Attributes {id="assigned-attributes"} @@ -29,7 +30,9 @@ Predictions will be saved to `Doc.spans[spans_key]` as a [`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will be saved in `SpanGroup.attrs["scores"]`. -`spans_key` defaults to `"sc"`, but can be passed as a parameter. +`spans_key` defaults to `"sc"`, but can be passed as a parameter. The `spancat` +component will overwrite any existing spans under the spans key +`doc.spans[spans_key]`. | Location | Value | | -------------------------------------- | -------------------------------------------------------- | From 0ecbeff1a646036764745dc1fd176f35f731b49b Mon Sep 17 00:00:00 2001 From: Prajakta Darade <107802412+prajakta-1527@users.noreply.github.com> Date: Mon, 27 Mar 2023 15:02:49 +0530 Subject: [PATCH 15/29] corrected example code (#12466) --- website/docs/api/coref.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/coref.mdx b/website/docs/api/coref.mdx index 8647f35d1..0b9ebb888 100644 --- a/website/docs/api/coref.mdx +++ b/website/docs/api/coref.mdx @@ -64,7 +64,7 @@ details on the architectures and their arguments and hyperparameters. 
> config={ > "model": DEFAULT_COREF_MODEL, > "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX, -> }, +> } > nlp.add_pipe("experimental_coref", config=config) > ``` From 79dcef17f758eaa84c9044272a1c5c037b60dd22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?sloev=20/=20Johannes=20Valbj=C3=B8rn?= Date: Mon, 27 Mar 2023 11:35:14 +0200 Subject: [PATCH 16/29] add spacy_onnx_sentiment_english to universe (#12422) * add spacy_onnx_sentiment_english to universe * rename to sentimental-onix * fix comma json error * fix typo * typo fix Co-authored-by: Adriane Boyd * mention need to download model before example works Co-authored-by: Adriane Boyd --------- Co-authored-by: Adriane Boyd --- website/meta/universe.json | 45 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 16e3bc361..5fd1c2287 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3215,6 +3215,51 @@ "category": ["pipeline"], "tags": ["syllables", "multilingual"] }, + { + "id": "sentimental-onix", + "title": "Sentimental Onix", + "slogan": "Use onnx for sentiment models", + "description": "spaCy pipeline component for sentiment analysis using onnx", + "github": "sloev/sentimental-onix", + "pip": "sentimental-onix", + "code_example": [ + "# Download model:", + "# python -m sentimental_onix download en", + "import spacy", + "from sentimental_onix import pipeline", + "", + "nlp = spacy.load(\"en_core_web_sm\")", + "nlp.add_pipe(\"sentencizer\")", + "nlp.add_pipe(\"sentimental_onix\", after=\"sentencizer\")", + "", + "sentences = [", + " (sent.text, sent._.sentiment)", + " for doc in nlp.pipe(", + " [", + " \"i hate pasta on tuesdays\",", + " \"i like movies on wednesdays\",", + " \"i find your argument ridiculous\",", + " \"soda with straws are my favorite\",", + " ]", + " )", + " for sent in doc.sents", + "]", + "", + "assert sentences == [", + " (\"i hate pasta on tuesdays\", \"Negative\"),", + " (\"i like movies on wednesdays\", \"Positive\"),", + " (\"i find your argument ridiculous\", \"Negative\"),", + " (\"soda with straws are my favorite\", \"Positive\"),", + "]" + ], + "thumb": "https://raw.githubusercontent.com/sloev/sentimental-onix/master/.github/onix.webp", + "author": "Johannes Valbjørn", + "author_links": { + "github": "sloev" + }, + "category": ["pipeline"], + "tags": ["sentiment", "english"] + }, { "id": "gobbli", "title": "gobbli", From 1b4a67bc5467f8a18d70e452b35ce3fdaaf2459b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Mar 2023 12:44:04 +0200 Subject: [PATCH 17/29] Restrict github workflows to explosion (#12470) --- .github/workflows/explosionbot.yml | 1 + .github/workflows/issue-manager.yml | 1 + .github/workflows/lock.yml | 1 + .github/workflows/spacy_universe_alert.yml | 1 + .github/workflows/tests.yml | 1 + .github/workflows/universe_validation.yml | 1 + 6 files changed, 6 insertions(+) diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml index 6b472cd12..910cfdc40 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -8,6 +8,7 @@ on: jobs: explosion-bot: + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - name: Dump GitHub context diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index 8f3a151ea..6c7d7d5a6 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -13,6 +13,7 @@ on: jobs: issue-manager: + if: github.repository_owner 
== 'explosion' runs-on: ubuntu-latest steps: - uses: tiangolo/issue-manager@0.4.0 diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml index 794adee85..6c3985a93 100644 --- a/.github/workflows/lock.yml +++ b/.github/workflows/lock.yml @@ -13,6 +13,7 @@ concurrency: jobs: action: + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - uses: dessant/lock-threads@v4 diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml index 837aaeb33..33851fbcc 100644 --- a/.github/workflows/spacy_universe_alert.yml +++ b/.github/workflows/spacy_universe_alert.yml @@ -7,6 +7,7 @@ on: jobs: build: + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 41ea6ce50..f226057c9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,6 +21,7 @@ on: jobs: validate: name: Validate + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - name: Check out repo diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml index f9e317aaa..a1e3253a9 100644 --- a/.github/workflows/universe_validation.yml +++ b/.github/workflows/universe_validation.yml @@ -16,6 +16,7 @@ on: jobs: validate: name: Validate + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - name: Check out repo From 888332dfb23eda3ee7dee2ada745236ee54b41f6 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Mon, 27 Mar 2023 13:15:14 +0200 Subject: [PATCH 18/29] Add info to stringstore and vocab (#12471) --- website/docs/api/stringstore.mdx | 7 +++++++ website/docs/api/vocab.mdx | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx index 47d3715c1..6a3e9d664 100644 --- a/website/docs/api/stringstore.mdx +++ b/website/docs/api/stringstore.mdx @@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of integer IDs. This ensures that strings always map to the same ID, even from different `StringStores`. + + +Note that a `StringStore` instance is not static. It increases in size as texts +with new tokens are processed. + + + ## StringStore.\_\_init\_\_ {id="init",tag="method"} Create the `StringStore`. diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index 131e4ce0a..fe774d1a8 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared between `Doc` objects. + + +Note that a `Vocab` instance is not static. It increases in size as texts with +new tokens are processed. + + + ## Vocab.\_\_init\_\_ {id="init",tag="method"} Create the vocabulary. 
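The behaviour documented by these new infoboxes is easy to observe directly. Below is a minimal sketch (illustrative only, not part of the patch; the sample text is arbitrary, and nothing beyond a blank English pipeline is assumed, since plain tokenization already interns strings):

```python
import spacy

nlp = spacy.blank("en")
size_before = len(nlp.vocab.strings)

# Tokenizing text that contains previously unseen tokens adds entries to
# the shared StringStore, so the store (and the Vocab built on top of it)
# grows as a side effect of processing.
nlp("quantized floret vectors for xylophones")

assert len(nlp.vocab.strings) > size_before
```

Because entries are keyed by stable hashes (as the page above notes, strings always map to the same ID), this growth is append-only and does not invalidate existing annotations.
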
From 26da226a39998d385e58334ca6b514fd11c30ed9 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Wed, 29 Mar 2023 08:38:11 +0200 Subject: [PATCH 19/29] Fix spancat-singlelabel score (#12469) * debug argmax sort and add span scores * add missing tests for spanscores --- spacy/pipeline/spancat.py | 2 ++ spacy/tests/pipeline/test_spancat.py | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 983e1fba9..ff68a3703 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -726,6 +726,7 @@ class SpanCategorizer(TrainablePipe): if not allow_overlap: # Get the probabilities sort_idx = (argmax_scores.squeeze() * -1).argsort() + argmax_scores = argmax_scores[sort_idx] predicted = predicted[sort_idx] indices = indices[sort_idx] keeps = keeps[sort_idx] @@ -748,4 +749,5 @@ class SpanCategorizer(TrainablePipe): attrs_scores.append(argmax_scores[i]) spans.append(Span(doc, start, end, label=self.labels[label])) + spans.attrs["scores"] = numpy.array(attrs_scores) return spans diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index cf6304042..b06505a6d 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -190,17 +190,19 @@ def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results): spangroup = spancat._make_span_group_singlelabel( doc, indices, scores, allow_overlap ) - assert len(spangroup) == nr_results if threshold > 0.4: if allow_overlap: assert spangroup[0].text == "London" assert spangroup[0].label_ == "City" + assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5) assert spangroup[1].text == "Greater London" assert spangroup[1].label_ == "GreatCity" - + assert spangroup.attrs["scores"][1] == 0.9 + assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5) else: assert spangroup[0].text == "Greater London" assert spangroup[0].label_ == "GreatCity" + assert spangroup.attrs["scores"][0] == 0.9 else: if allow_overlap: assert spangroup[0].text == "Greater" @@ -256,22 +258,32 @@ def test_make_spangroup_negative_label(): assert len(spangroup_single) == 2 assert spangroup_single[0].text == "Greater" assert spangroup_single[0].label_ == "City" + assert_almost_equal(0.4, spangroup_single.attrs["scores"][0], 5) assert spangroup_single[1].text == "Greater London" assert spangroup_single[1].label_ == "GreatCity" + assert spangroup_single.attrs["scores"][1] == 0.9 + assert_almost_equal(0.9, spangroup_single.attrs["scores"][1], 5) assert len(spangroup_multi) == 6 assert spangroup_multi[0].text == "Greater" assert spangroup_multi[0].label_ == "City" + assert_almost_equal(0.4, spangroup_multi.attrs["scores"][0], 5) assert spangroup_multi[1].text == "Greater" assert spangroup_multi[1].label_ == "Person" + assert_almost_equal(0.3, spangroup_multi.attrs["scores"][1], 5) assert spangroup_multi[2].text == "London" assert spangroup_multi[2].label_ == "City" + assert_almost_equal(0.6, spangroup_multi.attrs["scores"][2], 5) assert spangroup_multi[3].text == "London" assert spangroup_multi[3].label_ == "GreatCity" + assert_almost_equal(0.4, spangroup_multi.attrs["scores"][3], 5) assert spangroup_multi[4].text == "Greater London" assert spangroup_multi[4].label_ == "Thing" + assert spangroup_multi[4].text == "Greater London" + assert_almost_equal(0.8, spangroup_multi.attrs["scores"][4], 5) assert spangroup_multi[5].text == "Greater London" assert spangroup_multi[5].label_ == "GreatCity" + assert_almost_equal(0.9, 
spangroup_multi.attrs["scores"][5], 5) def test_ngram_suggester(en_tokenizer): From 8d064872ff25c23ed6bfe0a7758456ce31a2ddf7 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 29 Mar 2023 18:54:47 +0200 Subject: [PATCH 20/29] Fix Span.sents for edge case of Span being the only Span in the last sentence of a Doc. (#12484) --- spacy/tests/doc/test_span.py | 15 +++++++++++++++ spacy/tokens/span.pyx | 4 ++++ 2 files changed, 19 insertions(+) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index adef5922f..a5c512dc0 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -716,3 +716,18 @@ def test_for_partial_ent_sents(): # equal to the sentences referenced in ent.sents. for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents): assert doc_sent == ent_sent + + +def test_for_no_ent_sents(): + """Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full + sentence. + """ + doc = Doc( + English().vocab, + words=["This", "is", "a", "test.", "ENTITY"], + sent_starts=[1, 0, 0, 0, 1], + ) + doc.set_ents([Span(doc, 4, 5, "WORK")]) + sents = list(doc.ents[0].sents) + assert len(sents) == 1 + assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY" diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 7750b16ed..29b8ce703 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -463,6 +463,10 @@ cdef class Span: elif i == self.doc.length - 1: yield Span(self.doc, start, self.doc.length) + # Ensure that trailing parts of the Span instance are included in last element of .sents. + if start == self.doc.length - 1: + yield Span(self.doc, start, self.doc.length) + @property def ents(self): """The named entities that fall completely within the span. Returns From b228875600d89b2b08eedaa54b717028e1f0ac37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ye=20Lei=20=28=E5=8F=B6=E7=A3=8A=29?= Date: Fri, 31 Mar 2023 15:44:01 +0800 Subject: [PATCH 21/29] Allow passing a Span to displacy.parse_deps (#12477) * Allow passing a Span to displacy.parse_deps * Update docstring Co-authored-by: Adriane Boyd * Update API docs --------- Co-authored-by: Adriane Boyd --- spacy/displacy/__init__.py | 8 ++++++-- spacy/tests/test_displacy.py | 14 ++++++++++++++ website/docs/api/top-level.mdx | 2 +- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index ea6bba2c9..f42dad0c9 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -125,13 +125,17 @@ def app(environ, start_response): return [res] -def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: +def parse_deps( + orig_doc: Union[Doc, Span], options: Dict[str, Any] = {} +) -> Dict[str, Any]: """Generate dependency parse in {'words': [], 'arcs': []} format. - orig_doc (Doc): Document to parse. + orig_doc (Union[Doc, Span]): Document to parse. options (Dict[str, Any]): Dependency parse specific visualisation options. RETURNS (dict): Generated dependency parse keyed by words and arcs. 
""" + if isinstance(orig_doc, Span): + orig_doc = orig_doc.as_doc() doc = Doc(orig_doc.vocab).from_bytes( orig_doc.to_bytes(exclude=["user_data", "user_hooks"]) ) diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index f298b38e0..837a92e02 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -275,6 +275,20 @@ def test_displacy_parse_deps(en_vocab): {"start": 2, "end": 3, "label": "det", "dir": "left"}, {"start": 1, "end": 3, "label": "attr", "dir": "right"}, ] + # Test that displacy.parse_deps converts Span to Doc + deps = displacy.parse_deps(doc[:]) + assert isinstance(deps, dict) + assert deps["words"] == [ + {"lemma": None, "text": words[0], "tag": pos[0]}, + {"lemma": None, "text": words[1], "tag": pos[1]}, + {"lemma": None, "text": words[2], "tag": pos[2]}, + {"lemma": None, "text": words[3], "tag": pos[3]}, + ] + assert deps["arcs"] == [ + {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, + {"start": 2, "end": 3, "label": "det", "dir": "left"}, + {"start": 1, "end": 3, "label": "attr", "dir": "right"}, + ] def test_displacy_invalid_arcs(): diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index d0851a59f..9193b2a7b 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -291,7 +291,7 @@ the `manual=True` argument in `displacy.render`. | Name | Description | | ----------- | ------------------------------------------------------------------- | -| `orig_doc` | Doc to parse dependencies. ~~Doc~~ | +| `orig_doc` | Doc or span to parse dependencies. ~~Union[Doc, Span]~~ | | `options` | Dependency parse specific visualisation options. ~~Dict[str, Any]~~ | | **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~ | From 57ee1212de0beb014183438b7a4746304d249df2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 31 Mar 2023 13:43:51 +0200 Subject: [PATCH 22/29] Fix pickle for ngram suggester (#12486) --- spacy/pipeline/spancat.py | 58 +++++++++++++++------------- spacy/tests/pipeline/test_spancat.py | 20 +++++++++- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index ff68a3703..5a087e42a 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,5 +1,6 @@ from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union from dataclasses import dataclass +from functools import partial from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops from thinc.api import Optimizer from thinc.types import Ragged, Ints2d, Floats2d @@ -82,39 +83,42 @@ class Suggester(Protocol): ... 
From 57ee1212de0beb014183438b7a4746304d249df2 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 31 Mar 2023 13:43:51 +0200
Subject: [PATCH 22/29] Fix pickle for ngram suggester (#12486)

---
 spacy/pipeline/spancat.py            | 58 +++++++++++++++-------------
 spacy/tests/pipeline/test_spancat.py | 20 +++++++++-
 2 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index ff68a3703..5a087e42a 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,5 +1,6 @@
 from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
 from dataclasses import dataclass
+from functools import partial
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d
@@ -82,39 +83,42 @@ class Suggester(Protocol):
         ...
 
 
+def ngram_suggester(
+    docs: Iterable[Doc], sizes: List[int], *, ops: Optional[Ops] = None
+) -> Ragged:
+    if ops is None:
+        ops = get_current_ops()
+    spans = []
+    lengths = []
+    for doc in docs:
+        starts = ops.xp.arange(len(doc), dtype="i")
+        starts = starts.reshape((-1, 1))
+        length = 0
+        for size in sizes:
+            if size <= len(doc):
+                starts_size = starts[: len(doc) - (size - 1)]
+                spans.append(ops.xp.hstack((starts_size, starts_size + size)))
+                length += spans[-1].shape[0]
+            if spans:
+                assert spans[-1].ndim == 2, spans[-1].shape
+        lengths.append(length)
+    lengths_array = ops.asarray1i(lengths)
+    if len(spans) > 0:
+        output = Ragged(ops.xp.vstack(spans), lengths_array)
+    else:
+        output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+
+    assert output.dataXd.ndim == 2
+    return output
+
+
 @registry.misc("spacy.ngram_suggester.v1")
 def build_ngram_suggester(sizes: List[int]) -> Suggester:
     """Suggest all spans of the given lengths. Spans are returned as a ragged
     array of integers. The array has two columns, indicating the start and end
     position."""
 
-    def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
-        if ops is None:
-            ops = get_current_ops()
-        spans = []
-        lengths = []
-        for doc in docs:
-            starts = ops.xp.arange(len(doc), dtype="i")
-            starts = starts.reshape((-1, 1))
-            length = 0
-            for size in sizes:
-                if size <= len(doc):
-                    starts_size = starts[: len(doc) - (size - 1)]
-                    spans.append(ops.xp.hstack((starts_size, starts_size + size)))
-                    length += spans[-1].shape[0]
-                if spans:
-                    assert spans[-1].ndim == 2, spans[-1].shape
-            lengths.append(length)
-        lengths_array = ops.asarray1i(lengths)
-        if len(spans) > 0:
-            output = Ragged(ops.xp.vstack(spans), lengths_array)
-        else:
-            output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
-
-        assert output.dataXd.ndim == 2
-        return output
-
-    return ngram_suggester
+    return partial(ngram_suggester, sizes=sizes)
 
 
 @registry.misc("spacy.ngram_range_suggester.v1")
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index b06505a6d..199ef2b2a 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -1,7 +1,7 @@
 import pytest
 import numpy
 from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops, Ragged
+from thinc.api import get_current_ops, NumpyOps, Ragged
 
 from spacy import util
 from spacy.lang.en import English
@@ -577,3 +577,21 @@ def test_set_candidates(name):
     assert len(docs[0].spans["candidates"]) == 9
     assert docs[0].spans["candidates"][0].text == "Just"
     assert docs[0].spans["candidates"][4].text == "Just a"
+
+
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+@pytest.mark.parametrize("n_process", [1, 2])
+def test_spancat_multiprocessing(name, n_process):
+    if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
+        nlp = Language()
+        spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
+        train_examples = make_examples(nlp)
+        nlp.initialize(get_examples=lambda: train_examples)
+        texts = [
+            "Just a sentence.",
+            "I like London and Berlin",
+            "I like Berlin",
+            "I eat ham.",
+        ]
+        docs = list(nlp.pipe(texts, n_process=n_process))
+        assert len(docs) == len(texts)
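The reason the refactor fixes pickling: `pickle` serializes functions by qualified name, so a suggester defined inside a factory closure can never be located again at load time, while a module-level function wrapped in `functools.partial` can. A standalone sketch of the difference (not spaCy's actual suggester):

```python
import pickle
from functools import partial

def ngram_lengths(tokens, sizes):
    # Stand-in for the real suggester logic.
    return [size for size in sizes if size <= len(tokens)]

def build_closure(sizes):
    def suggester(tokens):
        return ngram_lengths(tokens, sizes)
    return suggester

# Module-level function + partial: picklable, so it survives the
# inter-process transfer that nlp.pipe(n_process=2) requires.
pickle.dumps(partial(ngram_lengths, sizes=[1, 2]))

# Closure returned by a factory: not picklable.
try:
    pickle.dumps(build_closure([1, 2]))
except (pickle.PicklingError, AttributeError) as err:
    print("closure is not picklable:", err)
```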
From a5406a6c457c7809221e365b7a14020e957fe539 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 31 Mar 2023 13:48:15 +0200
Subject: [PATCH 23/29] Allow cupy 12.0 for extras (#12490)

---
 setup.cfg | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 27499805b..eea557337 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -78,41 +78,41 @@ transformers =
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<12.0.0
+    cupy>=5.0.0b4,<13.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<12.0.0
+    cupy-cuda80>=5.0.0b4,<13.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<12.0.0
+    cupy-cuda90>=5.0.0b4,<13.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4,<12.0.0
+    cupy-cuda91>=5.0.0b4,<13.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4,<12.0.0
+    cupy-cuda92>=5.0.0b4,<13.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4,<12.0.0
+    cupy-cuda100>=5.0.0b4,<13.0.0
 cuda101 =
-    cupy-cuda101>=5.0.0b4,<12.0.0
+    cupy-cuda101>=5.0.0b4,<13.0.0
 cuda102 =
-    cupy-cuda102>=5.0.0b4,<12.0.0
+    cupy-cuda102>=5.0.0b4,<13.0.0
 cuda110 =
-    cupy-cuda110>=5.0.0b4,<12.0.0
+    cupy-cuda110>=5.0.0b4,<13.0.0
 cuda111 =
-    cupy-cuda111>=5.0.0b4,<12.0.0
+    cupy-cuda111>=5.0.0b4,<13.0.0
 cuda112 =
-    cupy-cuda112>=5.0.0b4,<12.0.0
+    cupy-cuda112>=5.0.0b4,<13.0.0
 cuda113 =
-    cupy-cuda113>=5.0.0b4,<12.0.0
+    cupy-cuda113>=5.0.0b4,<13.0.0
 cuda114 =
-    cupy-cuda114>=5.0.0b4,<12.0.0
+    cupy-cuda114>=5.0.0b4,<13.0.0
 cuda115 =
-    cupy-cuda115>=5.0.0b4,<12.0.0
+    cupy-cuda115>=5.0.0b4,<13.0.0
 cuda116 =
-    cupy-cuda116>=5.0.0b4,<12.0.0
+    cupy-cuda116>=5.0.0b4,<13.0.0
 cuda117 =
-    cupy-cuda117>=5.0.0b4,<12.0.0
+    cupy-cuda117>=5.0.0b4,<13.0.0
 cuda11x =
-    cupy-cuda11x>=11.0.0,<12.0.0
+    cupy-cuda11x>=11.0.0,<13.0.0
 cuda-autodetect =
-    cupy-wheel>=11.0.0,<12.0.0
+    cupy-wheel>=11.0.0,<13.0.0
 apple =
     thinc-apple-ops>=0.1.0.dev0,<1.0.0
 # Language tokenizers with external dependencies

From 0ec4dc5c29578f9857004b0c747e10529616b6cf Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 3 Apr 2023 11:38:56 +0200
Subject: [PATCH 24/29] Remove redundant strings.add for Doc.char_span (#12429)

---
 spacy/tokens/doc.pyx | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 7dfe0ca9f..3bc404dd0 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -544,10 +544,6 @@ cdef class Doc:
 
         DOCS: https://spacy.io/api/doc#char_span
         """
-        if not isinstance(label, int):
-            label = self.vocab.strings.add(label)
-        if not isinstance(kb_id, int):
-            kb_id = self.vocab.strings.add(kb_id)
         alignment_modes = ("strict", "contract", "expand")
         if alignment_mode not in alignment_modes:
             raise ValueError(
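The deleted lines above were redundant because `Doc.char_span` still accepts plain string labels: interning into the string store happens downstream when the `Span` itself is created. A quick sketch, assuming only `spacy` is installed:

```python
from spacy.lang.en import English

nlp = English()
doc = nlp("New York is a city")

# No manual vocab.strings.add needed; the string label is interned
# when the Span is constructed.
span = doc.char_span(0, 8, label="GPE")
print(span.text, span.label_)  # New York GPE
```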
len(new_doc.spans["content"]) == 1 + assert new_doc.spans["content"][0].label_ == "test_serialize_doc_span_groups_label" + assert new_doc.spans["content"][0].id_ == "test_serialize_doc_span_groups_id" + assert new_doc.spans["content"][0].kb_id_ == "test_serialize_doc_span_groups_kb_id" diff --git a/spacy/tests/serialize/test_serialize_docbin.py b/spacy/tests/serialize/test_serialize_docbin.py index 9f8e5e06b..6f7b1001c 100644 --- a/spacy/tests/serialize/test_serialize_docbin.py +++ b/spacy/tests/serialize/test_serialize_docbin.py @@ -49,7 +49,11 @@ def test_serialize_doc_bin(): nlp = English() for doc in nlp.pipe(texts): doc.cats = cats - doc.spans["start"] = [doc[0:2]] + span = doc[0:2] + span.label_ = "UNUSUAL_SPAN_LABEL" + span.id_ = "UNUSUAL_SPAN_ID" + span.kb_id_ = "UNUSUAL_SPAN_KB_ID" + doc.spans["start"] = [span] doc[0].norm_ = "UNUSUAL_TOKEN_NORM" doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID" doc_bin.add(doc) @@ -63,6 +67,9 @@ def test_serialize_doc_bin(): assert doc.text == texts[i] assert doc.cats == cats assert len(doc.spans) == 1 + assert doc.spans["start"][0].label_ == "UNUSUAL_SPAN_LABEL" + assert doc.spans["start"][0].id_ == "UNUSUAL_SPAN_ID" + assert doc.spans["start"][0].kb_id_ == "UNUSUAL_SPAN_KB_ID" assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM" assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID" diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index c4e8f26f4..73c857d1f 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -124,6 +124,10 @@ class DocBin: for key, group in doc.spans.items(): for span in group: self.strings.add(span.label_) + if span.kb_id in span.doc.vocab.strings: + self.strings.add(span.kb_id_) + if span.id in span.doc.vocab.strings: + self.strings.add(span.id_) def get_docs(self, vocab: Vocab) -> Iterator[Doc]: """Recover Doc objects from the annotations, using the given vocab. diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3bc404dd0..a54b4ad3c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1346,6 +1346,10 @@ cdef class Doc: for group in self.spans.values(): for span in group: strings.add(span.label_) + if span.kb_id in span.doc.vocab.strings: + strings.add(span.kb_id_) + if span.id in span.doc.vocab.strings: + strings.add(span.id_) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. As a best guess, we *know* that within # keys, we must have tuples. 
From 2fbd080a03ec7af20026a8938e72ea1a512b7285 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 3 Apr 2023 15:24:03 +0200
Subject: [PATCH 26/29] Add model-last saving mechanism to pretraining (#12459)

* Adjust pretrain command

* change naming and add finally block

* Add unit test

* Add unit test assertions

* Update spacy/training/pretrain.py

Co-authored-by: Adriane Boyd

* change finally block

* Add to docs

* Update website/docs/usage/embeddings-transformers.mdx

* Add flag to skip saving model-last

---------

Co-authored-by: Adriane Boyd
---
 spacy/cli/pretrain.py                      |  2 +
 spacy/tests/training/test_pretraining.py   | 10 ++++-
 spacy/training/pretrain.py                 | 41 +++++++++++--------
 website/docs/api/cli.mdx                   | 23 ++++++-----
 .../docs/usage/embeddings-transformers.mdx |  9 ++--
 5 files changed, 53 insertions(+), 32 deletions(-)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 381d589cf..45042e605 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -23,6 +23,7 @@ def pretrain_cli(
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
     # fmt: on
 ):
     """
@@ -74,6 +75,7 @@ def pretrain_cli(
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
         silent=False,
+        skip_last=skip_last,
     )
     msg.good("Successfully finished pretrain")
 
diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py
index d1db92de5..6cfdeed20 100644
--- a/spacy/tests/training/test_pretraining.py
+++ b/spacy/tests/training/test_pretraining.py
@@ -165,7 +165,8 @@ def test_pretraining_default():
 
 @pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
-def test_pretraining_tok2vec_characters(objective):
+@pytest.mark.parametrize("skip_last", (True, False))
+def test_pretraining_tok2vec_characters(objective, skip_last):
     """Test that pretraining works with the character objective"""
     config = Config().from_str(pretrain_string_listener)
     config["pretraining"]["objective"] = objective
@@ -178,10 +179,14 @@ def test_pretraining_tok2vec_characters(objective):
     filled["paths"]["raw_text"] = file_path
     filled = filled.interpolate()
     assert filled["pretraining"]["component"] == "tok2vec"
-    pretrain(filled, tmp_dir)
+    pretrain(filled, tmp_dir, skip_last=skip_last)
     assert Path(tmp_dir / "model0.bin").exists()
     assert Path(tmp_dir / "model4.bin").exists()
     assert not Path(tmp_dir / "model5.bin").exists()
+    if skip_last:
+        assert not Path(tmp_dir / "model-last.bin").exists()
+    else:
+        assert Path(tmp_dir / "model-last.bin").exists()
 
 
 @pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
@@ -237,6 +242,7 @@ def test_pretraining_tagger_tok2vec(config):
     pretrain(filled, tmp_dir)
     assert Path(tmp_dir / "model0.bin").exists()
     assert Path(tmp_dir / "model4.bin").exists()
+    assert Path(tmp_dir / "model-last.bin").exists()
     assert not Path(tmp_dir / "model5.bin").exists()
 
diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
index 52af84aaf..ebbc5d837 100644
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -24,6 +24,7 @@ def pretrain(
     epoch_resume: Optional[int] = None,
     use_gpu: int = -1,
     silent: bool = True,
+    skip_last: bool = False,
 ):
     msg = Printer(no_print=silent)
     if config["training"]["seed"] is not None:
@@ -60,10 +61,14 @@ def pretrain(
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
 
-    def _save_model(epoch, is_temp=False):
+    def _save_model(epoch, is_temp=False, is_last=False):
         is_temp_str = ".temp" if is_temp else ""
         with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
+            if is_last:
+                save_path = output_dir / f"model-last.bin"
+            else:
+                save_path = output_dir / f"model{epoch}{is_temp_str}.bin"
+            with (save_path).open("wb") as file_:
                 file_.write(model.get_ref("tok2vec").to_bytes())
     log = {
         "nr_word": tracker.nr_word,
@@ -76,22 +81,26 @@ def pretrain(
     # TODO: I think we probably want this to look more like the
     # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
+    try:
+        for epoch in range(epoch_resume, P["max_epochs"]):
+            for batch_id, batch in enumerate(batcher(corpus(nlp))):
+                docs = ensure_docs(batch)
+                loss = make_update(model, docs, optimizer, objective)
+                progress = tracker.update(epoch, loss, docs)
+                if progress:
+                    msg.row(progress, **row_settings)
+                if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
+                    _save_model(epoch, is_temp=True)
 
-        if P["n_save_epoch"]:
-            if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
+            if P["n_save_epoch"]:
+                if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
+                    _save_model(epoch)
+            else:
                 _save_model(epoch)
-            else:
-                _save_model(epoch)
-        tracker.epoch_loss = 0.0
+            tracker.epoch_loss = 0.0
+    finally:
+        if not skip_last:
+            _save_model(P["max_epochs"], is_last=True)
 
 
 def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 2bb0199fc..323ea2a92 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1122,17 +1122,18 @@ auto-generated by setting `--pretraining` on
 $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
 ```
 
-| Name                    | Description |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `config_path`           | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
-| `output_dir`            | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
-| `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--resume-path`, `-r`   | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
-| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
-| `--gpu-id`, `-g`        | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
-| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~ |
-| overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
-| **CREATES**             | The pretrained weights that can be used to initialize `spacy train`. |
+| Name                                               | Description |
+| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path`                                      | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
+| `output_dir`                                       | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
+| `--code`, `-c`                                     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--resume-path`, `-r`                              | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
+| `--epoch-resume`, `-er`                            | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
+| `--gpu-id`, `-g`                                   | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
+| `--skip-last`, `-L` 3.5.2                          | Skip saving `model-last.bin`. Defaults to `False`. ~~bool (flag)~~ |
+| `--help`, `-h`                                     | Show help message and available arguments. ~~bool (flag)~~ |
+| overrides                                          | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
+| **CREATES**                                        | The pretrained weights that can be used to initialize `spacy train`. |
 
 ## evaluate {id="evaluate",version="2",tag="command"}
 
diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index cf80822fb..5f1e5b817 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -746,13 +746,16 @@ this by setting `initialize.init_tok2vec` to the filename of the `.bin` file
 that you want to use from pretraining.
 
 A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
-an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To
-make use of the final output, you could fill in this value in your config file:
+an example, produces `pretrain/model0.bin` through `pretrain/model4.bin` plus a
+copy of the last iteration as `pretrain/model-last.bin`. Additionally, you can
+configure `n_save_epoch` to tell pretraining in which epoch interval it should
+save the current training progress. To use the final output to initialize your
+`tok2vec` layer, you could fill in this value in your config file:
 
 ```ini {title="config.cfg"}
 [paths]
-init_tok2vec = "pretrain/model4.bin"
+init_tok2vec = "pretrain/model-last.bin"
 
 [initialize]
 init_tok2vec = ${paths.init_tok2vec}
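The same behaviour is reachable from Python as well as the CLI. A sketch, assuming a filled and interpolated pretraining config at a hypothetical `config.cfg` path (in practice the CLI fills and validates the config for you):

```python
from pathlib import Path
from spacy import util
from spacy.training.pretrain import pretrain

# Hypothetical paths; the config must contain a [pretraining] section.
config = util.load_config("config.cfg", interpolate=True)
output_dir = Path("pretrain_output")

pretrain(config, output_dir, skip_last=False)
# output_dir now holds model0.bin ... modelN.bin plus model-last.bin;
# passing skip_last=True (the new --skip-last/-L flag) omits model-last.bin.
```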
From 314a7cea7392be23a5123fddb2cfd6a3703d5dc9 Mon Sep 17 00:00:00 2001
From: Will Frey
Date: Tue, 4 Apr 2023 14:53:07 -0400
Subject: [PATCH 27/29] Fix invalid ConsoleLogger.v3 example config (#12498)

Replace `progress_bar = "all_steps"` with `progress_bar = "eval"`, which is
consistent with the default behavior for `spacy.ConsoleLogger.v1` and
`spacy.ConsoleLogger.v2`.
---
 website/docs/api/top-level.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 9193b2a7b..975c16aaa 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -577,7 +577,7 @@ start decreasing across epochs.
 > ```ini
 > [training.logger]
 > @loggers = "spacy.ConsoleLogger.v3"
-> progress_bar = "all_steps"
+> progress_bar = "eval"
 > console_output = true
 > output_file = "training_log.jsonl"
 > ```

From 9fbb8ee912585fb2de64360a8b955a8d17e5b28a Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Thu, 6 Apr 2023 11:45:19 +0200
Subject: [PATCH 28/29] Add more information to custom code docs (#12491)

* Add info to sections

* Update website/docs/usage/training.mdx

---------

Co-authored-by: Adriane Boyd
---
 website/docs/api/top-level.mdx  | 5 ++++-
 website/docs/usage/training.mdx | 9 +++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 975c16aaa..6de1acdf0 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -25,7 +25,10 @@ and call the package's own `load()` method. If a pipeline is loaded from a
 path, spaCy will assume it's a data directory, load its
 [`config.cfg`](/api/data-formats#config) and use the language and pipeline
 information to construct the `Language` class. The data will be loaded in via
-[`Language.from_disk`](/api/language#from_disk).
+[`Language.from_disk`](/api/language#from_disk). Loading a pipeline from a
+package will also import any custom code, if present, whereas loading from a
+directory does not. For these cases, you need to manually import your custom
+code.
 
diff --git a/website/docs/usage/training.mdx b/website/docs/usage/training.mdx
index 6cda975cb..6caf2e94b 100644
--- a/website/docs/usage/training.mdx
+++ b/website/docs/usage/training.mdx
@@ -758,6 +758,15 @@ any custom architectures, functions or
 your pipeline and registered when it's loaded. See the documentation on
 [saving and loading pipelines](/usage/saving-loading#models-custom) for
 details.
+
+
+Note that the unpackaged models produced by `spacy train` are data directories
+that **do not include custom code**. You need to import the code in your script
+before loading in unpackaged models. For more details, see
+[`spacy.load`](/api/top-level#spacy.load).
+
+
+
 #### Example: Modifying the nlp object {id="custom-code-nlp-callbacks"}
 
 For many use cases, you don't necessarily want to implement the whole
 `Language`
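What "manually import your custom code" looks like in practice: any registered functions must exist in the current process before the unpackaged pipeline directory is loaded. A sketch with hypothetical component and path names:

```python
import spacy
from spacy.language import Language

# In a real project this registration would live in e.g. my_components.py,
# imported at the top of the training/inference script.
@Language.component("my_debug_component")
def my_debug_component(doc):
    print("processing", len(doc), "tokens")
    return doc

# Loading from a bare data directory does not import any code, so the
# component above must already be registered here or loading fails.
nlp = spacy.load("./output/model-best")  # hypothetical path
```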
From f66d55fe5bba268ec3b4a747d0ea00c621d6d65a Mon Sep 17 00:00:00 2001
From: Madeesh Kannan
Date: Thu, 6 Apr 2023 11:45:58 +0200
Subject: [PATCH 29/29] `Docs`: Fix rule-based matching example that expands
 named entities (#12495)

---
 website/docs/usage/rule-based-matching.mdx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 55c043015..7e88bdc1f 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -1682,6 +1682,8 @@ def expand_person_entities(doc):
             if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
                 new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                 new_ents.append(new_ent)
+            else:
+                new_ents.append(ent)
         else:
             new_ents.append(ent)
     doc.ents = new_ents
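The two added lines matter because, without the inner `else`, any `PERSON` entity whose preceding token was not a title was dropped from `new_ents` entirely rather than kept unchanged. The branch logic in isolation, as a condensed, self-contained sketch:

```python
# Hypothetical inputs standing in for entities and their preceding tokens.
ents = ["Alex Smith", "Robin Jones"]
prev_tokens = ["Dr.", "and"]  # "and" is not a title

new_ents = []
for ent, prev_token in zip(ents, prev_tokens):
    if prev_token in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
        new_ents.append(prev_token + " " + ent)  # expanded entity
    else:
        new_ents.append(ent)  # the fix: keep the entity as-is

print(new_ents)  # ['Dr. Alex Smith', 'Robin Jones']; nothing is dropped
```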