From 18a2a88a951b2324af94e08b59e8ecbe924e8501 Mon Sep 17 00:00:00 2001 From: andyjessen <62343929+andyjessen@users.noreply.github.com> Date: Fri, 7 Apr 2023 07:31:04 -0600 Subject: [PATCH 01/21] Add category to spaCy project (#12506) ScispaCy fits within biomedical domain. Consider adding this category. --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 5fd1c2287..1d2881f9c 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1555,7 +1555,7 @@ "twitter": "allenai_org", "website": "http://allenai.org" }, - "category": ["scientific", "models", "research"] + "category": ["scientific", "models", "research", "biomedical"] }, { "id": "textacy", From 6be67db59fd1f0384bed679035d8bc6a7028537b Mon Sep 17 00:00:00 2001 From: TAN Long <71320000+tanloong@users.noreply.github.com> Date: Mon, 17 Apr 2023 19:14:01 +0800 Subject: [PATCH 02/21] docs(REL_OP): modify docs for REL_OPs to match Semgrex's update on CoreNLP v4.5.2 (#12531) Co-authored-by: Tan Long --- website/docs/api/dependencymatcher.mdx | 44 +++++++++++----------- website/docs/usage/rule-based-matching.mdx | 44 +++++++++++----------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx index 14e0916d1..d0971da55 100644 --- a/website/docs/api/dependencymatcher.mdx +++ b/website/docs/api/dependencymatcher.mdx @@ -68,28 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | -| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | +| Symbol | Description | +| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. | ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"} diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 7e88bdc1f..39be5f47b 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -1096,28 +1096,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | -| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | +| Symbol | Description | +| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. | ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} From b0e5aed5ed75a619b421db294979aa08e0d2b046 Mon Sep 17 00:00:00 2001 From: TAN Long <71320000+tanloong@users.noreply.github.com> Date: Mon, 17 Apr 2023 19:16:34 +0800 Subject: [PATCH 03/21] perf(REL_OP): Replace some token.children with token.rights or token.lefts (#12528) Co-authored-by: Tan Long --- spacy/matcher/dependencymatcher.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index adf96702b..48fb3eb2a 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -432,22 +432,22 @@ cdef class DependencyMatcher: return [doc[child.i] for child in doc[node].head.children if child.i < node] def _imm_right_child(self, doc, node): - for child in doc[node].children: + for child in doc[node].rights: if child.i == node + 1: return [doc[child.i]] return [] def _imm_left_child(self, doc, node): - for child in doc[node].children: + for child in doc[node].lefts: if child.i == node - 1: return [doc[child.i]] return [] def _right_child(self, doc, node): - return [doc[child.i] for child in doc[node].children if child.i > node] + return [child for child in doc[node].rights] def _left_child(self, doc, node): - return [doc[child.i] for child in doc[node].children if child.i < node] + return [child for child in doc[node].lefts] def _imm_right_parent(self, doc, node): if doc[node].head.i == node + 1: From 7bf1db87adb810802076c73ebf3cfe40162e5006 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 19 Apr 2023 10:59:33 +0200 Subject: [PATCH 04/21] fix typo (#12543) --- spacy/cli/debug_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index f20673f25..729d623b9 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -336,7 +336,7 @@ def debug_data( show=verbose, ) else: - msg.good("Examples without ocurrences available for all labels") + msg.good("Examples without occurrences available for all labels") if "ner" in factory_names: # Get all unique NER labels present in the data From 357fdd48710c72c4adc282e9adda160568f5b889 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Apr 2023 11:30:34 +0200 Subject: [PATCH 05/21] Load exceptions last in Tokenizer.from_bytes (#12553) In `Tokenizer.from_bytes`, the exceptions should be loaded last so that they are only processed once as part of loading the model. The exceptions are tokenized as phrase matcher patterns in the background and the internal tokenization needs to be synced with all the remaining tokenizer settings. If the exceptions are not loaded last, there are speed regressions for `Tokenizer.from_bytes/disk` vs. `Tokenizer.add_special_case` as the caches are reloaded more than necessary during deserialization. --- spacy/tokenizer.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 0e75b5f7a..a4a68ae8e 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -834,10 +834,12 @@ cdef class Tokenizer: self.token_match = re.compile(data["token_match"]).match if "url_match" in data and isinstance(data["url_match"], str): self.url_match = re.compile(data["url_match"]).match - if "rules" in data and isinstance(data["rules"], dict): - self.rules = data["rules"] if "faster_heuristics" in data: self.faster_heuristics = data["faster_heuristics"] + # always load rules last so that all other settings are set before the + # internal tokenization for the phrase matcher + if "rules" in data and isinstance(data["rules"], dict): + self.rules = data["rules"] return self From e05b2ccc7cca4e28d6031fa9b19bc3bb3c8c258f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Apr 2023 14:06:32 +0200 Subject: [PATCH 06/21] Add default option to MorphAnalysis.get (#12545) * Add default to MorphAnalysis.get Similar to `dict`, allow a `default` option for `MorphAnalysis.get` for the user to provide a default return value if the field is not found. The default return value remains `[]`, which is not the same as `dict.get`, but is already established as this method's default return value with the return type `List[str]`. However the new `default` option does not enforce that the user-provided default is actually `List[str]`. * Restore test case --- spacy/tests/doc/test_morphanalysis.py | 2 ++ spacy/tokens/morphanalysis.pyi | 4 ++-- spacy/tokens/morphanalysis.pyx | 6 +++++- website/docs/api/morphology.mdx | 9 +++++---- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 918d4acdc..49e32b936 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -33,6 +33,8 @@ def test_token_morph_key(i_has): def test_morph_props(i_has): assert i_has[0].morph.get("PronType") == ["prs"] assert i_has[1].morph.get("PronType") == [] + assert i_has[1].morph.get("AsdfType", ["asdf"]) == ["asdf"] + assert i_has[1].morph.get("AsdfType", default=["asdf", "qwer"]) == ["asdf", "qwer"] def test_morph_iter(i_has): diff --git a/spacy/tokens/morphanalysis.pyi b/spacy/tokens/morphanalysis.pyi index b86203cc4..a5376e80d 100644 --- a/spacy/tokens/morphanalysis.pyi +++ b/spacy/tokens/morphanalysis.pyi @@ -1,4 +1,4 @@ -from typing import Any, Dict, Iterator, List, Union +from typing import Any, Dict, Iterator, List, Optional, Union from ..vocab import Vocab class MorphAnalysis: @@ -13,7 +13,7 @@ class MorphAnalysis: def __hash__(self) -> int: ... def __eq__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override] def __ne__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override] - def get(self, field: Any) -> List[str]: ... + def get(self, field: Any, default: Optional[List[str]]) -> List[str]: ... def to_json(self) -> str: ... def to_dict(self) -> Dict[str, str]: ... def __str__(self) -> str: ... diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index a7d1f2e44..baa3800a1 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -58,10 +58,14 @@ cdef class MorphAnalysis: def __ne__(self, other): return self.key != other.key - def get(self, field): + def get(self, field, default=None): """Retrieve feature values by field.""" cdef attr_t field_id = self.vocab.strings.as_int(field) cdef np.ndarray results = get_by_field(&self.c, field_id) + if len(results) == 0: + if default is None: + default = [] + return default features = [self.vocab.strings[result] for result in results] return [f.split(Morphology.FIELD_SEP)[1] for f in features] diff --git a/website/docs/api/morphology.mdx b/website/docs/api/morphology.mdx index 68d80b814..5d4affafe 100644 --- a/website/docs/api/morphology.mdx +++ b/website/docs/api/morphology.mdx @@ -213,10 +213,11 @@ Retrieve values for a feature by field. > assert morph.get("Feat1") == ["Val1", "Val2"] > ``` -| Name | Description | -| ----------- | ------------------------------------------------ | -| `field` | The field to retrieve. ~~str~~ | -| **RETURNS** | A list of the individual features. ~~List[str]~~ | +| Name | Description | +| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `field` | The field to retrieve. ~~str~~ | +| `default` 3.6 | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ | +| **RETURNS** | A list of the individual features. ~~List[str]~~ | ### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"} From 664a53ffbeedb95f1d98d7c8c57b94e365735051 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 21 Apr 2023 15:05:53 +0200 Subject: [PATCH 07/21] CI: Disable Azure (#12560) --- .github/azure-steps.yml | 118 --------------------------------------- azure-pipelines.yml | 120 ---------------------------------------- 2 files changed, 238 deletions(-) delete mode 100644 .github/azure-steps.yml delete mode 100644 azure-pipelines.yml diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml deleted file mode 100644 index 20d4582cb..000000000 --- a/.github/azure-steps.yml +++ /dev/null @@ -1,118 +0,0 @@ -parameters: - python_version: '' - architecture: 'x64' - num_build_jobs: 2 - -steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: ${{ parameters.python_version }} - architecture: ${{ parameters.architecture }} - allowUnstable: true - - - bash: | - echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}" - displayName: 'Set variables' - - - script: | - python -m pip install -U build pip setuptools - python -m pip install -U -r requirements.txt - displayName: "Install dependencies" - - - script: | - python -m build --sdist - displayName: "Build sdist" - - - script: | - python -m mypy spacy - displayName: 'Run mypy' - condition: ne(variables['python_version'], '3.6') - - - task: DeleteFiles@1 - inputs: - contents: "spacy" - displayName: "Delete source directory" - - - task: DeleteFiles@1 - inputs: - contents: "*.egg-info" - displayName: "Delete egg-info directory" - - - script: | - python -m pip freeze > installed.txt - python -m pip uninstall -y -r installed.txt - displayName: "Uninstall all packages" - - - bash: | - SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) - SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST - displayName: "Install from sdist" - - - script: | - python -W error -c "import spacy" - displayName: "Test import" - - - script: | - python -m spacy download ca_core_news_sm - python -m spacy download ca_core_news_md - python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" - displayName: 'Test download CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -W error -m spacy info ca_core_news_sm | grep -q download_url - displayName: 'Test download_url in info CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" - displayName: 'Test no warnings on load (#11713)' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . - displayName: 'Test convert CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -m spacy init config -p ner -l ca ner.cfg - python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy - displayName: 'Test debug config CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - # will have errors due to sparse data, check for summary in output - python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary - displayName: 'Test debug data CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 - displayName: 'Test train CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" - PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir - displayName: 'Test assemble CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" - python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 - displayName: 'Test assemble CLI vectors warning' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -m pip install -U -r requirements.txt - displayName: "Install test requirements" - - - script: | - python -m pytest --pyargs spacy -W error - displayName: "Run CPU tests" - - - script: | - python -m pip install 'spacy[apple]' - python -m pytest --pyargs spacy - displayName: "Run CPU tests with thinc-apple-ops" - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index 83c57a164..000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,120 +0,0 @@ -trigger: - batch: true - branches: - include: - - "*" - exclude: - - "spacy.io" - - "nightly.spacy.io" - - "v2.spacy.io" - paths: - exclude: - - "website/*" - - "*.md" - - "*.mdx" - - ".github/workflows/*" -pr: - paths: - exclude: - - "*.md" - - "*.mdx" - - "website/docs/*" - - "website/src/*" - - "website/meta/*.tsx" - - "website/meta/*.mjs" - - "website/meta/languages.json" - - "website/meta/site.json" - - "website/meta/sidebars.json" - - "website/meta/type-annotations.json" - - "website/pages/*" - - ".github/workflows/*" - -jobs: - # Check formatting and linting. Perform basic checks for most important errors - # (syntax etc.) Uses the config defined in setup.cfg and overwrites the - # selected codes. - - job: "Validate" - pool: - vmImage: "ubuntu-latest" - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: "3.7" - - script: | - pip install black -c requirements.txt - python -m black spacy --check - displayName: "black" - - script: | - pip install flake8==5.0.4 - python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics - displayName: "flake8" - - script: | - python .github/validate_universe_json.py website/meta/universe.json - displayName: 'Validate website/meta/universe.json' - - - job: "Test" - dependsOn: "Validate" - strategy: - matrix: - # We're only running one platform per Python version to speed up builds - Python36Linux: - imageName: "ubuntu-20.04" - python.version: "3.6" - # Python36Windows: - # imageName: "windows-latest" - # python.version: "3.6" - # Python36Mac: - # imageName: "macos-latest" - # python.version: "3.6" - # Python37Linux: - # imageName: "ubuntu-20.04" - # python.version: "3.7" - Python37Windows: - imageName: "windows-latest" - python.version: "3.7" - # Python37Mac: - # imageName: "macos-latest" - # python.version: "3.7" - # Python38Linux: - # imageName: "ubuntu-latest" - # python.version: "3.8" - # Python38Windows: - # imageName: "windows-latest" - # python.version: "3.8" - Python38Mac: - imageName: "macos-latest" - python.version: "3.8" - Python39Linux: - imageName: "ubuntu-latest" - python.version: "3.9" - # Python39Windows: - # imageName: "windows-latest" - # python.version: "3.9" - # Python39Mac: - # imageName: "macos-latest" - # python.version: "3.9" - # Python310Linux: - # imageName: "ubuntu-latest" - # python.version: "3.10" - Python310Windows: - imageName: "windows-latest" - python.version: "3.10" - # Python310Mac: - # imageName: "macos-latest" - # python.version: "3.10" - Python311Linux: - imageName: 'ubuntu-latest' - python.version: '3.11' - Python311Windows: - imageName: 'windows-latest' - python.version: '3.11' - Python311Mac: - imageName: 'macos-latest' - python.version: '3.11' - maxParallel: 4 - pool: - vmImage: $(imageName) - steps: - - template: .github/azure-steps.yml - parameters: - python_version: '$(python.version)' From e9945ccd0458bb3d9785a4bb5bf94ba4a5ddaf88 Mon Sep 17 00:00:00 2001 From: moxley01 Date: Tue, 25 Apr 2023 12:30:19 +0200 Subject: [PATCH 08/21] add spacysee project (#12568) --- website/meta/universe.json | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 1d2881f9c..b91d7dada 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,32 @@ { "resources": [ + { + "id": "spacysee", + "title": "spaCysee", + "slogan": "Visualize spaCy's Dependency Parsing, POS tagging, and morphological analysis", + "description": "A project that helps you visualize your spaCy docs in Jupyter notebooks. Each of the dependency tags, POS tags and morphological features are clickable. Clicking on a tag will bring up the relevant documentation for that tag.", + "github": "moxley01/spacysee", + "pip": "spacysee", + "code_example": [ + "import spacy", + "from spacysee import render", + "", + "nlp = spacy.load('en_core_web_sm')", + "doc = nlp('This is a neat way to visualize your spaCy docs')", + "render(doc, width='500', height='500')" + ], + "code_language": "python", + "thumb": "https://www.mattoxley.com/static/images/spacysee_logo.svg", + "image": "https://www.mattoxley.com/static/images/spacysee_logo.svg", + "author": "Matt Oxley", + "author_links": { + "twitter": "matt0xley", + "github": "moxley01", + "website": "https://mattoxley.com" + }, + "category": ["visualizers"], + "tags": ["visualization"] + }, { "id": "grecy", "title": "greCy", From 1f8f91055483943e486651d1daddc1a8039d6bb5 Mon Sep 17 00:00:00 2001 From: Victoria <80417010+victorialslocum@users.noreply.github.com> Date: Wed, 26 Apr 2023 14:18:40 +0200 Subject: [PATCH 09/21] Add spacy-wasm to universe (#12572) * add spacy-wasm to universe * add tag --- website/meta/universe.json | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index b91d7dada..4067c4d1e 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,21 @@ { "resources": [ + { + "id": "spacy-wasm", + "title": "spacy-wasm", + "slogan": "spaCy in the browser using WebAssembly", + "description": "Run spaCy directly in the browser with WebAssembly. Using Pyodide, the application loads the spaCy model and renders the text prompt with displaCy.", + "url": "https://spacy-wasm.vercel.app/", + "github": "SyedAhkam/spacy-wasm", + "code_language": "python", + "author": "Syed Ahkam", + "author_links": { + "twitter": "@SyedAhkam1", + "github": "SyedAhkam" + }, + "category": ["visualizers"], + "tags": ["visualization", "deployment"] + }, { "id": "spacysee", "title": "spaCysee", From 0de1f8bf73439ab1cc66a6b8c12164d2d034db07 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Thu, 27 Apr 2023 15:27:13 +0200 Subject: [PATCH 10/21] Spancat speed improvement (#12577) * avoid nesting then flattening * mypy fix * Apply suggestions from code review * Add type for indices * Run full matrix for mypy * Add back modified type: ignore * Revert "Run full matrix for mypy" This reverts commit e218873d049d5634e6faa0341ada9af5d53b5a29. --------- Co-authored-by: Adriane Boyd --- spacy/ml/extract_spans.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py index d5e9bc07c..af6be78db 100644 --- a/spacy/ml/extract_spans.py +++ b/spacy/ml/extract_spans.py @@ -1,4 +1,4 @@ -from typing import Tuple, Callable +from typing import List, Tuple, Callable from thinc.api import Model, to_numpy from thinc.types import Ragged, Ints1d @@ -52,14 +52,14 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d: indices will be [5, 6, 7, 8, 8, 9]. """ spans, lengths = _ensure_cpu(spans, lengths) - indices = [] + indices: List[int] = [] offset = 0 for i, length in enumerate(lengths): spans_i = spans[i].dataXd + offset for j in range(spans_i.shape[0]): - indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) # type: ignore[call-overload, index] + indices.extend(range(spans_i[j, 0], spans_i[j, 1])) # type: ignore[arg-type, call-overload] offset += length - return ops.flatten(indices, dtype="i", ndim_if_empty=1) + return ops.asarray1i(indices) def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]: From 139368d9ce3ccec1d25bfa5cc98b148a657c7a38 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 28 Apr 2023 14:29:51 +0200 Subject: [PATCH 11/21] CI: Only run test suite once with thinc-apple-ops for macos python 3.11 (#12436) * CI: Only run test suite once with thinc-apple-ops for macos python 3.11 * Adjust syntax * Try alternate syntax * Try alternate syntax * Try alternate syntax --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f226057c9..21b660989 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -165,6 +165,7 @@ jobs: - name: "Run CPU tests" run: | python -m pytest --pyargs spacy -W error + if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')" - name: "Run CPU tests with thinc-apple-ops" run: | From 9ec12fcfde931d3ef4eae72b02d2058a8df2a933 Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Fri, 28 Apr 2023 20:32:52 +0800 Subject: [PATCH 12/21] Add spans in spacy benchmark (#12575) * Add spans in spacy benchmark The current implementation of spaCy benchmark accuracy / spacy evaluate doesn't include the "spans" type, so calling the command doesn't render the HTML displaCy file needed. This PR attempts to fix that by creating a new parameter for "spans" and calling the appropriate displaCy value. * Reformat file with black * Add tests for evaluate * Fix spans -> span for displacy style * Update test to check render instead * Update source so mypy passes * Add parser information to avoid warnings --- spacy/cli/evaluate.py | 9 ++++++ spacy/tests/test_cli.py | 66 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 8f3d6b859..363c02cd3 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -122,6 +122,8 @@ def evaluate( docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit])) render_deps = "parser" in factory_names render_ents = "ner" in factory_names + render_spans = "spancat" in factory_names + render_parses( docs, displacy_path, @@ -129,6 +131,7 @@ def evaluate( limit=displacy_limit, deps=render_deps, ents=render_ents, + spans=render_spans, ) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) @@ -182,6 +185,7 @@ def render_parses( limit: int = 250, deps: bool = True, ents: bool = True, + spans: bool = True, ): docs[0].user_data["title"] = model_name if ents: @@ -195,6 +199,11 @@ def render_parses( with (output_path / "parses.html").open("w", encoding="utf8") as file_: file_.write(html) + if spans: + html = displacy.render(docs[:limit], style="span", page=True) + with (output_path / "spans.html").open("w", encoding="utf8") as file_: + file_.write(html) + def print_prf_per_type( msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 1fdf059b3..351e6bf11 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -12,6 +12,7 @@ import srsly from click import NoSuchOption from packaging.specifiers import SpecifierSet from thinc.api import Config, ConfigValidationError +from spacy.tokens import DocBin from spacy import about from spacy.cli import info @@ -27,6 +28,7 @@ from spacy.cli.debug_data import _get_span_characteristics from spacy.cli.debug_data import _print_span_characteristics from spacy.cli.debug_data import _get_spans_length_freq_dist from spacy.cli.download import get_compatibility, get_version +from spacy.cli.evaluate import render_parses from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.init_pipeline import _init_labels from spacy.cli.package import get_third_party_dependencies @@ -144,6 +146,70 @@ def test_issue11235(): assert cfg["commands"][0]["script"][0] == f"hello {lang_var}" +@pytest.mark.issue(12566) +@pytest.mark.parametrize( + "factory,output_file", + [("deps", "parses.html"), ("ents", "entities.html"), ("spans", "spans.html")], +) +def test_issue12566(factory: str, output_file: str): + """ + Test if all displaCy types (ents, dep, spans) produce an HTML file + """ + with make_tempdir() as tmp_dir: + # Create sample spaCy file + doc_json = { + "ents": [ + {"end": 54, "label": "nam_adj_country", "start": 44}, + {"end": 83, "label": "nam_liv_person", "start": 69}, + {"end": 100, "label": "nam_pro_title_book", "start": 86}, + ], + "spans": { + "sc": [ + {"end": 54, "kb_id": "", "label": "nam_adj_country", "start": 44}, + {"end": 83, "kb_id": "", "label": "nam_liv_person", "start": 69}, + { + "end": 100, + "kb_id": "", + "label": "nam_pro_title_book", + "start": 86, + }, + ] + }, + "text": "Niedawno czytał em nową książkę znakomitego szkockiego medioznawcy , " + "Briana McNaira - Cultural Chaos .", + "tokens": [ + # fmt: off + {"id": 0, "start": 0, "end": 8, "tag": "ADV", "pos": "ADV", "morph": "Degree=Pos", "lemma": "niedawno", "dep": "advmod", "head": 1, }, + {"id": 1, "start": 9, "end": 15, "tag": "PRAET", "pos": "VERB", "morph": "Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", "lemma": "czytać", "dep": "ROOT", "head": 1, }, + {"id": 2, "start": 16, "end": 18, "tag": "AGLT", "pos": "NOUN", "morph": "Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing", "lemma": "em", "dep": "iobj", "head": 1, }, + {"id": 3, "start": 19, "end": 23, "tag": "ADJ", "pos": "ADJ", "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing", "lemma": "nowy", "dep": "amod", "head": 4, }, + {"id": 4, "start": 24, "end": 31, "tag": "SUBST", "pos": "NOUN", "morph": "Case=Acc|Gender=Fem|Number=Sing", "lemma": "książka", "dep": "obj", "head": 1, }, + {"id": 5, "start": 32, "end": 43, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Nhum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "znakomit", "dep": "acl", "head": 4, }, + {"id": 6, "start": 44, "end": 54, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Hum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "szkockiy", "dep": "amod", "head": 7, }, + {"id": 7, "start": 55, "end": 66, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "medioznawca", "dep": "iobj", "head": 5, }, + {"id": 8, "start": 67, "end": 68, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Comm", "lemma": ",", "dep": "punct", "head": 9, }, + {"id": 9, "start": 69, "end": 75, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "Brian", "dep": "nmod", "head": 4, }, + {"id": 10, "start": 76, "end": 83, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "McNair", "dep": "flat", "head": 9, }, + {"id": 11, "start": 84, "end": 85, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Dash", "lemma": "-", "dep": "punct", "head": 12, }, + {"id": 12, "start": 86, "end": 94, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Cultural", "dep": "conj", "head": 4, }, + {"id": 13, "start": 95, "end": 100, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Chaos", "dep": "flat", "head": 12, }, + {"id": 14, "start": 101, "end": 102, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Peri", "lemma": ".", "dep": "punct", "head": 1, }, + # fmt: on + ], + } + + # Create a .spacy file + nlp = spacy.blank("pl") + doc = Doc(nlp.vocab).from_json(doc_json) + + # Run the evaluate command and check if the html files exist + render_parses( + docs=[doc], output_path=tmp_dir, model_name="", limit=1, **{factory: True} + ) + + assert (tmp_dir / output_file).is_file() + + def test_cli_info(): nlp = Dutch() nlp.add_pipe("textcat") From 4e1db35f6e7796f335da020cce2549f84f8f51c6 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Tue, 2 May 2023 03:51:13 -0700 Subject: [PATCH 13/21] Update inmemorylookupkb.mdx (#12586) Example does not refer to the in memory lookup --- website/docs/api/inmemorylookupkb.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx index c24fe78d6..15b1d3bf2 100644 --- a/website/docs/api/inmemorylookupkb.mdx +++ b/website/docs/api/inmemorylookupkb.mdx @@ -292,7 +292,7 @@ Restore the state of the knowledge base from a given directory. Note that the > ```python > from spacy.vocab import Vocab > vocab = Vocab().from_disk("/path/to/vocab") -> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64) +> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64) > kb.from_disk("/path/to/kb") > ``` From 42e5043816e241dd965fd522386336eadca7e49b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 4 May 2023 17:13:12 +0200 Subject: [PATCH 14/21] Remove #egg from download URLs (#12567) The current URLs will become invalid in pip 25.0. According to the pip docs, the egg= URLs are currently only needed for editable VCS installs. --- spacy/cli/download.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 0c9a32b93..df4bca53d 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -81,11 +81,8 @@ def download( def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str: dl_tpl = "{m}-{v}/{m}-{v}{s}" - egg_tpl = "#egg={m}=={v}" suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX filename = dl_tpl.format(m=model_name, v=version, s=suffix) - if sdist: - filename += egg_tpl.format(m=model_name, v=version) return filename From 2cfbc1209d07eac704dab890425ba2bbc185a2ad Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 8 May 2023 16:51:58 +0200 Subject: [PATCH 15/21] In initialize only calculate current vectors hash if needed (#12607) --- spacy/training/initialize.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index e90617852..9cf759c55 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -133,10 +133,11 @@ def init_vocab( logger.info("Added vectors: %s", vectors) # warn if source model vectors are not identical sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) - vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) - for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items(): - if vectors_hash != sourced_vectors_hash: - warnings.warn(Warnings.W113.format(name=sourced_component)) + if len(sourced_vectors_hashes) > 0: + vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) + for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items(): + if vectors_hash != sourced_vectors_hash: + warnings.warn(Warnings.W113.format(name=sourced_component)) logger.info("Finished initializing nlp object") From 7ae4fc19a137b60eb198cba55536b0780068463f Mon Sep 17 00:00:00 2001 From: "Patrick J. Burns" Date: Tue, 9 May 2023 06:02:45 -0400 Subject: [PATCH 16/21] Add LatinCy models to universe.json (#12597) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add LatinCy models to universe.json * Update website/meta/universe.json Add install code for LatinCy models to 'code_example' Co-authored-by: Adriane Boyd * Update LatinCy ‘code_example’ in website/meta/universe.json Co-authored-by: Adriane Boyd --------- Co-authored-by: Adriane Boyd --- website/meta/universe.json | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 4067c4d1e..05877cfc6 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,32 @@ { "resources": [ + { + "id": "latincy", + "title": "LatinCy", + "thumb": "https://raw.githubusercontent.com/diyclassics/la_core_web_lg/main/latincy-logo.png", + "slogan": "Synthetic trained spaCy pipelines for Latin NLP", + "description": "Set of trained general purpose Latin-language 'core' pipelines for use with spaCy. The models are trained on a large amount of available Latin data, including all five of the Latin Universal Dependency treebanks, which have been preprocessed to be compatible with each other.", + "url": "https://huggingface.co/latincy", + "code_example": [ + "# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl", + "import spacy", + "nlp = spacy.load('la_core_web_lg')", + "doc = nlp('Haec narranatur a poetis de Perseo')", + "", + "print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')", + "", + "# > Haec, haec, hic, DET" + ], + "code_language": "python", + "author": "Patrick J. Burns", + "author_links": { + "twitter": "@diyclassics", + "github": "diyclassics", + "website": "https://diyclassics.github.io/" + }, + "category": ["pipeline", "research"], + "tags": ["latin"] + }, { "id": "spacy-wasm", "title": "spacy-wasm", From 54d9198e62cbeb3755e64510aff9a7681b6d938a Mon Sep 17 00:00:00 2001 From: "Patrick J. Burns" Date: Tue, 9 May 2023 09:52:34 -0400 Subject: [PATCH 17/21] Fix typo (#12615) --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 05877cfc6..b39ebb528 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -11,7 +11,7 @@ "# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl", "import spacy", "nlp = spacy.load('la_core_web_lg')", - "doc = nlp('Haec narranatur a poetis de Perseo')", + "doc = nlp('Haec narrantur a poetis de Perseo')", "", "print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')", "", From 81488fa88bf9bfa26b00c9d83b7aa99047eb64f3 Mon Sep 17 00:00:00 2001 From: David Berenstein Date: Wed, 10 May 2023 13:16:16 +0200 Subject: [PATCH 18/21] chore: added adept-augmentations to the spacy universe (#12609) * chore: added adept-augmentations to the spacy universe * Apply suggestions from code review Co-authored-by: Basile Dura * Update universe.json --------- Co-authored-by: Basile Dura --- website/meta/universe.json | 50 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index b39ebb528..e36ba5676 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2837,6 +2837,56 @@ "tags": ["coreference", "multi-lingual", "cross-lingual", "allennlp"], "spacy_version": 3 }, + { + "id": "adeptaugmentations", + "title": "Adept Augmentations", + "slogan": " A Python library aimed at dissecting and augmenting NER training data for a few-shot scenario.", + "description": "EntitySwapAugmenter takes either a `datasets.Dataset` or a `spacy.tokens.DocBin`. Additionally, it is optional to provide a set of labels. It initially creates a knowledge base of entities belonging to a certain label. When running `augmenter.augment()` for N runs, it then creates N new sentences with random swaps of the original entities with an entity of the same corresponding label from the knowledge base.\n\nFor example, assuming that we have knowledge base for `PERSONS`, `LOCATIONS` and `PRODUCTS`. We can then create additional data for the sentence \"Momofuko Ando created instant noodles in Osaka.\" using `augmenter.augment(N=2)`, resulting in \"David created instant noodles in Madrid.\" or \"Tom created Adept Augmentations in the Netherlands\".", + "github": "davidberenstein1957/adept-augmentations", + "pip": "adept-augmentations", + "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/logo.png", + "code_example": [ + "import spacy", + "from spacy.tokens import DocBin", + "", + "from adept_augmentations import EntitySwapAugmenter", + "", + "nlp = spacy.load(\"en_core_web_sm\")", + "", + "TRAIN_DATA = [", + " \"Apple is looking at buying U.K. startup for $1 billion\",", + " \"Microsoft acquires GitHub for $7.5 billion\"", + "]", + "docs = nlp.pipe(TRAIN_DATA)", + "", + "# Create a new DocBin", + "doc_bin = DocBin(docs=docs)", + "", + "# Augment Data", + "doc_bin = EntitySwapAugmenter(doc_bin).augment(4)", + "for doc in doc_bin.get_docs(nlp.vocab):", + " print(doc.text)", + "", + "# Output", + "#", + "# GitHub is looking at buying U.K. startup for $ 7.5 billion", + "# Microsoft is looking at buying U.K. startup for $ 1 billion", + "# Microsoft is looking at buying U.K. startup for $ 7.5 billion", + "# GitHub is looking at buying U.K. startup for $ 1 billion", + "# Microsoft acquires Apple for $ 7.5 billion", + "# Apple acquires Microsoft for $ 1 billion", + "# Microsoft acquires Microsoft for $ 7.5 billion", + "# GitHub acquires GitHub for $ 1 billion" + ], + "author": "David Berenstein", + "author_links": { + "github": "davidberenstein1957", + "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/" + }, + "category": ["standalone"], + "tags": ["ner", "few-shot", "augmentation", "datasets", "training"], + "spacy_version": 3 + }, { "id": "blackstone", "title": "Blackstone", From 7c49d251c7ca8860b1bc926b493c302aaa074d76 Mon Sep 17 00:00:00 2001 From: royashcenazi <37100955+royashcenazi@users.noreply.github.com> Date: Wed, 10 May 2023 14:19:28 +0300 Subject: [PATCH 19/21] parsigs universe (#12616) * parsigs universe * added model installation explanation in the description * Update website/meta/universe.json Co-authored-by: Basile Dura * added model installement instruction in the code example --------- Co-authored-by: Basile Dura --- website/meta/universe.json | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index e36ba5676..f2b199275 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,28 @@ { "resources": [ + { + "id": "parsigs", + "title": "parsigs", + "slogan": "Structuring prescriptions text made simple using spaCy", + "description": "Parsigs is an open-source project that aims to extract the relevant dosage information from prescriptions text without compromising the patient's privacy.\n\nNotice you also need to install the model in order to use the package: `pip install https://huggingface.co/royashcenazi/en_parsigs/resolve/main/en_parsigs-any-py3-none-any.whl`", + "github": "royashcenazi/parsigs", + "pip": "parsigs", + "code_language": "python", + "author": "Roy Ashcenazi", + "code_example": [ + "# You'll need to install the trained model, see instructions in the description section", + "from parsigs.parse_sig_api import StructuredSig, SigParser", + "sig_parser = SigParser()", + "", + "sig = 'Take 1 tablet of ibuprofen 200mg 3 times every day for 3 weeks'", + "parsed_sig = sig_parser.parse(sig)" + ], + "author_links": { + "github": "royashcenazi" + }, + "category": ["model", "research"], + "tags": ["sigs", "prescription","pharma"] + }, { "id": "latincy", "title": "LatinCy", @@ -26,7 +49,7 @@ }, "category": ["pipeline", "research"], "tags": ["latin"] - }, + }, { "id": "spacy-wasm", "title": "spacy-wasm", From 931a46308f4eac91223b7867a956e7dd43eeb87d Mon Sep 17 00:00:00 2001 From: royashcenazi <37100955+royashcenazi@users.noreply.github.com> Date: Wed, 10 May 2023 14:49:51 +0300 Subject: [PATCH 20/21] Parsigs universe 3 (#12617) * parsigs universe * added model installation explanation in the description * Update website/meta/universe.json Co-authored-by: Basile Dura * added model installement instruction in the code example * added biomedical category --------- Co-authored-by: Basile Dura --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index f2b199275..33185ca30 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -20,7 +20,7 @@ "author_links": { "github": "royashcenazi" }, - "category": ["model", "research"], + "category": ["model", "research", "biomedical"], "tags": ["sigs", "prescription","pharma"] }, { From 9beaec6a030e71d9c2b31fa9c0ba0f6230a79619 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 12 May 2023 00:40:28 -0700 Subject: [PATCH 21/21] docs: remove invalid huggingface-hub push argument (#12624) --- website/docs/api/cli.mdx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 323ea2a92..05328b7eb 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1640,7 +1640,7 @@ with [`spacy package`](/api/cli#package) and `--build wheel`. For more details, see the spaCy project [integration](/usage/projects#huggingface_hub). ```bash -$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] [--verbose] +$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose] ``` > #### Example @@ -1654,6 +1654,5 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] | `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ | | `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ | | `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ | -| `--local-repo`, `-l` | Local path to the model repository (will be created if it doesn't exist). Defaults to `hub` in the current working directory. ~~Path (option)~~ | | `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ | | **UPLOADS** | The pipeline to the hub. |