From 02259fa1954d7912f30e34495b27b304ec93d7ce Mon Sep 17 00:00:00 2001 From: andyjessen <62343929+andyjessen@users.noreply.github.com> Date: Fri, 7 Apr 2023 07:31:04 -0600 Subject: [PATCH 01/11] Add category to spaCy project (#12506) ScispaCy fits within biomedical domain. Consider adding this category. --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 5fd1c2287..1d2881f9c 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1555,7 +1555,7 @@ "twitter": "allenai_org", "website": "http://allenai.org" }, - "category": ["scientific", "models", "research"] + "category": ["scientific", "models", "research", "biomedical"] }, { "id": "textacy", From 119f9592181e421372ad3511b6f63354247cdc34 Mon Sep 17 00:00:00 2001 From: TAN Long <71320000+tanloong@users.noreply.github.com> Date: Mon, 17 Apr 2023 19:14:01 +0800 Subject: [PATCH 02/11] docs(REL_OP): modify docs for REL_OPs to match Semgrex's update on CoreNLP v4.5.2 (#12531) Co-authored-by: Tan Long --- website/docs/api/dependencymatcher.mdx | 44 +++++++++++----------- website/docs/usage/rule-based-matching.mdx | 44 +++++++++++----------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx index 14e0916d1..d0971da55 100644 --- a/website/docs/api/dependencymatcher.mdx +++ b/website/docs/api/dependencymatcher.mdx @@ -68,28 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. 
| -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | -| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. 
| -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | +| Symbol | Description | +| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. 
| +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. | ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"} diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 7e88bdc1f..39be5f47b 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -1096,28 +1096,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. 
`A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | -| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | +| Symbol | Description | +| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. 
`A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. 
| ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} From 923d24e885ec478269a618d2a582338d0d9babc2 Mon Sep 17 00:00:00 2001 From: TAN Long <71320000+tanloong@users.noreply.github.com> Date: Mon, 17 Apr 2023 19:16:34 +0800 Subject: [PATCH 03/11] perf(REL_OP): Replace some token.children with token.rights or token.lefts (#12528) Co-authored-by: Tan Long --- spacy/matcher/dependencymatcher.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index adf96702b..48fb3eb2a 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -432,22 +432,22 @@ cdef class DependencyMatcher: return [doc[child.i] for child in doc[node].head.children if child.i < node] def _imm_right_child(self, doc, node): - for child in doc[node].children: + for child in doc[node].rights: if child.i == node + 1: return [doc[child.i]] return [] def _imm_left_child(self, doc, node): - for child in doc[node].children: + for child in doc[node].lefts: if child.i == node - 1: return [doc[child.i]] return [] def _right_child(self, doc, node): - return [doc[child.i] for child in doc[node].children if child.i > node] + return [child for child in doc[node].rights] def _left_child(self, doc, node): - return [doc[child.i] for child in doc[node].children if child.i < node] + return [child for child in doc[node].lefts] def _imm_right_parent(self, doc, node): if doc[node].head.i == node + 1: From 8e6a3d58d8fa092eede0fe323441b2aaa3c2042e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 19 Apr 2023 10:59:33 +0200 Subject: [PATCH 04/11] fix typo (#12543) --- spacy/cli/debug_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 97b4db285..2826cd084 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -337,7 +337,7 @@ def debug_data( show=verbose, ) else: - 
msg.good("Examples without ocurrences available for all labels") + msg.good("Examples without occurrences available for all labels") if "ner" in factory_names: # Get all unique NER labels present in the data From dc0a1a98086ac038bf62221d0483b9933d5d0260 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Apr 2023 11:30:34 +0200 Subject: [PATCH 05/11] Load exceptions last in Tokenizer.from_bytes (#12553) In `Tokenizer.from_bytes`, the exceptions should be loaded last so that they are only processed once as part of loading the model. The exceptions are tokenized as phrase matcher patterns in the background and the internal tokenization needs to be synced with all the remaining tokenizer settings. If the exceptions are not loaded last, there are speed regressions for `Tokenizer.from_bytes/disk` vs. `Tokenizer.add_special_case` as the caches are reloaded more than necessary during deserialization. --- spacy/tokenizer.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 0e75b5f7a..a4a68ae8e 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -834,10 +834,12 @@ cdef class Tokenizer: self.token_match = re.compile(data["token_match"]).match if "url_match" in data and isinstance(data["url_match"], str): self.url_match = re.compile(data["url_match"]).match - if "rules" in data and isinstance(data["rules"], dict): - self.rules = data["rules"] if "faster_heuristics" in data: self.faster_heuristics = data["faster_heuristics"] + # always load rules last so that all other settings are set before the + # internal tokenization for the phrase matcher + if "rules" in data and isinstance(data["rules"], dict): + self.rules = data["rules"] return self From b60b027927d734db627cf12b040fb75d9cb8894a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Apr 2023 14:06:32 +0200 Subject: [PATCH 06/11] Add default option to MorphAnalysis.get (#12545) * Add default to MorphAnalysis.get Similar to `dict`, 
allow a `default` option for `MorphAnalysis.get` for the user to provide a default return value if the field is not found. The default return value remains `[]`, which is not the same as `dict.get`, but is already established as this method's default return value with the return type `List[str]`. However the new `default` option does not enforce that the user-provided default is actually `List[str]`. * Restore test case --- spacy/tests/doc/test_morphanalysis.py | 2 ++ spacy/tokens/morphanalysis.pyi | 4 ++-- spacy/tokens/morphanalysis.pyx | 6 +++++- website/docs/api/morphology.mdx | 9 +++++---- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 918d4acdc..49e32b936 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -33,6 +33,8 @@ def test_token_morph_key(i_has): def test_morph_props(i_has): assert i_has[0].morph.get("PronType") == ["prs"] assert i_has[1].morph.get("PronType") == [] + assert i_has[1].morph.get("AsdfType", ["asdf"]) == ["asdf"] + assert i_has[1].morph.get("AsdfType", default=["asdf", "qwer"]) == ["asdf", "qwer"] def test_morph_iter(i_has): diff --git a/spacy/tokens/morphanalysis.pyi b/spacy/tokens/morphanalysis.pyi index b86203cc4..a5376e80d 100644 --- a/spacy/tokens/morphanalysis.pyi +++ b/spacy/tokens/morphanalysis.pyi @@ -1,4 +1,4 @@ -from typing import Any, Dict, Iterator, List, Union +from typing import Any, Dict, Iterator, List, Optional, Union from ..vocab import Vocab class MorphAnalysis: @@ -13,7 +13,7 @@ class MorphAnalysis: def __hash__(self) -> int: ... def __eq__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override] def __ne__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override] - def get(self, field: Any) -> List[str]: ... + def get(self, field: Any, default: Optional[List[str]]) -> List[str]: ... def to_json(self) -> str: ... def to_dict(self) -> Dict[str, str]: ... 
def __str__(self) -> str: ... diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index a7d1f2e44..baa3800a1 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -58,10 +58,14 @@ cdef class MorphAnalysis: def __ne__(self, other): return self.key != other.key - def get(self, field): + def get(self, field, default=None): """Retrieve feature values by field.""" cdef attr_t field_id = self.vocab.strings.as_int(field) cdef np.ndarray results = get_by_field(&self.c, field_id) + if len(results) == 0: + if default is None: + default = [] + return default features = [self.vocab.strings[result] for result in results] return [f.split(Morphology.FIELD_SEP)[1] for f in features] diff --git a/website/docs/api/morphology.mdx b/website/docs/api/morphology.mdx index 68d80b814..5d4affafe 100644 --- a/website/docs/api/morphology.mdx +++ b/website/docs/api/morphology.mdx @@ -213,10 +213,11 @@ Retrieve values for a feature by field. > assert morph.get("Feat1") == ["Val1", "Val2"] > ``` -| Name | Description | -| ----------- | ------------------------------------------------ | -| `field` | The field to retrieve. ~~str~~ | -| **RETURNS** | A list of the individual features. ~~List[str]~~ | +| Name | Description | +| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `field` | The field to retrieve. ~~str~~ | +| `default` 3.6 | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ | +| **RETURNS** | A list of the individual features. ~~List[str]~~ | ### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"} From ab4ba04c32422345dc17b51fe5fa4e07ad3e7b76 Mon Sep 17 00:00:00 2001 From: "Patrick J. 
Burns" Date: Thu, 20 Apr 2023 10:55:40 -0400 Subject: [PATCH 07/11] Update LatinDefaults for lang 'la' (#12538) * Add noun chunking to la syntax iterators * Expand list of numeral, ordinal words * Expand abbreviations in la tokenizer_exceptions * Add example sents * Update spacy/lang/la/syntax_iterators.py Reorganize la syntax iterators Co-authored-by: Sofie Van Landeghem * Minor updates based on review * fix call --------- Co-authored-by: Sofie Van Landeghem --- spacy/lang/la/__init__.py | 2 + spacy/lang/la/examples.py | 22 +++++++ spacy/lang/la/lex_attrs.py | 17 +++-- spacy/lang/la/syntax_iterators.py | 85 +++++++++++++++++++++++++ spacy/lang/la/tokenizer_exceptions.py | 68 +++----------------- spacy/tests/lang/la/test_noun_chunks.py | 52 +++++++++++++++ 6 files changed, 178 insertions(+), 68 deletions(-) create mode 100644 spacy/lang/la/examples.py create mode 100644 spacy/lang/la/syntax_iterators.py create mode 100644 spacy/tests/lang/la/test_noun_chunks.py diff --git a/spacy/lang/la/__init__.py b/spacy/lang/la/__init__.py index 15b87c5b9..37164c3f3 100644 --- a/spacy/lang/la/__init__.py +++ b/spacy/lang/la/__init__.py @@ -2,12 +2,14 @@ from ...language import Language, BaseDefaults from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS +from .syntax_iterators import SYNTAX_ITERATORS class LatinDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS class Latin(Language): diff --git a/spacy/lang/la/examples.py b/spacy/lang/la/examples.py new file mode 100644 index 000000000..db8550070 --- /dev/null +++ b/spacy/lang/la/examples.py @@ -0,0 +1,22 @@ +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.la.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +# > Caes. BG 1.1 +# > Cic. De Amic. 1 +# > V. Georg. 1.1-5 +# > Gen. 1:1 +# > Galileo, Sid. 
Nunc. +# > van Schurman, Opusc. arg. 1 + +sentences = [ + "Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.", + "Q. Mucius augur multa narrare de C. Laelio socero suo memoriter et iucunde solebat nec dubitare illum in omni sermone appellare sapientem.", + "Quid faciat laetas segetes, quo sidere terram uertere, Maecenas, ulmisque adiungere uitis conueniat, quae cura boum, qui cultus habendo sit pecori, apibus quanta experientia parcis, hinc canere incipiam", + "In principio creavit Deus caelum et terram.", + "Quo sumpto, intelligatur lunaris globus, cuius maximus circulus CAF, centrum vero E, dimetiens CF, qui ad Terre diametrum est ut duo ad septem.", + "Cuicunque natura indita sunt principia, seu potentiae principiorum omnium artium, ac scientiarum, ei conveniunt omnes artes ac scientiae.", +] diff --git a/spacy/lang/la/lex_attrs.py b/spacy/lang/la/lex_attrs.py index 9efb4dd3c..9db1218a4 100644 --- a/spacy/lang/la/lex_attrs.py +++ b/spacy/lang/la/lex_attrs.py @@ -6,17 +6,16 @@ roman_numerals_compile = re.compile( r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$" ) -_num_words = set( - """ -unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem +_num_words = """unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem undecim duodecim tredecim quattuordecim quindecim sedecim septendecim duodeviginti undeviginti viginti triginta quadraginta quinquaginta sexaginta septuaginta octoginta nonaginta centum ducenti ducentae ducenta trecenti trecentae trecenta quadringenti quadringentae quadringenta quingenti quingentae quingenta sescenti sescentae sescenta septingenti septingentae septingenta octingenti octingentae octingenta nongenti nongentae nongenta mille """.split() -) -_ordinal_words = set( - """ -primus prima primum secundus secunda secundum tertius tertia tertium -""".split() -) +_num_words += 
[item.replace("v", "u") for item in _num_words] +_num_words = set(_num_words) + +_ordinal_words = """primus prima primum secundus secunda secundum tertius tertia tertium quartus quarta quartum quintus quinta quintum sextus sexta sextum septimus septima septimum octavus octava octavum nonus nona nonum decimus decima decimum undecimus undecima undecimum duodecimus duodecima duodecimum duodevicesimus duodevicesima duodevicesimum undevicesimus undevicesima undevicesimum vicesimus vicesima vicesimum tricesimus tricesima tricesimum quadragesimus quadragesima quadragesimum quinquagesimus quinquagesima quinquagesimum sexagesimus sexagesima sexagesimum septuagesimus septuagesima septuagesimum octogesimus octogesima octogesimum nonagesimus nonagesima nonagesimum centesimus centesima centesimum ducentesimus ducentesima ducentesimum trecentesimus trecentesima trecentesimum quadringentesimus quadringentesima quadringentesimum quingentesimus quingentesima quingentesimum sescentesimus sescentesima sescentesimum septingentesimus septingentesima septingentesimum octingentesimus octingentesima octingentesimum nongentesimus nongentesima nongentesimum millesimus millesima millesimum""".split() + +_ordinal_words += [item.replace("v", "u") for item in _ordinal_words] +_ordinal_words = set(_ordinal_words) def like_num(text): diff --git a/spacy/lang/la/syntax_iterators.py b/spacy/lang/la/syntax_iterators.py new file mode 100644 index 000000000..7093bacf9 --- /dev/null +++ b/spacy/lang/la/syntax_iterators.py @@ -0,0 +1,85 @@ +from typing import Union, Iterator, Tuple +from ...tokens import Doc, Span +from ...symbols import NOUN, PROPN, PRON, VERB, AUX +from ...errors import Errors + +# NB: Modified from da on suggestion from https://github.com/explosion/spaCy/issues/7457#issuecomment-800349751 [PJB] + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + def is_verb_token(tok): + return tok.pos in [VERB, AUX] + + def get_left_bound(root): + left_bound = root + 
for tok in reversed(list(root.lefts)): + if tok.dep in np_left_deps: + left_bound = tok + return left_bound + + def get_right_bound(doc, root): + right_bound = root + for tok in root.rights: + if tok.dep in np_right_deps: + right = get_right_bound(doc, tok) + if list( + filter( + lambda t: is_verb_token(t) or t.dep in stop_deps, + doc[root.i : right.i], + ) + ): + break + else: + right_bound = right + return right_bound + + def get_bounds(doc, root): + return get_left_bound(root), get_right_bound(doc, root) + + doc = doclike.doc # Ensure works on both Doc and Span. + + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + + if not len(doc): + return + + left_labels = [ + "det", + "fixed", + "nmod:poss", + "amod", + "flat", + "goeswith", + "nummod", + "appos", + ] + right_labels = [ + "fixed", + "nmod:poss", + "amod", + "flat", + "goeswith", + "nummod", + "appos", + "nmod", + "det", + ] + stop_labels = ["punct"] + + np_label = doc.vocab.strings.add("NP") + np_left_deps = [doc.vocab.strings.add(label) for label in left_labels] + np_right_deps = [doc.vocab.strings.add(label) for label in right_labels] + stop_deps = [doc.vocab.strings.add(label) for label in stop_labels] + + prev_right = -1 + for token in doclike: + if token.pos in [PROPN, NOUN, PRON]: + left, right = get_bounds(doc, token) + if left.i <= prev_right: + continue + yield left.i, right.i + 1, np_label + prev_right = right.i + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/la/tokenizer_exceptions.py b/spacy/lang/la/tokenizer_exceptions.py index 060f6e085..6d14b92c5 100644 --- a/spacy/lang/la/tokenizer_exceptions.py +++ b/spacy/lang/la/tokenizer_exceptions.py @@ -12,65 +12,15 @@ _exc = { "uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}], } -for orth in [ - "A.", - "Agr.", - "Ap.", - "C.", - "Cn.", - "D.", - "F.", - "K.", - "L.", - "M'.", - "M.", - "Mam.", - "N.", - "Oct.", - "Opet.", - "P.", - "Paul.", - "Post.", - "Pro.", - "Q.", - "S.", - "Ser.", - "Sert.", - "Sex.", - 
"St.", - "Sta.", - "T.", - "Ti.", - "V.", - "Vol.", - "Vop.", - "U.", - "Uol.", - "Uop.", - "Ian.", - "Febr.", - "Mart.", - "Apr.", - "Mai.", - "Iun.", - "Iul.", - "Aug.", - "Sept.", - "Oct.", - "Nov.", - "Nou.", - "Dec.", - "Non.", - "Id.", - "A.D.", - "Coll.", - "Cos.", - "Ord.", - "Pl.", - "S.C.", - "Suff.", - "Trib.", -]: +_abbrev_exc = """A. A.D. Aa. Aaa. Acc. Agr. Ap. Apr. April. A.U.C. Aug. C. Caes. Caess. Cc. Cn. Coll. Cons. Conss. Cos. Coss. D. D.N. Dat. Dd. Dec. Decemb. Decembr. F. Feb. Febr. Februar. Ian. Id. Imp. Impp. Imppp. Iul. Iun. K. Kal. L. M'. M. Mai. Mam. Mar. Mart. Med. N. Nn. Nob. Non. Nov. Novemb. Oct. Octob. Opet. Ord. P. Paul. Pf. Pl. Plur. Post. Pp. Prid. Pro. Procos. Q. Quint. S. S.C. Scr. Sept. Septemb. Ser. Sert. Sex. Sext. St. Sta. Suff. T. Ti. Trib. V. Vol. Vop. Vv.""".split() + +_abbrev_exc += [item.lower() for item in _abbrev_exc] +_abbrev_exc += [item.upper() for item in _abbrev_exc] +_abbrev_exc += [item.replace("v", "u").replace("V", "U") for item in _abbrev_exc] + +_abbrev_exc += ["d.N."] + +for orth in set(_abbrev_exc): _exc[orth] = [{ORTH: orth}] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/tests/lang/la/test_noun_chunks.py b/spacy/tests/lang/la/test_noun_chunks.py new file mode 100644 index 000000000..ba8f5658b --- /dev/null +++ b/spacy/tests/lang/la/test_noun_chunks.py @@ -0,0 +1,52 @@ +import pytest +from spacy.tokens import Doc + + +def test_noun_chunks_is_parsed(la_tokenizer): + """Test that noun_chunks raises Value Error for 'la' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. 
+ """ + doc = la_tokenizer("Haec est sententia.") + with pytest.raises(ValueError): + list(doc.noun_chunks) + + +LA_NP_TEST_EXAMPLES = [ + ( + "Haec narrantur a poetis de Perseo.", + ["DET", "VERB", "ADP", "NOUN", "ADP", "PROPN", "PUNCT"], + ["nsubj:pass", "ROOT", "case", "obl", "case", "obl", "punct"], + [1, 0, -1, -1, -3, -1, -5], + ["poetis", "Perseo"], + ), + ( + "Perseus autem in sinu matris dormiebat.", + ["NOUN", "ADV", "ADP", "NOUN", "NOUN", "VERB", "PUNCT"], + ["nsubj", "discourse", "case", "obl", "nmod", "ROOT", "punct"], + [5, 4, 3, -1, -1, 0, -1], + ["Perseus", "sinu matris"], + ), +] + + +@pytest.mark.parametrize( + "text,pos,deps,heads,expected_noun_chunks", LA_NP_TEST_EXAMPLES +) +def test_la_noun_chunks(la_tokenizer, text, pos, deps, heads, expected_noun_chunks): + tokens = la_tokenizer(text) + + assert len(heads) == len(pos) + doc = Doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=[head + i for i, head in enumerate(heads)], + deps=deps, + pos=pos, + ) + + noun_chunks = list(doc.noun_chunks) + assert len(noun_chunks) == len(expected_noun_chunks) + for i, np in enumerate(noun_chunks): + assert np.text == expected_noun_chunks[i] From e1154085144039637274d41a27640f634a43d54b Mon Sep 17 00:00:00 2001 From: Victoria <80417010+victorialslocum@users.noreply.github.com> Date: Fri, 21 Apr 2023 10:22:26 +0200 Subject: [PATCH 08/11] remove survey link (#12559) --- website/src/templates/index.js | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/website/src/templates/index.js b/website/src/templates/index.js index 4c10e09c5..227b25be8 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -57,15 +57,9 @@ const AlertSpace = ({ nightly, legacy }) => { ) } -// const navAlert = ( -// -// 💥 Out now: spaCy v3.5 -// -// ) - const navAlert = ( - - 💥 Take the user survey! 
+ + 💥 Out now: spaCy v3.5 ) From 68da580a4cc410a361eab433bee60bcf3fcf5068 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 21 Apr 2023 15:05:53 +0200 Subject: [PATCH 09/11] CI: Disable Azure (#12560) --- .github/azure-steps.yml | 118 --------------------------------------- azure-pipelines.yml | 120 ---------------------------------------- 2 files changed, 238 deletions(-) delete mode 100644 .github/azure-steps.yml delete mode 100644 azure-pipelines.yml diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml deleted file mode 100644 index 20d4582cb..000000000 --- a/.github/azure-steps.yml +++ /dev/null @@ -1,118 +0,0 @@ -parameters: - python_version: '' - architecture: 'x64' - num_build_jobs: 2 - -steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: ${{ parameters.python_version }} - architecture: ${{ parameters.architecture }} - allowUnstable: true - - - bash: | - echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}" - displayName: 'Set variables' - - - script: | - python -m pip install -U build pip setuptools - python -m pip install -U -r requirements.txt - displayName: "Install dependencies" - - - script: | - python -m build --sdist - displayName: "Build sdist" - - - script: | - python -m mypy spacy - displayName: 'Run mypy' - condition: ne(variables['python_version'], '3.6') - - - task: DeleteFiles@1 - inputs: - contents: "spacy" - displayName: "Delete source directory" - - - task: DeleteFiles@1 - inputs: - contents: "*.egg-info" - displayName: "Delete egg-info directory" - - - script: | - python -m pip freeze > installed.txt - python -m pip uninstall -y -r installed.txt - displayName: "Uninstall all packages" - - - bash: | - SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) - SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST - displayName: "Install from sdist" - - - script: | - python -W error -c "import spacy" - displayName: "Test import" - - - script: 
| - python -m spacy download ca_core_news_sm - python -m spacy download ca_core_news_md - python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" - displayName: 'Test download CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -W error -m spacy info ca_core_news_sm | grep -q download_url - displayName: 'Test download_url in info CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" - displayName: 'Test no warnings on load (#11713)' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . - displayName: 'Test convert CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -m spacy init config -p ner -l ca ner.cfg - python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy - displayName: 'Test debug config CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - # will have errors due to sparse data, check for summary in output - python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary - displayName: 'Test debug data CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 - displayName: 'Test train CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" - PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble 
ner_source_sm.cfg output_dir - displayName: 'Test assemble CLI' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" - python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 - displayName: 'Test assemble CLI vectors warning' - condition: eq(variables['python_version'], '3.9') - - - script: | - python -m pip install -U -r requirements.txt - displayName: "Install test requirements" - - - script: | - python -m pytest --pyargs spacy -W error - displayName: "Run CPU tests" - - - script: | - python -m pip install 'spacy[apple]' - python -m pytest --pyargs spacy - displayName: "Run CPU tests with thinc-apple-ops" - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index 83c57a164..000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,120 +0,0 @@ -trigger: - batch: true - branches: - include: - - "*" - exclude: - - "spacy.io" - - "nightly.spacy.io" - - "v2.spacy.io" - paths: - exclude: - - "website/*" - - "*.md" - - "*.mdx" - - ".github/workflows/*" -pr: - paths: - exclude: - - "*.md" - - "*.mdx" - - "website/docs/*" - - "website/src/*" - - "website/meta/*.tsx" - - "website/meta/*.mjs" - - "website/meta/languages.json" - - "website/meta/site.json" - - "website/meta/sidebars.json" - - "website/meta/type-annotations.json" - - "website/pages/*" - - ".github/workflows/*" - -jobs: - # Check formatting and linting. Perform basic checks for most important errors - # (syntax etc.) Uses the config defined in setup.cfg and overwrites the - # selected codes. 
- - job: "Validate" - pool: - vmImage: "ubuntu-latest" - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: "3.7" - - script: | - pip install black -c requirements.txt - python -m black spacy --check - displayName: "black" - - script: | - pip install flake8==5.0.4 - python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics - displayName: "flake8" - - script: | - python .github/validate_universe_json.py website/meta/universe.json - displayName: 'Validate website/meta/universe.json' - - - job: "Test" - dependsOn: "Validate" - strategy: - matrix: - # We're only running one platform per Python version to speed up builds - Python36Linux: - imageName: "ubuntu-20.04" - python.version: "3.6" - # Python36Windows: - # imageName: "windows-latest" - # python.version: "3.6" - # Python36Mac: - # imageName: "macos-latest" - # python.version: "3.6" - # Python37Linux: - # imageName: "ubuntu-20.04" - # python.version: "3.7" - Python37Windows: - imageName: "windows-latest" - python.version: "3.7" - # Python37Mac: - # imageName: "macos-latest" - # python.version: "3.7" - # Python38Linux: - # imageName: "ubuntu-latest" - # python.version: "3.8" - # Python38Windows: - # imageName: "windows-latest" - # python.version: "3.8" - Python38Mac: - imageName: "macos-latest" - python.version: "3.8" - Python39Linux: - imageName: "ubuntu-latest" - python.version: "3.9" - # Python39Windows: - # imageName: "windows-latest" - # python.version: "3.9" - # Python39Mac: - # imageName: "macos-latest" - # python.version: "3.9" - # Python310Linux: - # imageName: "ubuntu-latest" - # python.version: "3.10" - Python310Windows: - imageName: "windows-latest" - python.version: "3.10" - # Python310Mac: - # imageName: "macos-latest" - # python.version: "3.10" - Python311Linux: - imageName: 'ubuntu-latest' - python.version: '3.11' - Python311Windows: - imageName: 'windows-latest' - python.version: '3.11' - Python311Mac: - imageName: 'macos-latest' - python.version: 
'3.11' - maxParallel: 4 - pool: - vmImage: $(imageName) - steps: - - template: .github/azure-steps.yml - parameters: - python_version: '$(python.version)' From 070fa1654534590fc2b219355c77a21f07f2e10e Mon Sep 17 00:00:00 2001 From: moxley01 Date: Tue, 25 Apr 2023 12:30:19 +0200 Subject: [PATCH 10/11] add spacysee project (#12568) --- website/meta/universe.json | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 1d2881f9c..b91d7dada 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,32 @@ { "resources": [ + { + "id": "spacysee", + "title": "spaCysee", + "slogan": "Visualize spaCy's Dependency Parsing, POS tagging, and morphological analysis", + "description": "A project that helps you visualize your spaCy docs in Jupyter notebooks. Each of the dependency tags, POS tags and morphological features is clickable. Clicking on a tag will bring up the relevant documentation for that tag.", + "github": "moxley01/spacysee", + "pip": "spacysee", + "code_example": [ + "import spacy", + "from spacysee import render", + "", + "nlp = spacy.load('en_core_web_sm')", + "doc = nlp('This is a neat way to visualize your spaCy docs')", + "render(doc, width='500', height='500')" + ], + "code_language": "python", + "thumb": "https://www.mattoxley.com/static/images/spacysee_logo.svg", + "image": "https://www.mattoxley.com/static/images/spacysee_logo.svg", + "author": "Matt Oxley", + "author_links": { + "twitter": "matt0xley", + "github": "moxley01", + "website": "https://mattoxley.com" + }, + "category": ["visualizers"], + "tags": ["visualization"] + }, { "id": "grecy", "title": "greCy", From a8dfc66135cdb696fdc965f2ef369ecce093db6a Mon Sep 17 00:00:00 2001 From: Victoria <80417010+victorialslocum@users.noreply.github.com> Date: Wed, 26 Apr 2023 14:18:40 +0200 Subject: [PATCH 11/11] Add spacy-wasm to universe (#12572) * add spacy-wasm to universe * add tag ---
website/meta/universe.json | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index b91d7dada..4067c4d1e 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,21 @@ { "resources": [ + { + "id": "spacy-wasm", + "title": "spacy-wasm", + "slogan": "spaCy in the browser using WebAssembly", + "description": "Run spaCy directly in the browser with WebAssembly. Using Pyodide, the application loads the spaCy model and renders the text prompt with displaCy.", + "url": "https://spacy-wasm.vercel.app/", + "github": "SyedAhkam/spacy-wasm", + "code_language": "python", + "author": "Syed Ahkam", + "author_links": { + "twitter": "SyedAhkam1", + "github": "SyedAhkam" + }, + "category": ["visualizers"], + "tags": ["visualization", "deployment"] + }, { "id": "spacysee", "title": "spaCysee",