From 8e6a3d58d8fa092eede0fe323441b2aaa3c2042e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 19 Apr 2023 10:59:33 +0200 Subject: [PATCH 1/3] fix typo (#12543) --- spacy/cli/debug_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 97b4db285..2826cd084 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -337,7 +337,7 @@ def debug_data( show=verbose, ) else: - msg.good("Examples without ocurrences available for all labels") + msg.good("Examples without occurrences available for all labels") if "ner" in factory_names: # Get all unique NER labels present in the data From dc0a1a98086ac038bf62221d0483b9933d5d0260 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Apr 2023 11:30:34 +0200 Subject: [PATCH 2/3] Load exceptions last in Tokenizer.from_bytes (#12553) In `Tokenizer.from_bytes`, the exceptions should be loaded last so that they are only processed once as part of loading the model. The exceptions are tokenized as phrase matcher patterns in the background and the internal tokenization needs to be synced with all the remaining tokenizer settings. If the exceptions are not loaded last, there are speed regressions for `Tokenizer.from_bytes/disk` vs. `Tokenizer.add_special_case` as the caches are reloaded more than necessary during deserialization. --- spacy/tokenizer.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 0e75b5f7a..a4a68ae8e 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -834,10 +834,12 @@ cdef class Tokenizer: self.token_match = re.compile(data["token_match"]).match if "url_match" in data and isinstance(data["url_match"], str): self.url_match = re.compile(data["url_match"]).match - if "rules" in data and isinstance(data["rules"], dict): - self.rules = data["rules"] if "faster_heuristics" in data: self.faster_heuristics = data["faster_heuristics"] + # always load rules last so that all other settings are set before the + # internal tokenization for the phrase matcher + if "rules" in data and isinstance(data["rules"], dict): + self.rules = data["rules"] return self From b60b027927d734db627cf12b040fb75d9cb8894a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Apr 2023 14:06:32 +0200 Subject: [PATCH 3/3] Add default option to MorphAnalysis.get (#12545) * Add default to MorphAnalysis.get Similar to `dict`, allow a `default` option for `MorphAnalysis.get` for the user to provide a default return value if the field is not found. The default return value remains `[]`, which is not the same as `dict.get`, but is already established as this method's default return value with the return type `List[str]`. However the new `default` option does not enforce that the user-provided default is actually `List[str]`. * Restore test case --- spacy/tests/doc/test_morphanalysis.py | 2 ++ spacy/tokens/morphanalysis.pyi | 4 ++-- spacy/tokens/morphanalysis.pyx | 6 +++++- website/docs/api/morphology.mdx | 9 +++++---- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 918d4acdc..49e32b936 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -33,6 +33,8 @@ def test_token_morph_key(i_has): def test_morph_props(i_has): assert i_has[0].morph.get("PronType") == ["prs"] assert i_has[1].morph.get("PronType") == [] + assert i_has[1].morph.get("AsdfType", ["asdf"]) == ["asdf"] + assert i_has[1].morph.get("AsdfType", default=["asdf", "qwer"]) == ["asdf", "qwer"] def test_morph_iter(i_has): diff --git a/spacy/tokens/morphanalysis.pyi b/spacy/tokens/morphanalysis.pyi index b86203cc4..a5376e80d 100644 --- a/spacy/tokens/morphanalysis.pyi +++ b/spacy/tokens/morphanalysis.pyi @@ -1,4 +1,4 @@ -from typing import Any, Dict, Iterator, List, Union +from typing import Any, Dict, Iterator, List, Optional, Union from ..vocab import Vocab class MorphAnalysis: @@ -13,7 +13,7 @@ class MorphAnalysis: def __hash__(self) -> int: ... def __eq__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override] def __ne__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override] - def get(self, field: Any) -> List[str]: ... + def get(self, field: Any, default: Optional[List[str]]) -> List[str]: ... def to_json(self) -> str: ... def to_dict(self) -> Dict[str, str]: ... def __str__(self) -> str: ... diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index a7d1f2e44..baa3800a1 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -58,10 +58,14 @@ cdef class MorphAnalysis: def __ne__(self, other): return self.key != other.key - def get(self, field): + def get(self, field, default=None): """Retrieve feature values by field.""" cdef attr_t field_id = self.vocab.strings.as_int(field) cdef np.ndarray results = get_by_field(&self.c, field_id) + if len(results) == 0: + if default is None: + default = [] + return default features = [self.vocab.strings[result] for result in results] return [f.split(Morphology.FIELD_SEP)[1] for f in features] diff --git a/website/docs/api/morphology.mdx b/website/docs/api/morphology.mdx index 68d80b814..5d4affafe 100644 --- a/website/docs/api/morphology.mdx +++ b/website/docs/api/morphology.mdx @@ -213,10 +213,11 @@ Retrieve values for a feature by field. > assert morph.get("Feat1") == ["Val1", "Val2"] > ``` -| Name | Description | -| ----------- | ------------------------------------------------ | -| `field` | The field to retrieve. ~~str~~ | -| **RETURNS** | A list of the individual features. ~~List[str]~~ | +| Name | Description | +| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `field` | The field to retrieve. ~~str~~ | +| `default` 3.6 | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ | +| **RETURNS** | A list of the individual features. ~~List[str]~~ | ### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"}