From d0578c2ede80890ed610573c95f11ad30b2f8cd2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 3 Aug 2022 16:41:20 +0200 Subject: [PATCH 01/20] Add scorer to textcat API docs config settings (#11263) --- website/docs/api/textcategorizer.md | 1 + 1 file changed, 1 insertion(+) diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 2ff569bad..5bc40fa9e 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -84,6 +84,7 @@ architectures and their arguments and hyperparameters. | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | | `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/textcat.py From d993df41e5af01a2524fa436d27bc349ecb212b3 Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Wed, 3 Aug 2022 22:53:02 +0800 Subject: [PATCH 02/20] Update docs for pipeline initialize() methods (#11221) * Update documentation for dependency parser * Update documentation for trainable_lemmatizer * Update documentation for entity_linker * Update documentation for ner * Update documentation for morphologizer * Update documentation for senter * Update documentation for spancat * Update documentation for tagger * Update documentation for textcat * Update documentation for tok2vec * Run prettier on edited files * Apply similar changes in transformer docs * Remove need to say annotated example explicitly I removed the need to say "Must contain at least one annotated Example" because it's often a given that Examples will contain some gold-standard annotation. * Run prettier on transformer docs --- website/docs/api/dependencyparser.md | 12 ++++++------ website/docs/api/edittreelemmatizer.md | 12 ++++++------ website/docs/api/entitylinker.md | 22 +++++++++++----------- website/docs/api/entityrecognizer.md | 12 ++++++------ website/docs/api/morphologizer.md | 12 ++++++------ website/docs/api/sentencerecognizer.md | 20 ++++++++++---------- website/docs/api/spancategorizer.md | 16 ++++++++-------- website/docs/api/tagger.md | 12 ++++++------ website/docs/api/textcategorizer.md | 12 ++++++------ website/docs/api/tok2vec.md | 20 ++++++++++---------- website/docs/api/transformer.md | 20 ++++++++++---------- 11 files changed, 85 insertions(+), 85 deletions(-) diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 103e0826e..27e315592 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -158,10 +158,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and ## DependencyParser.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that -returns an iterable of [`Example`](/api/example) objects. The data examples are -used to **initialize the model** of the component and can either be the full -training data or a representative sample. 
Initialization includes validating the -network, +returns an iterable of [`Example`](/api/example) objects. **At least one example +should be supplied.** The data examples are used to **initialize the model** of +the component and can either be the full training data or a representative +sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize) and lets you customize @@ -179,7 +179,7 @@ This method was previously called `begin_training`. > > ```python > parser = nlp.add_pipe("parser") -> parser.initialize(lambda: [], nlp=nlp) +> parser.initialize(lambda: examples, nlp=nlp) > ``` > > ```ini @@ -193,7 +193,7 @@ This method was previously called `begin_training`. | Name | Description | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ | diff --git a/website/docs/api/edittreelemmatizer.md b/website/docs/api/edittreelemmatizer.md index 99a705f5e..63e4bf910 100644 --- a/website/docs/api/edittreelemmatizer.md +++ b/website/docs/api/edittreelemmatizer.md @@ -141,10 +141,10 @@ and [`pipe`](/api/edittreelemmatizer#pipe) delegate to the ## EditTreeLemmatizer.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that -returns an iterable of [`Example`](/api/example) objects. The data examples are -used to **initialize the model** of the component and can either be the full -training data or a representative sample. Initialization includes validating the -network, +returns an iterable of [`Example`](/api/example) objects. **At least one example +should be supplied.** The data examples are used to **initialize the model** of +the component and can either be the full training data or a representative +sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize) and lets you customize @@ -156,7 +156,7 @@ config. 
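The `examples` passed in via the `get_examples` callback are ordinary annotated
[`Example`](/api/example) objects. As a minimal sketch (assuming a blank English
pipeline and a single hand-annotated sentence), such a list could be built like
this before calling `initialize` as shown in the example below:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")

# One annotated Example is enough for shape inference and label setup.
doc = nlp.make_doc("She was reading")
examples = [Example.from_dict(doc, {"lemmas": ["she", "be", "read"]})]
```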
> > ```python > lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer") -> lemmatizer.initialize(lambda: [], nlp=nlp) +> lemmatizer.initialize(lambda: examples, nlp=nlp) > ``` > > ```ini @@ -170,7 +170,7 @@ config. | Name | Description | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index a55cce352..43e08a39c 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -185,10 +185,10 @@ with the current vocab. ## EntityLinker.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that -returns an iterable of [`Example`](/api/example) objects. The data examples are -used to **initialize the model** of the component and can either be the full -training data or a representative sample. Initialization includes validating the -network, +returns an iterable of [`Example`](/api/example) objects. **At least one example +should be supplied.** The data examples are used to **initialize the model** of +the component and can either be the full training data or a representative +sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize). @@ -208,15 +208,15 @@ This method was previously called `begin_training`. > > ```python > entity_linker = nlp.add_pipe("entity_linker") -> entity_linker.initialize(lambda: [], nlp=nlp, kb_loader=my_kb) +> entity_linker.initialize(lambda: examples, nlp=nlp, kb_loader=my_kb) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | -| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. 
~~Callable[[Vocab], KnowledgeBase]~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ | ## EntityLinker.predict {#predict tag="method"} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 7c153f064..a535e8316 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -154,10 +154,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and ## EntityRecognizer.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that -returns an iterable of [`Example`](/api/example) objects. The data examples are -used to **initialize the model** of the component and can either be the full -training data or a representative sample. Initialization includes validating the -network, +returns an iterable of [`Example`](/api/example) objects. **At least one example +should be supplied.** The data examples are used to **initialize the model** of +the component and can either be the full training data or a representative +sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize) and lets you customize @@ -175,7 +175,7 @@ This method was previously called `begin_training`. > > ```python > ner = nlp.add_pipe("ner") -> ner.initialize(lambda: [], nlp=nlp) +> ner.initialize(lambda: examples, nlp=nlp) > ``` > > ```ini @@ -189,7 +189,7 @@ This method was previously called `begin_training`. | Name | Description | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. 
~~Optional[Dict[str, Dict[str, int]]]~~ | diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 434c56833..f874e8bea 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -147,10 +147,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and ## Morphologizer.initialize {#initialize tag="method"} Initialize the component for training. `get_examples` should be a function that -returns an iterable of [`Example`](/api/example) objects. The data examples are -used to **initialize the model** of the component and can either be the full -training data or a representative sample. Initialization includes validating the -network, +returns an iterable of [`Example`](/api/example) objects. **At least one example +should be supplied.** The data examples are used to **initialize the model** of +the component and can either be the full training data or a representative +sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize) and lets you customize @@ -162,7 +162,7 @@ config. > > ```python > morphologizer = nlp.add_pipe("morphologizer") -> morphologizer.initialize(lambda: [], nlp=nlp) +> morphologizer.initialize(lambda: examples, nlp=nlp) > ``` > > ```ini @@ -176,7 +176,7 @@ config. | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ | diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index 29bf10393..2f50350ae 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -132,10 +132,10 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the ## SentenceRecognizer.initialize {#initialize tag="method"} Initialize the component for training. `get_examples` should be a function that -returns an iterable of [`Example`](/api/example) objects. The data examples are -used to **initialize the model** of the component and can either be the full -training data or a representative sample. Initialization includes validating the -network, +returns an iterable of [`Example`](/api/example) objects. 
**At least one example +should be supplied.** The data examples are used to **initialize the model** of +the component and can either be the full training data or a representative +sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize). @@ -144,14 +144,14 @@ by [`Language.initialize`](/api/language#initialize). > > ```python > senter = nlp.add_pipe("senter") -> senter.initialize(lambda: [], nlp=nlp) +> senter.initialize(lambda: examples, nlp=nlp) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## SentenceRecognizer.predict {#predict tag="method"} diff --git a/website/docs/api/spancategorizer.md b/website/docs/api/spancategorizer.md index f09ac8bdb..58a06bcf5 100644 --- a/website/docs/api/spancategorizer.md +++ b/website/docs/api/spancategorizer.md @@ -56,7 +56,7 @@ architectures and their arguments and hyperparameters. | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | | `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | -| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | +| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | | `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | | `max_positive` | Maximum number of labels to consider positive per span. 
Defaults to `None`, indicating no limit. ~~Optional[int]~~ | | `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | @@ -93,7 +93,7 @@ shortcut for this and instantiate the component using its string name and | `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | _keyword-only_ | | -| `spans_key` | Key of the [`Doc.spans`](/api/doc#sans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | +| `spans_key` | Key of the [`Doc.spans`](/api/doc#sans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | | `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | | `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | @@ -147,10 +147,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/spancategorizer#call) and ## SpanCategorizer.initialize {#initialize tag="method"} Initialize the component for training. `get_examples` should be a function that -returns an iterable of [`Example`](/api/example) objects. The data examples are -used to **initialize the model** of the component and can either be the full -training data or a representative sample. Initialization includes validating the -network, +returns an iterable of [`Example`](/api/example) objects. **At least one example +should be supplied.** The data examples are used to **initialize the model** of +the component and can either be the full training data or a representative +sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize) and lets you customize @@ -162,7 +162,7 @@ config. > > ```python > spancat = nlp.add_pipe("spancat") -> spancat.initialize(lambda: [], nlp=nlp) +> spancat.initialize(lambda: examples, nlp=nlp) > ``` > > ```ini @@ -176,7 +176,7 @@ config. | Name | Description | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. 
~~Optional[Language]~~ | | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index b51864d3a..90a49b197 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -130,10 +130,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and ## Tagger.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that -returns an iterable of [`Example`](/api/example) objects. The data examples are -used to **initialize the model** of the component and can either be the full -training data or a representative sample. Initialization includes validating the -network, +returns an iterable of [`Example`](/api/example) objects. **At least one example +should be supplied.** The data examples are used to **initialize the model** of +the component and can either be the full training data or a representative +sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize) and lets you customize @@ -151,7 +151,7 @@ This method was previously called `begin_training`. > > ```python > tagger = nlp.add_pipe("tagger") -> tagger.initialize(lambda: [], nlp=nlp) +> tagger.initialize(lambda: examples, nlp=nlp) > ``` > > ```ini @@ -165,7 +165,7 @@ This method was previously called `begin_training`. | Name | Description | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 5bc40fa9e..042b4ab76 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -176,10 +176,10 @@ applied to the `Doc` in order. 
Both [`__call__`](/api/textcategorizer#call) and ## TextCategorizer.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that -returns an iterable of [`Example`](/api/example) objects. The data examples are -used to **initialize the model** of the component and can either be the full -training data or a representative sample. Initialization includes validating the -network, +returns an iterable of [`Example`](/api/example) objects. **At least one example +should be supplied.** The data examples are used to **initialize the model** of +the component and can either be the full training data or a representative +sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize) and lets you customize @@ -197,7 +197,7 @@ This method was previously called `begin_training`. > > ```python > textcat = nlp.add_pipe("textcat") -> textcat.initialize(lambda: [], nlp=nlp) +> textcat.initialize(lambda: examples, nlp=nlp) > ``` > > ```ini @@ -212,7 +212,7 @@ This method was previously called `begin_training`. | Name | Description | | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 70c352b4d..2dcb1a013 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -127,10 +127,10 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods. Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +function that returns an iterable of [`Example`](/api/example) objects. **At +least one example should be supplied.** The data examples are used to +**initialize the model** of the component and can either be the full training +data or a representative sample. 
Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize). @@ -139,14 +139,14 @@ by [`Language.initialize`](/api/language#initialize). > > ```python > tok2vec = nlp.add_pipe("tok2vec") -> tok2vec.initialize(lambda: [], nlp=nlp) +> tok2vec.initialize(lambda: examples, nlp=nlp) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## Tok2Vec.predict {#predict tag="method"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index b1673cdbe..e747ad383 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -175,10 +175,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +function that returns an iterable of [`Example`](/api/example) objects. **At +least one example should be supplied.** The data examples are used to +**initialize the model** of the component and can either be the full training +data or a representative sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize). @@ -187,14 +187,14 @@ by [`Language.initialize`](/api/language#initialize). > > ```python > trf = nlp.add_pipe("transformer") -> trf.initialize(lambda: iter([]), nlp=nlp) +> trf.initialize(lambda: examples, nlp=nlp) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. 
~~Optional[Language]~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## Transformer.predict {#predict tag="method"} From cd09614ab2be485a796a572274d336c1c47ca4a9 Mon Sep 17 00:00:00 2001 From: Jules Belveze <32683010+JulesBelveze@users.noreply.github.com> Date: Thu, 4 Aug 2022 08:42:38 +0200 Subject: [PATCH 03/20] chore: add 'concepCy' to spacy universe (#11255) * chore: add 'concepCy' to spacy universe * docs: add 'slogan' to concepCy --- website/meta/universe.json | 42 ++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index a128f0795..6c8caa6a6 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,39 @@ { "resources": [ + { + "id": "concepcy", + "title": "concepCy", + "slogan": "A multilingual knowledge graph in spaCy", + "description": "A spaCy wrapper for ConceptNet, a freely-available semantic network designed to help computers understand the meaning of words.", + "github": "JulesBelveze/concepcy", + "pip": "concepcy", + "code_example": [ + "import spacy", + "import concepcy", + "", + "nlp = spacy.load('en_core_web_sm')", + "# Using default concepCy configuration", + "nlp.add_pipe('concepcy')", + "", + "doc = nlp('WHO is a lovely company')", + "", + "# Access all the 'RelatedTo' relations from the Doc", + "for word, relations in doc._.relatedto.items():", + " print(f'Word: {word}\n{relations}')", + "", + "# Access the 'RelatedTo' relations word by word", + "for token in doc:", + " print(f'Word: {token}\n{token._.relatedto}')" + ], + "category": ["pipeline"], + "image": "https://github.com/JulesBelveze/concepcy/blob/main/figures/concepcy.png", + "tags": ["semantic", "ConceptNet"], + "author": "Jules Belveze", + "author_links": { + "github": "JulesBelveze", + "website": "https://www.linkedin.com/in/jules-belveze/" + } + }, { "id": "spacyfishing", "title": "spaCy fishing", @@ -2604,7 +2638,7 @@ " Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes.", " Later, add some oranges and chickens.\"\"\"", "", - "# use any model that has internal spacy embeddings", + "# use any model that has internal spacy embeddings", "nlp = spacy.load('en_core_web_lg')", "nlp.add_pipe(\"concise_concepts\", ", " config={\"data\": data}", @@ -2650,7 +2684,7 @@ " At that location, Nissin was founded.", " Many students survived by eating these noodles, but they don't even know him.\"\"\"", "", - "# use any model that has internal spacy embeddings", + "# use any model that has internal spacy embeddings", "nlp = spacy.load('en_core_web_sm')", "nlp.add_pipe(", " \"xx_coref\", config={\"chunk_size\": 2500, \"chunk_overlap\": 2, \"device\": 0})", @@ -2833,7 +2867,7 @@ "doc = nlp(\"AE died in Princeton in 1955.\")", "", "print(doc._.clauses)", - "# Output:", + "# Output:", "# ", "", "propositions = doc._.clauses[0].to_propositions(as_text=True)", @@ -3599,7 +3633,7 @@ "", "#Lexico Semantic (LxSem) Features", "TTRF = LingFeat.TTRF_() #Type Token Ratio Features", - "VarF = LingFeat.VarF_() 
#Noun/Verb/Adj/Adv Variation Features", + "VarF = LingFeat.VarF_() #Noun/Verb/Adj/Adv Variation Features", "PsyF = LingFeat.PsyF_() #Psycholinguistic Difficulty of Words (AoA Kuperman)", "WoLF = LingFeat.WorF_() #Word Familiarity from Frequency Count (SubtlexUS)", "", From b07708d5d073bf1af55d0b50eb11760e48221500 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 4 Aug 2022 15:14:19 +0200 Subject: [PATCH 04/20] Support full prerelease versions in the compat table (#11228) * Support full prerelease versions in the compat table * Fix types --- spacy/cli/download.py | 6 +++++- spacy/util.py | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 4ea9a8f0e..b7de88729 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -7,6 +7,7 @@ import typer from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX from .. import about from ..util import is_package, get_minor_version, run_command +from ..util import is_prerelease_version from ..errors import OLD_MODEL_SHORTCUTS @@ -74,7 +75,10 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) - def get_compatibility() -> dict: - version = get_minor_version(about.__version__) + if is_prerelease_version(about.__version__): + version: Optional[str] = about.__version__ + else: + version = get_minor_version(about.__version__) r = requests.get(about.__compatibility__) if r.status_code != 200: msg.fail( diff --git a/spacy/util.py b/spacy/util.py index 4f21d618a..d170fc15b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -795,6 +795,15 @@ def get_model_lower_version(constraint: str) -> Optional[str]: return None +def is_prerelease_version(version: str) -> bool: + """Check whether a version is a prerelease version. + + version (str): The version, e.g. "3.0.0.dev1". + RETURNS (bool): Whether the version is a prerelease version. + """ + return Version(version).is_prerelease + + def get_base_version(version: str) -> str: """Generate the base version without any prerelease identifiers. 
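`is_prerelease_version` defers to `packaging`'s `Version.is_prerelease`, so a
`.devN`, alpha/beta or release-candidate suffix means the full version string
(rather than just the minor version) becomes the key for the compatibility-table
lookup. A minimal sketch of that check, assuming the `packaging` package is
available (spaCy itself depends on it):

```python
from packaging.version import Version

# .dev, alpha/beta (a/b) and release-candidate (rc) suffixes all count
# as prereleases; plain releases do not.
for v in ["3.4.0", "3.4.1", "3.4.0.dev0", "3.4.0a1", "3.4.0rc2"]:
    print(v, Version(v).is_prerelease)
# 3.4.0 False
# 3.4.1 False
# 3.4.0.dev0 True
# 3.4.0a1 True
# 3.4.0rc2 True
```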
From 23749cfc91110a77e4c6bbaa71ad90d8c056ca0b Mon Sep 17 00:00:00 2001 From: stefawolf Date: Fri, 5 Aug 2022 12:26:38 +0200 Subject: [PATCH 05/20] adding spans to doc_annotation in Example.to_dict (#11261) * adding spans to doc_annotation in Example.to_dict * to_dict compatible with from_dict: tuples instead of spans * use strings for label and kb_id * Simplify test * Update data formats docs Co-authored-by: Stefanie Wolf Co-authored-by: Adriane Boyd --- spacy/tests/training/test_new_example.py | 38 ++++++++++++++++++++++++ spacy/training/example.pyx | 13 ++++++++ website/docs/api/data-formats.md | 6 ++-- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py index a39d40ded..6b15603b3 100644 --- a/spacy/tests/training/test_new_example.py +++ b/spacy/tests/training/test_new_example.py @@ -431,3 +431,41 @@ def test_Example_aligned_whitespace(en_vocab): example = Example(predicted, reference) assert example.get_aligned("TAG", as_string=True) == tags + + +@pytest.mark.issue("11260") +def test_issue11260(): + annots = { + "words": ["I", "like", "New", "York", "."], + "spans": { + "cities": [(7, 15, "LOC", "")], + "people": [(0, 1, "PERSON", "")], + }, + } + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + assert len(example.reference.spans["cities"]) == 1 + assert len(example.reference.spans["people"]) == 1 + + output_dict = example.to_dict() + assert "spans" in output_dict["doc_annotation"] + assert output_dict["doc_annotation"]["spans"]["cities"] == annots["spans"]["cities"] + assert output_dict["doc_annotation"]["spans"]["people"] == annots["spans"]["people"] + + output_example = Example.from_dict(predicted, output_dict) + + assert len(output_example.reference.spans["cities"]) == len( + example.reference.spans["cities"] + ) + assert len(output_example.reference.spans["people"]) == len( + example.reference.spans["people"] + ) + for span in example.reference.spans["cities"]: + assert span.label_ == "LOC" + assert span.text == "New York" + assert span.start_char == 7 + for span in example.reference.spans["people"]: + assert span.label_ == "PERSON" + assert span.text == "I" + assert span.start_char == 0 diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index d592e5a52..dfd337b9e 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -361,6 +361,7 @@ cdef class Example: "doc_annotation": { "cats": dict(self.reference.cats), "entities": doc_to_biluo_tags(self.reference), + "spans": self._spans_to_dict(), "links": self._links_to_dict() }, "token_annotation": { @@ -376,6 +377,18 @@ cdef class Example: } } + def _spans_to_dict(self): + span_dict = {} + for key in self.reference.spans: + span_tuples = [] + for span in self.reference.spans[key]: + span_tuple = (span.start_char, span.end_char, span.label_, span.kb_id_) + span_tuples.append(span_tuple) + span_dict[key] = span_tuples + + return span_dict + + def _links_to_dict(self): links = {} for ent in self.reference.ents: diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index b7aedc511..ce06c4ea8 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -395,12 +395,13 @@ file to keep track of your settings and hyperparameters and your own > "pos": List[str], > "morphs": List[str], > "sent_starts": List[Optional[bool]], -> "deps": List[string], +> "deps": List[str], > "heads": List[int], > 
"entities": List[str], > "entities": List[(int, int, str)], > "cats": Dict[str, float], > "links": Dict[(int, int), dict], +> "spans": Dict[str, List[Tuple]], > } > ``` @@ -417,9 +418,10 @@ file to keep track of your settings and hyperparameters and your own | `deps` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. ~~List[str]~~ | | `heads` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. ~~List[int]~~ | | `entities` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. ~~List[str]~~ | -| `entities` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ | +| `entities` | **Option 2:** List of `(start_char, end_char, label)` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ | | `cats` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. ~~Dict[str, float]~~ | | `links` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. ~~Dict[Tuple[int, int], Dict]~~ | +| `spans` | Dictionary of `spans_key`/`List[Tuple]` pairs defining the spans for each spans key as `(start_char, end_char, label, kb_id)` tuples. ~~Dict[str, List[Tuple[int, int, str, str]]~~ | From fc4246558be4f6e9b3e71afb814019552764cfb1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 9 Aug 2022 10:59:36 +0200 Subject: [PATCH 06/20] Fix regex invalid escape sequences (#11276) --- spacy/lang/ko/punctuation.py | 2 +- spacy/schemas.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/ko/punctuation.py b/spacy/lang/ko/punctuation.py index 7f7b40c5b..f5f1c51da 100644 --- a/spacy/lang/ko/punctuation.py +++ b/spacy/lang/ko/punctuation.py @@ -3,7 +3,7 @@ from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES _infixes = ( - ["·", "ㆍ", "\(", "\)"] + ["·", "ㆍ", r"\(", r"\)"] + [r"(?<=[0-9])~(?=[0-9-])"] + LIST_QUOTES + BASE_TOKENIZER_INFIXES diff --git a/spacy/schemas.py b/spacy/schemas.py index 658e45268..9f91451a9 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -207,7 +207,7 @@ class TokenPatternOperatorSimple(str, Enum): class TokenPatternOperatorMinMax(ConstrainedStr): - regex = re.compile("^({\d+}|{\d+,\d*}|{\d*,\d+})$") + regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$") TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax] From e700358ba00cecb2185add0448cf0588b2fc351f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 9 Aug 2022 12:15:13 +0200 Subject: [PATCH 07/20] Add W605 to the errors raised by flake8 in the CI (#11283) --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4624b2eb2..f475b7fdd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,7 @@ jobs: versionSpec: "3.7" - script: | pip install flake8==3.9.2 - python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics + python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics displayName: "flake8" - job: "Test" From 231a17817db0997caab1379e601dac1b9a90b46c 
Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 9 Aug 2022 21:50:50 +0900 Subject: [PATCH 08/20] Clean up automated label-based issue handling (#11284) * Clean up automated label-based issue handline 1. upgrade tiangolo/issue-manager to latest 2. move needs-more-info to tiangolo 3. change needs-more-info close time to 7 days 4. delete old needs-more-info config * Use old, longer message * Fix label name --- .github/no-response.yml | 13 ------------- .github/workflows/issue-manager.yml | 8 +++++++- 2 files changed, 7 insertions(+), 14 deletions(-) delete mode 100644 .github/no-response.yml diff --git a/.github/no-response.yml b/.github/no-response.yml deleted file mode 100644 index ea78104b9..000000000 --- a/.github/no-response.yml +++ /dev/null @@ -1,13 +0,0 @@ -# Configuration for probot-no-response - https://github.com/probot/no-response - -# Number of days of inactivity before an Issue is closed for lack of response -daysUntilClose: 14 -# Label requiring a response -responseRequiredLabel: more-info-needed -# Comment to post when closing an Issue for lack of response. Set to `false` to disable -closeComment: > - This issue has been automatically closed because there has been no response - to a request for more information from the original author. With only the - information that is currently in the issue, there's not enough information - to take action. If you're the original author, feel free to reopen the issue - if you have or find the answers needed to investigate further. diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index 3fb42ed01..8f3a151ea 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -15,7 +15,7 @@ jobs: issue-manager: runs-on: ubuntu-latest steps: - - uses: tiangolo/issue-manager@0.2.1 + - uses: tiangolo/issue-manager@0.4.0 with: token: ${{ secrets.GITHUB_TOKEN }} config: > @@ -25,5 +25,11 @@ jobs: "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.", "remove_label_on_comment": true, "remove_label_on_close": true + }, + "more-info-needed": { + "delay": "P7D", + "message": "This issue has been automatically closed because there has been no response to a request for more information from the original author. With only the information that is currently in the issue, there's not enough information to take action. 
If you're the original author, feel free to reopen the issue if you have or find the answers needed to investigate further.", + "remove_label_on_comment": true, + "remove_label_on_close": true } } From ed4ad309e6dd6fb420cbf18e4fd5e8de3291eeba Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 10 Aug 2022 09:49:08 +0200 Subject: [PATCH 09/20] Fix Dutch noun chunks to skip overlapping spans (#11275) * Add test for overlapping noun chunks * Skip overlapping noun chunks * Update spacy/tests/lang/nl/test_noun_chunks.py Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- spacy/lang/nl/syntax_iterators.py | 11 +++++++---- spacy/tests/lang/nl/test_noun_chunks.py | 18 +++++++++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/spacy/lang/nl/syntax_iterators.py b/spacy/lang/nl/syntax_iterators.py index 1ab5e7cff..be9beabe6 100644 --- a/spacy/lang/nl/syntax_iterators.py +++ b/spacy/lang/nl/syntax_iterators.py @@ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: span_label = doc.vocab.strings.add("NP") # Only NOUNS and PRONOUNS matter + end_span = -1 for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)): # For NOUNS # Pick children from syntactic parse (only those with certain dependencies) @@ -58,15 +59,17 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: children_i = [c.i for c in children] + [word.i] start_span = min(children_i) - end_span = max(children_i) + 1 - yield start_span, end_span, span_label + if start_span >= end_span: + end_span = max(children_i) + 1 + yield start_span, end_span, span_label # PRONOUNS only if it is the subject of a verb elif word.pos == PRON: if word.dep in pronoun_deps: start_span = word.i - end_span = word.i + 1 - yield start_span, end_span, span_label + if start_span >= end_span: + end_span = word.i + 1 + yield start_span, end_span, span_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/tests/lang/nl/test_noun_chunks.py b/spacy/tests/lang/nl/test_noun_chunks.py index 73b501e4a..8962e3b75 100644 --- a/spacy/tests/lang/nl/test_noun_chunks.py +++ b/spacy/tests/lang/nl/test_noun_chunks.py @@ -1,5 +1,6 @@ -from spacy.tokens import Doc import pytest +from spacy.tokens import Doc +from spacy.util import filter_spans @pytest.fixture @@ -207,3 +208,18 @@ def test_chunking(nl_sample, nl_reference_chunking): """ chunks = [s.text.lower() for s in nl_sample.noun_chunks] assert chunks == nl_reference_chunking + + +@pytest.mark.issue(10846) +def test_no_overlapping_chunks(nl_vocab): + # fmt: off + doc = Doc( + nl_vocab, + words=["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"], + deps=["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"], + heads=[1, 3, 3, 3, 8, 8, 5, 8, 3], + pos=["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"], + ) + # fmt: on + chunks = list(doc.noun_chunks) + assert filter_spans(chunks) == chunks From db7b9938a40830f95f3674c00f122f90805b4f5a Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Tue, 16 Aug 2022 11:23:34 -0400 Subject: [PATCH 10/20] Docs: displaCy documentation - data types, `parse_{deps,ents,spans}`, spans example (#10950) * add in spans example and parse references * rm autoformatter * rm extra ents copy * TypedDict draft * type fixes * restore non-documentation files * docs update * fix spans example * fix hyperlinks * add parse 
example * example fix + argument fix * fix api arg in docs * fix bad variable replacement * fix spacing in style Co-authored-by: Sofie Van Landeghem * fix spacing on table * fix spacing on table * rm temp files Co-authored-by: Sofie Van Landeghem --- spacy/displacy/__init__.py | 5 ++- website/docs/api/top-level.md | 71 ++++++++++++++++++++++++++++++- website/docs/usage/visualizers.md | 39 ++++++++++++++--- 3 files changed, 104 insertions(+), 11 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 5d49b6eb7..7bb300afa 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -123,7 +123,8 @@ def app(environ, start_response): def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: """Generate dependency parse in {'words': [], 'arcs': []} format. - doc (Doc): Document do parse. + orig_doc (Doc): Document to parse. + options (Dict[str, Any]): Dependency parse specific visualisation options. RETURNS (dict): Generated dependency parse keyed by words and arcs. """ doc = Doc(orig_doc.vocab).from_bytes( @@ -209,7 +210,7 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: - """Generate spans in [{start: i, end: i, label: 'label'}] format. + """Generate spans in [{start_token: i, end_token: i, label: 'label'}] format. doc (Doc): Document to parse. options (Dict[str, any]): Span-specific visualisation options. diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index c96c571e9..1e1925442 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -240,7 +240,7 @@ browser. Will run a simple web server. | Name | Description | | --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ | -| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` 3.3. Defaults to `"dep"`. ~~str~~ | +| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` 3.3. Defaults to `"dep"`. ~~str~~ | | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | @@ -265,7 +265,7 @@ Render a dependency parse tree or named entity visualization. | Name | Description | | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ | -| `style` | Visualization style,`"dep"`, `"ent"` or `"span"` 3.3. Defaults to `"dep"`. ~~str~~ | +| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` 3.3. Defaults to `"dep"`. ~~str~~ | | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | @@ -273,6 +273,73 @@ Render a dependency parse tree or named entity visualization. | `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. 
Detected automatically if `None` (default). ~~Optional[bool]~~ | | **RETURNS** | The rendered HTML markup. ~~str~~ | +### displacy.parse_deps {#displacy.parse_deps tag="method" new="2"} + +Generate dependency parse in `{'words': [], 'arcs': []}` format. +For use with the `manual=True` argument in `displacy.render`. + +> #### Example +> +> ```python +> import spacy +> from spacy import displacy +> nlp = spacy.load("en_core_web_sm") +> doc = nlp("This is a sentence.") +> deps_parse = displacy.parse_deps(doc) +> html = displacy.render(deps_parse, style="dep", manual=True) +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------------------- | +| `orig_doc` | Doc to parse dependencies. ~~Doc~~ | +| `options` | Dependency parse specific visualisation options. ~~Dict[str, Any]~~ | +| **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~ | + +### displacy.parse_ents {#displacy.parse_ents tag="method" new="2"} + +Generate named entities in `[{start: i, end: i, label: 'label'}]` format. +For use with the `manual=True` argument in `displacy.render`. + +> #### Example +> +> ```python +> import spacy +> from spacy import displacy +> nlp = spacy.load("en_core_web_sm") +> doc = nlp("But Google is starting from behind.") +> ents_parse = displacy.parse_ents(doc) +> html = displacy.render(ents_parse, style="ent", manual=True) +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------------------- | +| `doc` | Doc to parse entities. ~~Doc~~ | +| `options` | NER-specific visualisation options. ~~Dict[str, Any]~~ | +| **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ | + +### displacy.parse_spans {#displacy.parse_spans tag="method" new="2"} + +Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format. +For use with the `manual=True` argument in `displacy.render`. + +> #### Example +> +> ```python +> import spacy +> from spacy import displacy +> nlp = spacy.load("en_core_web_sm") +> doc = nlp("But Google is starting from behind.") +> doc.spans['orgs'] = [doc[1:2]] +> ents_parse = displacy.parse_spans(doc, options={"spans_key" : "orgs"}) +> html = displacy.render(ents_parse, style="span", manual=True) +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------------------- | +| `doc` | Doc to parse entities. ~~Doc~~ | +| `options` | Span-specific visualisation options. ~~Dict[str, Any]~~ | +| **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ | + ### Visualizer options {#displacy_options} The `options` argument lets you specify additional settings for each visualizer. diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index d2892b863..da847d939 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -198,12 +198,12 @@ import DisplacySpanHtml from 'images/displacy-span.html' The span visualizer lets you customize the following `options`: -| Argument | Description | -|-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| -| `spans_key` | Which spans key to render spans from. Default is `"sc"`. 
~~str~~ | +| Argument | Description | +| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ | | `templates` | Dictionary containing the keys `"span"`, `"slice"`, and `"start"`. These dictate how the overall span, a span slice, and the starting token will be rendered. ~~Optional[Dict[str, str]~~ | -| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in ~~Optional[str]~~ | -| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | +| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in ~~Optional[str]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | Because spans can be stored across different keys in `doc.spans`, you need to specify which one displaCy should use with `spans_key` (`sc` is the default). @@ -343,9 +343,21 @@ want to visualize output from other libraries, like [NLTK](http://www.nltk.org) or [SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet). If you set `manual=True` on either `render()` or `serve()`, you can pass in data -in displaCy's format as a dictionary (instead of `Doc` objects). +in displaCy's format as a dictionary (instead of `Doc` objects). There are helper +functions for converting `Doc` objects to displaCy's format for use with `manual=True`: +[`displacy.parse_deps`](/api/top-level#displacy.parse_deps), +[`displacy.parse_ents`](/api/top-level#displacy.parse_ents), +and [`displacy.parse_spans`](/api/top-level#displacy.parse_spans). -> #### Example +> #### Example with parse function +> +> ```python +> doc = nlp("But Google is starting from behind.") +> ex = displacy.parse_ents(doc) +> html = displacy.render(ex, style="ent", manual=True) +> ``` + +> #### Example with raw data > > ```python > ex = [{"text": "But Google is starting from behind.", @@ -354,6 +366,7 @@ in displaCy's format as a dictionary (instead of `Doc` objects). > html = displacy.render(ex, style="ent", manual=True) > ``` + ```python ### DEP input { @@ -389,6 +402,18 @@ in displaCy's format as a dictionary (instead of `Doc` objects). 
} ``` +```python +### SPANS input +{ + "text": "Welcome to the Bank of China.", + "spans": [ + {"start_token": 3, "end_token": 6, "label": "ORG"}, + {"start_token": 5, "end_token": 6, "label": "GPE"}, + ], + "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."], +} +``` + ## Using displaCy in a web application {#webapp} If you want to use the visualizers as part of a web application, for example to From cab263791ff25a713bd2a0e72759fa48aff36b9f Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 17 Aug 2022 19:55:54 +0200 Subject: [PATCH 11/20] include span_ruler for default warning filter (#11333) --- spacy/errors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index fd412a4da..9a679ae2c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -16,8 +16,8 @@ def setup_default_warnings(): filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa - # warn about entity_ruler & matcher having no patterns only once - for pipe in ["matcher", "entity_ruler"]: + # warn about entity_ruler, span_ruler & matcher having no patterns only once + for pipe in ["matcher", "entity_ruler", "span_ruler"]: filter_warning("once", error_msg=Warnings.W036.format(name=pipe)) # warn once about lemmatizer without required POS From 09b3118b26520786db5fee468008be4f0653614d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 18 Aug 2022 14:04:57 +0200 Subject: [PATCH 12/20] Add uk pipelines to website (#11332) --- website/meta/languages.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/website/meta/languages.json b/website/meta/languages.json index 6bc2309ed..87c91f791 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -467,10 +467,20 @@ "code": "uk", "name": "Ukrainian", "has_examples": true, + "models": [ + "uk_core_news_sm", + "uk_core_news_md", + "uk_core_news_lg", + "uk_core_news_trf" + ], "dependencies": [ { "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" + }, + { + "name": "pymorphy2-dicts-uk", + "url": "https://github.com/kmike/pymorphy2-dicts/" } ] }, From 3e4cf1bbe1745a55ede0dece31353aebc3f82729 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 19 Aug 2022 09:52:12 +0200 Subject: [PATCH 13/20] Check for . in factory names (#11336) --- spacy/errors.py | 2 ++ spacy/language.py | 9 +++++++-- spacy/tests/test_language.py | 11 +++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 9a679ae2c..40e50aaa9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -540,6 +540,8 @@ class Errors(metaclass=ErrorsWithCodes): E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.") # New errors added in v3.x + E853 = ("Unsupported component factory name '{name}'. The character '.' is " + "not permitted in factory names.") E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not " "permit overlapping spans.") E855 = ("Invalid {obj}: {obj} is not from the same doc.") diff --git a/spacy/language.py b/spacy/language.py index 816bd6531..e89ae142b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -465,6 +465,8 @@ class Language: """ if not isinstance(name, str): raise ValueError(Errors.E963.format(decorator="factory")) + if "." 
in name: + raise ValueError(Errors.E853.format(name=name)) if not isinstance(default_config, dict): err = Errors.E962.format( style="default config", name=name, cfg_type=type(default_config) @@ -543,8 +545,11 @@ class Language: DOCS: https://spacy.io/api/language#component """ - if name is not None and not isinstance(name, str): - raise ValueError(Errors.E963.format(decorator="component")) + if name is not None: + if not isinstance(name, str): + raise ValueError(Errors.E963.format(decorator="component")) + if "." in name: + raise ValueError(Errors.E853.format(name=name)) component_name = name if name is not None else util.get_object_name(func) def add_component(component_func: "Pipe") -> Callable: diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index c5fdc8eb0..6f3ba8acc 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -659,3 +659,14 @@ def test_multiprocessing_gpu_warning(nlp2, texts): # Trigger multi-processing. for _ in docs: pass + + +def test_dot_in_factory_names(nlp): + Language.component("my_evil_component", func=evil_component) + nlp.add_pipe("my_evil_component") + + with pytest.raises(ValueError, match="not permitted"): + Language.component("my.evil.component.v1", func=evil_component) + + with pytest.raises(ValueError, match="not permitted"): + Language.factory("my.evil.component.v1", func=evil_component) From 04c6e5cb9526c3ac3ce395be7de5fa607ddefe4b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 22 Aug 2022 11:28:13 +0200 Subject: [PATCH 14/20] Improve floret vectors display in pipeline docs (#11343) --- website/src/templates/models.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 69cec3376..df53f8c3c 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -114,7 +114,11 @@ function formatVectors(data) { if (!data) return 'n/a' if (Object.values(data).every(n => n === 0)) return 'context vectors only' const { keys, vectors, width } = data - return `${abbrNum(keys)} keys, ${abbrNum(vectors)} unique vectors (${width} dimensions)` + if (keys >= 0) { + return `${abbrNum(keys)} keys, ${abbrNum(vectors)} unique vectors (${width} dimensions)` + } else { + return `${abbrNum(vectors)} floret vectors (${width} dimensions)` + } } function formatAccuracy(data, lang) { From 0f07defe2ca0ba7a726aafb4a30c89627510bae1 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 22 Aug 2022 18:29:05 +0900 Subject: [PATCH 15/20] Remove reference to voting on issue (#11335) Not clear which issue this refers to, we don't suggest this for any other issues, and we don't use votes in general. --- spacy/errors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 40e50aaa9..a1420c8fc 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -535,8 +535,7 @@ class Errors(metaclass=ErrorsWithCodes): E198 = ("Unable to return {n} most similar vectors for the current vectors " "table, which contains {n_rows} vectors.") E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.") - E200 = ("Can't yet set {attr} from Span. Vote for this feature on the " - "issue tracker: http://github.com/explosion/spaCy/issues") + E200 = ("Can't set {attr} from Span.") E202 = ("Unsupported {name} mode '{mode}'. 
Supported modes: {modes}.") # New errors added in v3.x From f55bb7470d2f7267937d8491ae6651fbcf505094 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 22 Aug 2022 12:04:30 +0200 Subject: [PATCH 16/20] Clean up warnings in the test suite (#11331) --- .github/azure-steps.yml | 4 ++-- spacy/tests/doc/test_doc_api.py | 5 +++-- spacy/tests/lang/ru/test_lemmatizer.py | 3 +++ spacy/tests/lang/uk/test_lemmatizer.py | 4 ++++ spacy/tests/matcher/test_phrase_matcher.py | 9 +++++---- spacy/tests/pipeline/test_entity_linker.py | 4 ++++ spacy/training/initialize.py | 2 ++ 7 files changed, 23 insertions(+), 8 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index aae08c7f3..18224ba8c 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -54,12 +54,12 @@ steps: condition: eq(${{ parameters.gpu }}, true) - script: | - ${{ parameters.prefix }} python -m pytest --pyargs spacy + ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error displayName: "Run CPU tests" condition: eq(${{ parameters.gpu }}, false) - script: | - ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu + ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu displayName: "Run GPU tests" condition: eq(${{ parameters.gpu }}, true) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index dd4942989..a64ab2ba8 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -3,6 +3,7 @@ import weakref import numpy from numpy.testing import assert_array_equal import pytest +import warnings from thinc.api import NumpyOps, get_current_ops from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS @@ -529,9 +530,9 @@ def test_doc_from_array_sent_starts(en_vocab): # no warning using default attrs attrs = doc._get_array_attrs() arr = doc.to_array(attrs) - with pytest.warns(None) as record: + with warnings.catch_warnings(): + warnings.simplefilter("error") new_doc.from_array(attrs, arr) - assert len(record) == 0 # only SENT_START uses SENT_START attrs = [SENT_START] arr = doc.to_array(attrs) diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index 3810323bf..9ca7f441b 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -2,6 +2,9 @@ import pytest from spacy.tokens import Doc +pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") + + def test_ru_doc_lemmatization(ru_lemmatizer): words = ["мама", "мыла", "раму"] pos = ["NOUN", "VERB", "NOUN"] diff --git a/spacy/tests/lang/uk/test_lemmatizer.py b/spacy/tests/lang/uk/test_lemmatizer.py index 4a787b2a6..57dd4198a 100644 --- a/spacy/tests/lang/uk/test_lemmatizer.py +++ b/spacy/tests/lang/uk/test_lemmatizer.py @@ -1,6 +1,10 @@ +import pytest from spacy.tokens import Doc +pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") + + def test_uk_lemmatizer(uk_lemmatizer): """Check that the default uk lemmatizer runs.""" doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"]) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 3b24f3ba8..8a8d9eb84 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -1,4 +1,5 @@ import pytest +import warnings import srsly from mock import Mock @@ -344,13 +345,13 @@ def test_phrase_matcher_validation(en_vocab): matcher.add("TEST1", [doc1]) with pytest.warns(UserWarning): 
matcher.add("TEST2", [doc2]) - with pytest.warns(None) as record: + with warnings.catch_warnings(): + warnings.simplefilter("error") matcher.add("TEST3", [doc3]) - assert not record.list matcher = PhraseMatcher(en_vocab, attr="POS", validate=True) - with pytest.warns(None) as record: + with warnings.catch_warnings(): + warnings.simplefilter("error") matcher.add("TEST4", [doc2]) - assert not record.list def test_attr_validation(en_vocab): diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 14995d7b8..82bc976bb 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1048,6 +1048,10 @@ def test_no_gold_ents(patterns): for eg in train_examples: eg.predicted = ruler(eg.predicted) + # Entity ruler is no longer needed (initialization below wipes out the + # patterns and causes warnings) + nlp.remove_pipe("entity_ruler") + def create_kb(vocab): # create artificial KB mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 48ff7b589..6304e4a84 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -337,3 +337,5 @@ def ensure_shape(vectors_loc): # store all the results in a list in memory lines2 = open_file(vectors_loc) yield from lines2 + lines2.close() + lines.close() From 6e20842370bf9ed33b184013241c42f3d2f2a321 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 22 Aug 2022 15:52:53 +0200 Subject: [PATCH 17/20] dev docs: numeric comparators (#11334) * add section on numeric comparators * edit * prettier * Update extra/DEVELOPER_DOCS/Code Conventions.md Co-authored-by: Adriane Boyd * note on typing imports Co-authored-by: Adriane Boyd --- extra/DEVELOPER_DOCS/Code Conventions.md | 25 +++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/extra/DEVELOPER_DOCS/Code Conventions.md b/extra/DEVELOPER_DOCS/Code Conventions.md index 31a87d362..7294ac38b 100644 --- a/extra/DEVELOPER_DOCS/Code Conventions.md +++ b/extra/DEVELOPER_DOCS/Code Conventions.md @@ -191,6 +191,8 @@ def load_model(name: str) -> "Language": ... ``` +Note that we typically put the `from typing` import statements on the first line(s) of the Python module. + ## Structuring logic ### Positional and keyword arguments @@ -275,6 +277,27 @@ If you have to use `try`/`except`, make sure to only include what's **absolutely + return [v.strip() for v in value.split(",")] ``` +### Numeric comparisons + +For numeric comparisons, as a general rule we always use `<` and `>=` and avoid the usage of `<=` and `>`. This is to ensure we consistently +apply inclusive lower bounds and exclusive upper bounds, helping to prevent off-by-one errors. + +One exception to this rule is the ternary case. With a chain like + +```python +if value >= 0 and value < max: + ... +``` + +it's fine to rewrite this to the shorter form + +```python +if 0 <= value < max: + ... +``` + +even though this requires the usage of the `<=` operator. + ### Iteration and comprehensions We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions. @@ -451,7 +474,7 @@ spaCy uses the [`pytest`](http://doc.pytest.org/) framework for testing. Tests f When adding tests, make sure to use descriptive names and only test for one behavior at a time. 
Tests should be grouped into modules dedicated to the same type of functionality and some test modules are organized as directories of test files related to the same larger area of the library, e.g. `matcher` or `tokenizer`. -Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g., `test_issue1234.py`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression tests suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first. +Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g., `test_issue1234.py`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression tests suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first. The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file. 
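The regression-test convention described in the patch above can be sketched in a few lines. This is an illustrative example only: the issue number, test body and assertion are hypothetical and not taken from any patch in this series; only the use of the `@pytest.mark.issue` marker follows the convention described in the diff (the marker is registered in spaCy's own test configuration, so outside that suite pytest would report it as an unknown marker).

```python
import pytest
from spacy.lang.en import English


@pytest.mark.issue(1234)  # hypothetical issue number linking the test back to the report
def test_issue1234():
    # Minimal reproduction of a (hypothetical) reported bug:
    # tokenizing a short text should not raise and should produce tokens.
    nlp = English()
    doc = nlp("This is a minimal regression example.")
    assert len(doc) > 0
```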
From 7e75327893a60a2985de66bde73f3e1664cdf123 Mon Sep 17 00:00:00 2001 From: Tal Zussman <32444106+tzussman@users.noreply.github.com> Date: Tue, 23 Aug 2022 01:40:38 -0400 Subject: [PATCH 18/20] Fix menu order in linguistic-features.md (#11364) Swap 'Vectors & Similarity' and 'Mappings & Exceptions' in menu to match order in body --- website/docs/usage/linguistic-features.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 9dae6f2ee..82472c67e 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -11,8 +11,8 @@ menu: - ['Tokenization', 'tokenization'] - ['Merging & Splitting', 'retokenization'] - ['Sentence Segmentation', 'sbd'] - - ['Vectors & Similarity', 'vectors-similarity'] - ['Mappings & Exceptions', 'mappings-exceptions'] + - ['Vectors & Similarity', 'vectors-similarity'] - ['Language Data', 'language-data'] --- From 5afa98aabfc18a23f19b07b13e2cd12ddb6ee009 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Tue, 23 Aug 2022 10:05:02 +0200 Subject: [PATCH 19/20] Support custom attributes for tokens and spans in json conversion (#11125) * Add token and span custom attributes to to_json() * Change logic for to_json * Add functionality to from_json * Small adjustments * Move token/span attributes to new dict key * Fix test * Fix the same test but much better * Add backwards compatibility tests and adjust logic * Add test to check if attributes not set in underscore are not saved in the json * Add tests for json compatibility * Adjust test names * Fix tests and clean up code * Fix assert json tests * small adjustment * adjust naming and code readability * Adjust naming, added more tests and changed logic * Fix typo * Adjust errors, naming, and small test optimization * Fix byte tests * Fix bytes tests * Change naming and json structure * update schema * Update spacy/schemas.py Co-authored-by: Adriane Boyd * Update spacy/tokens/doc.pyx Co-authored-by: Adriane Boyd * Update spacy/tokens/doc.pyx Co-authored-by: Adriane Boyd * Update spacy/schemas.py Co-authored-by: Adriane Boyd * Update schema for underscore attributes * Adjust underscore schema * adjust schema tests Co-authored-by: Adriane Boyd --- spacy/errors.py | 2 +- spacy/schemas.py | 12 +- spacy/tests/doc/test_json_doc_conversion.py | 194 +++++++++++++++++++- spacy/tokens/doc.pyx | 59 ++++-- 4 files changed, 243 insertions(+), 24 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index a1420c8fc..608305a06 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -389,7 +389,7 @@ class Errors(metaclass=ErrorsWithCodes): "consider using doc.spans instead.") E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore " "settings: {opts}") - E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}") + E107 = ("Value of custom attribute `{attr}` is not JSON-serializable: {value}") E109 = ("Component '{name}' could not be run. Did you forget to " "call `initialize()`?") E110 = ("Invalid displaCy render wrapper. 
Expected callable, got: {obj}") diff --git a/spacy/schemas.py b/spacy/schemas.py index 9f91451a9..048082134 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -514,6 +514,14 @@ class DocJSONSchema(BaseModel): tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field( ..., title="Token information - ID, start, annotations" ) - _: Optional[Dict[StrictStr, Any]] = Field( - None, title="Any custom data stored in the document's _ attribute" + underscore_doc: Optional[Dict[StrictStr, Any]] = Field( + None, + title="Any custom data stored in the document's _ attribute", + alias="_", + ) + underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field( + None, title="Any custom data stored in the token's _ attribute" + ) + underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field( + None, title="Any custom data stored in the span's _ attribute" ) diff --git a/spacy/tests/doc/test_json_doc_conversion.py b/spacy/tests/doc/test_json_doc_conversion.py index 85e4def29..0d7c061c9 100644 --- a/spacy/tests/doc/test_json_doc_conversion.py +++ b/spacy/tests/doc/test_json_doc_conversion.py @@ -1,12 +1,15 @@ import pytest import spacy from spacy import schemas -from spacy.tokens import Doc, Span +from spacy.tokens import Doc, Span, Token +import srsly +from .test_underscore import clean_underscore # noqa: F401 @pytest.fixture() def doc(en_vocab): words = ["c", "d", "e"] + spaces = [True, True, True] pos = ["VERB", "NOUN", "NOUN"] tags = ["VBP", "NN", "NN"] heads = [0, 0, 1] @@ -17,6 +20,7 @@ def doc(en_vocab): return Doc( en_vocab, words=words, + spaces=spaces, pos=pos, tags=tags, heads=heads, @@ -45,6 +49,47 @@ def doc_without_deps(en_vocab): ) +@pytest.fixture() +def doc_json(): + return { + "text": "c d e ", + "ents": [{"start": 2, "end": 3, "label": "ORG"}], + "sents": [{"start": 0, "end": 5}], + "tokens": [ + { + "id": 0, + "start": 0, + "end": 1, + "tag": "VBP", + "pos": "VERB", + "morph": "Feat1=A", + "dep": "ROOT", + "head": 0, + }, + { + "id": 1, + "start": 2, + "end": 3, + "tag": "NN", + "pos": "NOUN", + "morph": "Feat1=B", + "dep": "dobj", + "head": 0, + }, + { + "id": 2, + "start": 4, + "end": 5, + "tag": "NN", + "pos": "NOUN", + "morph": "Feat1=A|Feat2=D", + "dep": "dobj", + "head": 1, + }, + ], + } + + def test_doc_to_json(doc): json_doc = doc.to_json() assert json_doc["text"] == "c d e " @@ -56,7 +101,8 @@ def test_doc_to_json(doc): assert json_doc["ents"][0]["start"] == 2 # character offset! assert json_doc["ents"][0]["end"] == 3 # character offset! 
assert json_doc["ents"][0]["label"] == "ORG" - assert not schemas.validate(schemas.DocJSONSchema, json_doc) + assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0 + assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc def test_doc_to_json_underscore(doc): @@ -64,11 +110,96 @@ def test_doc_to_json_underscore(doc): Doc.set_extension("json_test2", default=False) doc._.json_test1 = "hello world" doc._.json_test2 = [1, 2, 3] + json_doc = doc.to_json(underscore=["json_test1", "json_test2"]) assert "_" in json_doc assert json_doc["_"]["json_test1"] == "hello world" assert json_doc["_"]["json_test2"] == [1, 2, 3] - assert not schemas.validate(schemas.DocJSONSchema, json_doc) + assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0 + assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc + + +def test_doc_to_json_with_token_span_attributes(doc): + Doc.set_extension("json_test1", default=False) + Doc.set_extension("json_test2", default=False) + Token.set_extension("token_test", default=False) + Span.set_extension("span_test", default=False) + + doc._.json_test1 = "hello world" + doc._.json_test2 = [1, 2, 3] + doc[0:1]._.span_test = "span_attribute" + doc[0]._.token_test = 117 + doc.spans["span_group"] = [doc[0:1]] + json_doc = doc.to_json( + underscore=["json_test1", "json_test2", "token_test", "span_test"] + ) + + assert "_" in json_doc + assert json_doc["_"]["json_test1"] == "hello world" + assert json_doc["_"]["json_test2"] == [1, 2, 3] + assert "underscore_token" in json_doc + assert "underscore_span" in json_doc + assert json_doc["underscore_token"]["token_test"]["value"] == 117 + assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute" + assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0 + assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc + + +def test_doc_to_json_with_custom_user_data(doc): + Doc.set_extension("json_test", default=False) + Token.set_extension("token_test", default=False) + Span.set_extension("span_test", default=False) + + doc._.json_test = "hello world" + doc[0:1]._.span_test = "span_attribute" + doc[0]._.token_test = 117 + json_doc = doc.to_json(underscore=["json_test", "token_test", "span_test"]) + doc.user_data["user_data_test"] = 10 + doc.user_data[("user_data_test2", True)] = 10 + + assert "_" in json_doc + assert json_doc["_"]["json_test"] == "hello world" + assert "underscore_token" in json_doc + assert "underscore_span" in json_doc + assert json_doc["underscore_token"]["token_test"]["value"] == 117 + assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute" + assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0 + assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc + + +def test_doc_to_json_with_token_span_same_identifier(doc): + Doc.set_extension("my_ext", default=False) + Token.set_extension("my_ext", default=False) + Span.set_extension("my_ext", default=False) + + doc._.my_ext = "hello world" + doc[0:1]._.my_ext = "span_attribute" + doc[0]._.my_ext = 117 + json_doc = doc.to_json(underscore=["my_ext"]) + + assert "_" in json_doc + assert json_doc["_"]["my_ext"] == "hello world" + assert "underscore_token" in json_doc + assert "underscore_span" in json_doc + assert json_doc["underscore_token"]["my_ext"]["value"] == 117 + assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute" + assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0 + assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc + 
+ +def test_doc_to_json_with_token_attributes_missing(doc): + Token.set_extension("token_test", default=False) + Span.set_extension("span_test", default=False) + + doc[0:1]._.span_test = "span_attribute" + doc[0]._.token_test = 117 + json_doc = doc.to_json(underscore=["span_test"]) + + assert "underscore_token" in json_doc + assert "underscore_span" in json_doc + assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute" + assert "token_test" not in json_doc["underscore_token"] + assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0 def test_doc_to_json_underscore_error_attr(doc): @@ -94,11 +225,29 @@ def test_doc_to_json_span(doc): assert len(json_doc["spans"]) == 1 assert len(json_doc["spans"]["test"]) == 2 assert json_doc["spans"]["test"][0]["start"] == 0 - assert not schemas.validate(schemas.DocJSONSchema, json_doc) + assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0 def test_json_to_doc(doc): - new_doc = Doc(doc.vocab).from_json(doc.to_json(), validate=True) + json_doc = doc.to_json() + json_doc = srsly.json_loads(srsly.json_dumps(json_doc)) + new_doc = Doc(doc.vocab).from_json(json_doc, validate=True) + assert new_doc.text == doc.text == "c d e " + assert len(new_doc) == len(doc) == 3 + assert new_doc[0].pos == doc[0].pos + assert new_doc[0].tag == doc[0].tag + assert new_doc[0].dep == doc[0].dep + assert new_doc[0].head.idx == doc[0].head.idx + assert new_doc[0].lemma == doc[0].lemma + assert len(new_doc.ents) == 1 + assert new_doc.ents[0].start == 1 + assert new_doc.ents[0].end == 2 + assert new_doc.ents[0].label_ == "ORG" + assert doc.to_bytes() == new_doc.to_bytes() + + +def test_json_to_doc_compat(doc, doc_json): + new_doc = Doc(doc.vocab).from_json(doc_json, validate=True) new_tokens = [token for token in new_doc] assert new_doc.text == doc.text == "c d e " assert len(new_tokens) == len([token for token in doc]) == 3 @@ -114,11 +263,8 @@ def test_json_to_doc(doc): def test_json_to_doc_underscore(doc): - if not Doc.has_extension("json_test1"): - Doc.set_extension("json_test1", default=False) - if not Doc.has_extension("json_test2"): - Doc.set_extension("json_test2", default=False) - + Doc.set_extension("json_test1", default=False) + Doc.set_extension("json_test2", default=False) doc._.json_test1 = "hello world" doc._.json_test2 = [1, 2, 3] json_doc = doc.to_json(underscore=["json_test1", "json_test2"]) @@ -126,6 +272,34 @@ def test_json_to_doc_underscore(doc): assert all([new_doc.has_extension(f"json_test{i}") for i in range(1, 3)]) assert new_doc._.json_test1 == "hello world" assert new_doc._.json_test2 == [1, 2, 3] + assert doc.to_bytes() == new_doc.to_bytes() + + +def test_json_to_doc_with_token_span_attributes(doc): + Doc.set_extension("json_test1", default=False) + Doc.set_extension("json_test2", default=False) + Token.set_extension("token_test", default=False) + Span.set_extension("span_test", default=False) + doc._.json_test1 = "hello world" + doc._.json_test2 = [1, 2, 3] + doc[0:1]._.span_test = "span_attribute" + doc[0]._.token_test = 117 + + json_doc = doc.to_json( + underscore=["json_test1", "json_test2", "token_test", "span_test"] + ) + json_doc = srsly.json_loads(srsly.json_dumps(json_doc)) + new_doc = Doc(doc.vocab).from_json(json_doc, validate=True) + + assert all([new_doc.has_extension(f"json_test{i}") for i in range(1, 3)]) + assert new_doc._.json_test1 == "hello world" + assert new_doc._.json_test2 == [1, 2, 3] + assert new_doc[0]._.token_test == 117 + assert new_doc[0:1]._.span_test == "span_attribute" + assert 
new_doc.user_data == doc.user_data + assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes( + exclude=["user_data"] + ) def test_json_to_doc_spans(doc): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index d9a104ac8..7ba9a3341 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1602,13 +1602,30 @@ cdef class Doc: ents.append(char_span) self.ents = ents - # Add custom attributes. Note that only Doc extensions are currently considered, Token and Span extensions are - # not yet supported. + # Add custom attributes for the whole Doc object. for attr in doc_json.get("_", {}): if not Doc.has_extension(attr): Doc.set_extension(attr) self._.set(attr, doc_json["_"][attr]) + if doc_json.get("underscore_token", {}): + for token_attr in doc_json["underscore_token"]: + token_start = doc_json["underscore_token"][token_attr]["token_start"] + value = doc_json["underscore_token"][token_attr]["value"] + + if not Token.has_extension(token_attr): + Token.set_extension(token_attr) + self[token_start]._.set(token_attr, value) + + if doc_json.get("underscore_span", {}): + for span_attr in doc_json["underscore_span"]: + token_start = doc_json["underscore_span"][span_attr]["token_start"] + token_end = doc_json["underscore_span"][span_attr]["token_end"] + value = doc_json["underscore_span"][span_attr]["value"] + + if not Span.has_extension(span_attr): + Span.set_extension(span_attr) + self[token_start:token_end]._.set(span_attr, value) return self def to_json(self, underscore=None): @@ -1650,20 +1667,40 @@ cdef class Doc: for span_group in self.spans: data["spans"][span_group] = [] for span in self.spans[span_group]: - span_data = { - "start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_ - } + span_data = {"start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_} data["spans"][span_group].append(span_data) if underscore: - data["_"] = {} + user_keys = set() + if self.user_data: + data["_"] = {} + data["underscore_token"] = {} + data["underscore_span"] = {} + for data_key in self.user_data: + if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.": + attr = data_key[1] + start = data_key[2] + end = data_key[3] + if attr in underscore: + user_keys.add(attr) + value = self.user_data[data_key] + if not srsly.is_json_serializable(value): + raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) + # Check if doc attribute + if start is None: + data["_"][attr] = value + # Check if token attribute + elif end is None: + if attr not in data["underscore_token"]: + data["underscore_token"][attr] = {"token_start": start, "value": value} + # Else span attribute + else: + if attr not in data["underscore_span"]: + data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value} + for attr in underscore: - if not self.has_extension(attr): + if attr not in user_keys: raise ValueError(Errors.E106.format(attr=attr, opts=underscore)) - value = self._.get(attr) - if not srsly.is_json_serializable(value): - raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) - data["_"][attr] = value return data def to_utf8_array(self, int nr_char=-1): From c09d2fa25bae47f0c70a3dde6bc2bc43c044b231 Mon Sep 17 00:00:00 2001 From: Tobius Saul <30893923+tobiusaolo@users.noreply.github.com> Date: Tue, 23 Aug 2022 14:09:36 +0300 Subject: [PATCH 20/20] luganda language extension (#10847) * luganda language extension * __init__.py changes * New enhancements * Lexical attribute changed * 
punctuaction and sentence additions * Remove comment header * Fix typos, reformat * reformated version * Add tokenizer test * Remove contractions from stop words * Format * Add Luganda to website Co-authored-by: Adriane Boyd --- spacy/lang/lg/__init__.py | 18 +++++ spacy/lang/lg/examples.py | 17 +++++ spacy/lang/lg/lex_attrs.py | 95 +++++++++++++++++++++++++++ spacy/lang/lg/punctuation.py | 19 ++++++ spacy/lang/lg/stop_words.py | 19 ++++++ spacy/tests/conftest.py | 5 ++ spacy/tests/lang/lg/__init__.py | 0 spacy/tests/lang/lg/test_tokenizer.py | 15 +++++ website/meta/languages.json | 5 ++ 9 files changed, 193 insertions(+) create mode 100644 spacy/lang/lg/__init__.py create mode 100644 spacy/lang/lg/examples.py create mode 100644 spacy/lang/lg/lex_attrs.py create mode 100644 spacy/lang/lg/punctuation.py create mode 100644 spacy/lang/lg/stop_words.py create mode 100644 spacy/tests/lang/lg/__init__.py create mode 100644 spacy/tests/lang/lg/test_tokenizer.py diff --git a/spacy/lang/lg/__init__.py b/spacy/lang/lg/__init__.py new file mode 100644 index 000000000..6f7153fce --- /dev/null +++ b/spacy/lang/lg/__init__.py @@ -0,0 +1,18 @@ +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_INFIXES +from ...language import Language, BaseDefaults + + +class LugandaDefaults(BaseDefaults): + lex_attr_getters = LEX_ATTRS + infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS + + +class Luganda(Language): + lang = "lg" + Defaults = LugandaDefaults + + +__all__ = ["Luganda"] diff --git a/spacy/lang/lg/examples.py b/spacy/lang/lg/examples.py new file mode 100644 index 000000000..5450c5520 --- /dev/null +++ b/spacy/lang/lg/examples.py @@ -0,0 +1,17 @@ +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.lg.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +sentences = [ + "Mpa ebyafaayo ku byalo Nakatu ne Nkajja", + "Okuyita Ttembo kitegeeza kugwa ddalu", + "Ekifumu kino kyali kya mulimu ki?", + "Ekkovu we liyise wayitibwa mukululo", + "Akola mulimu ki oguvaamu ssente?", + "Emisumaali egikomerera embaawo giyitibwa nninga", + "Abooluganda ab’emmamba ababiri", + "Ekisaawe ky'ebyenjigiriza kya mugaso nnyo", +] diff --git a/spacy/lang/lg/lex_attrs.py b/spacy/lang/lg/lex_attrs.py new file mode 100644 index 000000000..3c60e3d0e --- /dev/null +++ b/spacy/lang/lg/lex_attrs.py @@ -0,0 +1,95 @@ +from ...attrs import LIKE_NUM + +_num_words = [ + "nnooti", # Zero + "zeero", # zero + "emu", # one + "bbiri", # two + "ssatu", # three + "nnya", # four + "ttaano", # five + "mukaaga", # six + "musanvu", # seven + "munaana", # eight + "mwenda", # nine + "kkumi", # ten + "kkumi n'emu", # eleven + "kkumi na bbiri", # twelve + "kkumi na ssatu", # thirteen + "kkumi na nnya", # forteen + "kkumi na ttaano", # fifteen + "kkumi na mukaaga", # sixteen + "kkumi na musanvu", # seventeen + "kkumi na munaana", # eighteen + "kkumi na mwenda", # nineteen + "amakumi abiri", # twenty + "amakumi asatu", # thirty + "amakumi ana", # forty + "amakumi ataano", # fifty + "nkaaga", # sixty + "nsanvu", # seventy + "kinaana", # eighty + "kyenda", # ninety + "kikumi", # hundred + "lukumi", # thousand + "kakadde", # million + "kawumbi", # billion + "kase", # trillion + "katabalika", # quadrillion + "keesedde", # gajillion + "kafukunya", # bazillion + "ekisooka", # first + "ekyokubiri", # second + "ekyokusatu", # third + "ekyokuna", # fourth + "ekyokutaano", # fifith + "ekyomukaaga", # sixth + "ekyomusanvu", # seventh + "eky'omunaana", # eighth + "ekyomwenda", # nineth + "ekyekkumi", # tenth + "ekyekkumi n'ekimu", # eleventh + "ekyekkumi n'ebibiri", # twelveth + "ekyekkumi n'ebisatu", # thirteenth + "ekyekkumi n'ebina", # fourteenth + "ekyekkumi n'ebitaano", # fifteenth + "ekyekkumi n'omukaaga", # sixteenth + "ekyekkumi n'omusanvu", # seventeenth + "ekyekkumi n'omunaana", # eigteenth + "ekyekkumi n'omwenda", # nineteenth + "ekyamakumi abiri", # twentieth + "ekyamakumi asatu", # thirtieth + "ekyamakumi ana", # fortieth + "ekyamakumi ataano", # fiftieth + "ekyenkaaga", # sixtieth + "ekyensanvu", # seventieth + "ekyekinaana", # eightieth + "ekyekyenda", # ninetieth + "ekyekikumi", # hundredth + "ekyolukumi", # thousandth + "ekyakakadde", # millionth + "ekyakawumbi", # billionth + "ekyakase", # trillionth + "ekyakatabalika", # quadrillionth + "ekyakeesedde", # gajillionth + "ekyakafukunya", # bazillionth +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + text_lower = text.lower() + if text_lower in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/lg/punctuation.py b/spacy/lang/lg/punctuation.py new file mode 100644 index 000000000..5d3eb792e --- /dev/null +++ b/spacy/lang/lg/punctuation.py @@ -0,0 +1,19 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, 
q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/lg/stop_words.py b/spacy/lang/lg/stop_words.py new file mode 100644 index 000000000..7bad59344 --- /dev/null +++ b/spacy/lang/lg/stop_words.py @@ -0,0 +1,19 @@ +STOP_WORDS = set( + """ +abadde abalala abamu abangi abava ajja ali alina ani anti ateekeddwa atewamu +atya awamu aweebwa ayinza ba baali babadde babalina bajja +bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye +bimu bingi bino bo bokka bonna buli bulijjo bulungi bwabwe bwaffe bwayo bwe bwonna bya byabwe +byaffe byebimu byonna ddaa ddala ddi e ebimu ebiri ebweruobulungi ebyo edda ejja ekirala ekyo +endala engeri ennyo era erimu erina ffe ffenna ga gujja gumu gunno guno gwa gwe kaseera kati +kennyini ki kiki kikino kikye kikyo kino kirungi kki ku kubangabyombi kubangaolwokuba kudda +kuva kuwa kwegamba kyaffe kye kyekimuoyo kyekyo kyonna leero liryo lwa lwaki lyabwezaabwe +lyaffe lyange mbadde mingi mpozzi mu mulinaoyina munda mwegyabwe nolwekyo nabadde nabo nandiyagadde +nandiye nanti naye ne nedda neera nga nnyingi nnyini nnyinza nnyo nti nyinza nze oba ojja okudda +okugenda okuggyako okutuusa okuva okuwa oli olina oluvannyuma olwekyobuva omuli ono osobola otya +oyina oyo seetaaga si sinakindi singa talina tayina tebaali tebaalina tebayina terina tetulina +tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula +wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe +ye yenna yennyini yina yonna ziba zijja zonna +""".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index eb643ec2f..5193bd301 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -261,6 +261,11 @@ def lb_tokenizer(): return get_lang_class("lb")().tokenizer +@pytest.fixture(scope="session") +def lg_tokenizer(): + return get_lang_class("lg")().tokenizer + + @pytest.fixture(scope="session") def lt_tokenizer(): return get_lang_class("lt")().tokenizer diff --git a/spacy/tests/lang/lg/__init__.py b/spacy/tests/lang/lg/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/lg/test_tokenizer.py b/spacy/tests/lang/lg/test_tokenizer.py new file mode 100644 index 000000000..958385a77 --- /dev/null +++ b/spacy/tests/lang/lg/test_tokenizer.py @@ -0,0 +1,15 @@ +import pytest + +LG_BASIC_TOKENIZATION_TESTS = [ + ( + "Abooluganda ab’emmamba ababiri", + ["Abooluganda", "ab’emmamba", "ababiri"], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", LG_BASIC_TOKENIZATION_TESTS) +def test_lg_tokenizer_basic(lg_tokenizer, text, expected_tokens): + tokens = lg_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/website/meta/languages.json b/website/meta/languages.json index 87c91f791..79e1fc5d5 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -265,6 +265,11 @@ "name": "Luxembourgish", "has_examples": true }, + { + "code": "lg", + "name": "Luganda", + "has_examples": true + }, { "code": "lij", "name": "Ligurian",
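As a quick, informal check of the Luganda support added in the last patch, the new `lg` language code can be used with `spacy.blank` and the tokenizer run over one of the sentences shipped in `spacy/lang/lg/examples.py`. This is a minimal sketch that assumes a spaCy build containing the patch; the expected tokens simply mirror the tokenizer test added above.

```python
import spacy

# Create a blank Luganda pipeline via the newly registered "lg" language code
nlp = spacy.blank("lg")

# One of the example sentences from spacy/lang/lg/examples.py
doc = nlp("Abooluganda ab’emmamba ababiri")
tokens = [token.text for token in doc if not token.is_space]
print(tokens)
# Expected, per the new tokenizer test: ["Abooluganda", "ab’emmamba", "ababiri"]
```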