Merge pull request #12218 from adrianeboyd/chore/update-v4-from-master-7

Update v4 from master
This commit is contained in:
Sofie Van Landeghem 2023-02-03 12:04:20 +01:00 committed by GitHub
commit c47ec5b5c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 96 additions and 45 deletions

View File

@ -175,6 +175,18 @@ def test_modify_span_group(doc):
assert group[0].label == doc.vocab.strings["TEST"] assert group[0].label == doc.vocab.strings["TEST"]
def test_char_span_attributes(doc):
label = "LABEL"
kb_id = "KB_ID"
span_id = "SPAN_ID"
span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
assert span1.text == span2.text
assert span1.label_ == span2.label_ == label
assert span1.kb_id_ == span2.kb_id_ == kb_id
assert span1.id_ == span2.id_ == span_id
def test_spans_sent_spans(doc): def test_spans_sent_spans(doc):
sents = list(doc.sents) sents = list(doc.sents)
assert sents[0].start == 0 assert sents[0].start == 0
@ -354,6 +366,14 @@ def test_spans_by_character(doc):
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk" span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
) )
# Span.char_span + alignment mode "contract"
span2 = doc[0:2].char_span(
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
)
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
def test_span_to_array(doc): def test_span_to_array(doc):
span = doc[1:-2] span = doc[1:-2]

View File

@ -1017,8 +1017,6 @@ def test_local_remote_storage_pull_missing():
def test_cli_find_threshold(capsys): def test_cli_find_threshold(capsys):
thresholds = numpy.linspace(0, 1, 10)
def make_examples(nlp: Language) -> List[Example]: def make_examples(nlp: Language) -> List[Example]:
docs: List[Example] = [] docs: List[Example] = []
@ -1082,8 +1080,6 @@ def test_cli_find_threshold(capsys):
scores_key="cats_macro_f", scores_key="cats_macro_f",
silent=True, silent=True,
) )
assert best_threshold != thresholds[0]
assert thresholds[0] < best_threshold < thresholds[9]
assert best_score == max(res.values()) assert best_score == max(res.values())
assert res[1.0] == 0.0 assert res[1.0] == 0.0
@ -1091,7 +1087,7 @@ def test_cli_find_threshold(capsys):
nlp, _ = init_nlp((("spancat", {}),)) nlp, _ = init_nlp((("spancat", {}),))
with make_tempdir() as nlp_dir: with make_tempdir() as nlp_dir:
nlp.to_disk(nlp_dir) nlp.to_disk(nlp_dir)
res = find_threshold( best_threshold, best_score, res = find_threshold(
model=nlp_dir, model=nlp_dir,
data_path=docs_dir / "docs.spacy", data_path=docs_dir / "docs.spacy",
pipe_name="spancat", pipe_name="spancat",
@ -1099,10 +1095,8 @@ def test_cli_find_threshold(capsys):
scores_key="spans_sc_f", scores_key="spans_sc_f",
silent=True, silent=True,
) )
assert res[0] != thresholds[0] assert best_score == max(res.values())
assert thresholds[0] < res[0] < thresholds[8] assert res[1.0] == 0.0
assert res[1] >= 0.6
assert res[2][1.0] == 0.0
# Having multiple textcat_multilabel components should work, since the name has to be specified. # Having multiple textcat_multilabel components should work, since the name has to be specified.
nlp, _ = init_nlp((("textcat_multilabel", {}),)) nlp, _ = init_nlp((("textcat_multilabel", {}),))

View File

@ -4,7 +4,7 @@ from typer.testing import CliRunner
from spacy.tokens import DocBin, Doc from spacy.tokens import DocBin, Doc
from spacy.cli._util import app from spacy.cli._util import app
from .util import make_tempdir from .util import make_tempdir, normalize_whitespace
def test_convert_auto(): def test_convert_auto():
@ -38,8 +38,8 @@ def test_benchmark_accuracy_alias():
# Verify that the `evaluate` alias works correctly. # Verify that the `evaluate` alias works correctly.
result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"]) result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"]) result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
assert result_benchmark.stdout == result_evaluate.stdout.replace( assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
"spacy evaluate", "spacy benchmark accuracy" result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
) )

View File

@ -1,6 +1,7 @@
import numpy import numpy
import tempfile import tempfile
import contextlib import contextlib
import re
import srsly import srsly
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.vocab import Vocab from spacy.vocab import Vocab
@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2):
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())): for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
assert k1 == k2 assert k1 == k2
assert v1 == v2 assert v1 == v2
def normalize_whitespace(s):
return re.sub(r"\s+", " ", s)

View File

@ -108,6 +108,7 @@ class Doc:
kb_id: Union[int, str] = ..., kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ..., vector: Optional[Floats1d] = ...,
alignment_mode: str = ..., alignment_mode: str = ...,
span_id: Union[int, str] = ...,
) -> Span: ... ) -> Span: ...
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
@property @property

View File

@ -528,9 +528,9 @@ cdef class Doc:
doc (Doc): The parent document. doc (Doc): The parent document.
start_idx (int): The index of the first character of the span. start_idx (int): The index of the first character of the span.
end_idx (int): The index of the first character after the span. end_idx (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for label (Union[int, str]): A label to attach to the Span, e.g. for
named entities. named entities.
kb_id (uint64 or string): An ID from a KB to capture the meaning of a kb_id (Union[int, str]): An ID from a KB to capture the meaning of a
named entity. named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span. the span.
@ -539,6 +539,7 @@ cdef class Doc:
with token boundaries), "contract" (span of all tokens completely with token boundaries), "contract" (span of all tokens completely
within the character span), "expand" (span of all tokens at least within the character span), "expand" (span of all tokens at least
partially covered by the character span). Defaults to "strict". partially covered by the character span). Defaults to "strict".
span_id (Union[int, str]): An identifier to associate with the span.
RETURNS (Span): The newly constructed object. RETURNS (Span): The newly constructed object.
DOCS: https://spacy.io/api/doc#char_span DOCS: https://spacy.io/api/doc#char_span

View File

@ -96,6 +96,9 @@ class Span:
label: Union[int, str] = ..., label: Union[int, str] = ...,
kb_id: Union[int, str] = ..., kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ..., vector: Optional[Floats1d] = ...,
id: Union[int, str] = ...,
alignment_mode: str = ...,
span_id: Union[int, str] = ...,
) -> Span: ... ) -> Span: ...
@property @property
def conjuncts(self) -> Tuple[Token]: ... def conjuncts(self) -> Tuple[Token]: ...

View File

@ -656,22 +656,29 @@ cdef class Span:
else: else:
return self.doc[root] return self.doc[root]
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0): def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
"""Create a `Span` object from the slice `span.text[start : end]`. """Create a `Span` object from the slice `span.text[start : end]`.
start (int): The index of the first character of the span. start (int): The index of the first character of the span.
end (int): The index of the first character after the span. end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for label (Union[int, str]): A label to attach to the Span, e.g. for
named entities. named entities.
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span. the span.
id (Union[int, str]): Unused.
alignment_mode (str): How character indices are aligned to token
boundaries. Options: "strict" (character indices must be aligned
with token boundaries), "contract" (span of all tokens completely
within the character span), "expand" (span of all tokens at least
partially covered by the character span). Defaults to "strict".
span_id (Union[int, str]): An identifier to associate with the span.
RETURNS (Span): The newly constructed object. RETURNS (Span): The newly constructed object.
""" """
cdef SpanC* span_c = self.span_c() cdef SpanC* span_c = self.span_c()
start_idx += span_c.start_char start_idx += span_c.start_char
end_idx += span_c.start_char end_idx += span_c.start_char
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector) return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
@property @property
def conjuncts(self): def conjuncts(self):

View File

@ -1410,12 +1410,13 @@ $ python -m spacy project assets [project_dir]
> $ python -m spacy project assets [--sparse] > $ python -m spacy project assets [--sparse]
> ``` > ```
| Name | Description | | Name | Description |
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ | | `--extra`, `-e` <Tag variant="new">3.3.1</Tag> | Download assets marked as "extra". Default false. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ |
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
### project run {id="project-run",tag="command"} ### project run {id="project-run",tag="command"}

View File

@ -37,7 +37,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ | | `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | | `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
@ -209,15 +209,16 @@ alignment mode `"strict".
> assert span.text == "New York" > assert span.text == "New York"
> ``` > ```
| Name | Description | | Name | Description |
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start` | The index of the first character of the span. ~~int~~ | | `start` | The index of the first character of the span. ~~int~~ |
| `end` | The index of the last character after the span. ~~int~~ | | `end` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | | `span_id` <Tag variant="new">3.3.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
## Doc.set_ents {id="set_ents",tag="method",version="3"} ## Doc.set_ents {id="set_ents",tag="method",version="3"}

View File

@ -186,14 +186,17 @@ the character indices don't map to a valid span.
> assert span.text == "New York" > assert span.text == "New York"
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ----------------------------------------------------------------------------------------- | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start` | The index of the first character of the span. ~~int~~ | | `start` | The index of the first character of the span. ~~int~~ |
| `end` | The index of the last character after the span. ~~int~~ | | `end` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | | `id` | Unused. ~~Union[int, str]~~ |
| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| `span_id` <Tag variant="new">3.5.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
## Span.similarity {id="similarity",tag="method",model="vectors"} ## Span.similarity {id="similarity",tag="method",model="vectors"}

View File

@ -21,8 +21,8 @@ menu:
## Package naming conventions {id="conventions"} ## Package naming conventions {id="conventions"}
In general, spaCy expects all pipeline packages to follow the naming convention In general, spaCy expects all pipeline packages to follow the naming convention
of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name into
into three components: three components:
1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with 1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
tagging, parsing, lemmatization and named entity recognition, or `dep` for tagging, parsing, lemmatization and named entity recognition, or `dep` for

View File

@ -155,6 +155,21 @@ An error is now raised when unsupported values are given as input to train a
`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0` `textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
as explained in the [docs](/api/textcategorizer#assigned-attributes). as explained in the [docs](/api/textcategorizer#assigned-attributes).
### Using the default knowledge base
As `KnowledgeBase` is now an abstract class, you should call the constructor of
the new `InMemoryLookupKB` instead when you want to use spaCy's default KB
implementation:
```diff
- kb = KnowledgeBase()
+ kb = InMemoryLookupKB()
```
If you've written a custom KB that inherits from `KnowledgeBase`, you'll need to
implement its abstract methods, or alternatively inherit from `InMemoryLookupKB`
instead.
### Updated scorers for tokenization and textcat {id="scores"} ### Updated scorers for tokenization and textcat {id="scores"}
We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported