Merge branch 'master' into fix/windows-quoting

Paul O'Leary McCann 2022-10-18 15:21:21 +09:00
commit fb1d671ed4
22 changed files with 177 additions and 49 deletions

View File

@@ -10,7 +10,7 @@ about: Use this template if you came across a bug or unexpected behaviour differ
 <!-- Include a code example or the steps that led to the problem. Please try to be as specific as possible. -->

 ## Your Environment
-<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
+<!-- Include details of your environment. You can also type `python -m spacy info --markdown` and copy-paste the result here.-->
 * Operating System:
 * Python Version Used:
 * spaCy Version Used:

View File

@@ -27,7 +27,7 @@ steps:
   - script: python -m mypy spacy
     displayName: 'Run mypy'
-    condition: ne(variables['python_version'], '3.10')
+    condition: ne(variables['python_version'], '3.6')

   - task: DeleteFiles@1
     inputs:

View File

@@ -6,7 +6,7 @@ repos:
         language_version: python3.7
         additional_dependencies: ['click==8.0.4']
   - repo: https://gitlab.com/pycqa/flake8
-    rev: 3.9.2
+    rev: 5.0.4
     hooks:
       - id: flake8
         args:

View File

@@ -31,7 +31,7 @@ jobs:
         inputs:
           versionSpec: "3.7"
       - script: |
-          pip install flake8==3.9.2
+          pip install flake8==5.0.4
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
        displayName: "flake8"

View File

@@ -15,7 +15,7 @@ pathy>=0.3.5
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
 jinja2
 langcodes>=3.2.0,<4.0.0
 # Official Python utilities
@@ -28,9 +28,9 @@ cython>=0.25,<3.0
 pytest>=5.2.0,!=7.1.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.8.0,<3.10.0
+flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.910,<0.970; platform_machine!='aarch64'
+mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
 types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-setuptools>=57.0.0

View File

@@ -56,7 +56,7 @@ install_requires =
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
-    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
     jinja2
     # Official Python utilities
     setuptools

View File

@@ -573,3 +573,12 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
         local_msg.info("Using CPU")
         if gpu_is_available():
             local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
+
+
+def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
+    """Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
+    as happens with `round(number, ndigits)`"""
+    if isinstance(number, float):
+        return f"{number:.{ndigits}f}"
+    else:
+        return str(number)
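The point of the new helper is that `round()` returns a float, so trailing zeros disappear and the table columns produced by `debug data` become uneven. A minimal sketch of the difference (illustrative only; `_format_number` is private API in `spacy.cli._util` after this change):

```python
# Assumes a spaCy checkout/install that includes the helper added above.
from spacy.cli._util import _format_number

print(round(2.50, 2))        # 2.5  -> trailing zero is lost
print(_format_number(2.50))  # 2.50 -> always two decimals for floats
print(_format_number(7))     # 7    -> ints are returned unchanged
```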

View File

@@ -9,7 +9,7 @@ import typer
 import math

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli
+from ._util import import_code, debug_cli, _format_number
 from ..training import Example, remove_bilu_prefix
 from ..training.initialize import get_sourced_components
 from ..schemas import ConfigSchemaTraining
@@ -989,7 +989,8 @@ def _get_kl_divergence(p: Counter, q: Counter) -> float:

 def _format_span_row(span_data: List[Dict], labels: List[str]) -> List[Any]:
     """Compile into one list for easier reporting"""
     d = {
-        label: [label] + list(round(d[label], 2) for d in span_data) for label in labels
+        label: [label] + list(_format_number(d[label]) for d in span_data)
+        for label in labels
     }
     return list(d.values())
@@ -1004,6 +1005,10 @@ def _get_span_characteristics(
         label: _gmean(l)
         for label, l in compiled_gold["spans_length"][spans_key].items()
     }
+    spans_per_type = {
+        label: len(spans)
+        for label, spans in compiled_gold["spans_per_type"][spans_key].items()
+    }
     min_lengths = [min(l) for l in compiled_gold["spans_length"][spans_key].values()]
     max_lengths = [max(l) for l in compiled_gold["spans_length"][spans_key].values()]
@@ -1031,6 +1036,7 @@ def _get_span_characteristics(
     return {
         "sd": span_distinctiveness,
         "bd": sb_distinctiveness,
+        "spans_per_type": spans_per_type,
         "lengths": span_length,
         "min_length": min(min_lengths),
         "max_length": max(max_lengths),
@@ -1045,12 +1051,15 @@
 def _print_span_characteristics(span_characteristics: Dict[str, Any]):
     """Print all span characteristics into a table"""
-    headers = ("Span Type", "Length", "SD", "BD")
+    headers = ("Span Type", "Length", "SD", "BD", "N")
+    # Wasabi has this at 30 by default, but we might have some long labels
+    max_col = max(30, max(len(label) for label in span_characteristics["labels"]))
     # Prepare table data with all span characteristics
     table_data = [
         span_characteristics["lengths"],
         span_characteristics["sd"],
         span_characteristics["bd"],
+        span_characteristics["spans_per_type"],
     ]
     table = _format_span_row(
         span_data=table_data, labels=span_characteristics["labels"]
@@ -1061,8 +1070,18 @@ def _print_span_characteristics(span_characteristics: Dict[str, Any]):
         span_characteristics["avg_sd"],
         span_characteristics["avg_bd"],
     ]
-    footer = ["Wgt. Average"] + [str(round(f, 2)) for f in footer_data]
-    msg.table(table, footer=footer, header=headers, divider=True)
+    footer = (
+        ["Wgt. Average"] + ["{:.2f}".format(round(f, 2)) for f in footer_data] + ["-"]
+    )
+    msg.table(
+        table,
+        footer=footer,
+        header=headers,
+        divider=True,
+        aligns=["l"] + ["r"] * (len(footer_data) + 1),
+        max_col=max_col,
+    )


 def _get_spans_length_freq_dist(

View File

@@ -299,8 +299,8 @@ def get_meta(
     }
     nlp = util.load_model_from_path(Path(model_path))
     meta.update(nlp.meta)
-    meta.update(existing_meta)
     meta["spacy_version"] = util.get_minor_version_range(about.__version__)
+    meta.update(existing_meta)
     meta["vectors"] = {
         "width": nlp.vocab.vectors_length,
         "vectors": len(nlp.vocab.vectors),

View File

@@ -110,9 +110,6 @@ def update_dvc_config(
     config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}

     # some flags that apply to every command
-    if verbose and quiet:
-        # don't allow contradictions
-        msg.fail("Can't set both --verbose and --quiet", exits=1)
     flags = []
     if verbose:
         flags.append("--verbose")

View File

@@ -23,7 +23,7 @@ class RussianLemmatizer(Lemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:

View File

@@ -18,7 +18,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:
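Both the Russian lemmatizer above and the Ukrainian one here now accept `pymorphy2_lookup` in addition to `pymorphy2`. A minimal sketch of enabling the new mode (assumes `pymorphy2` is installed; for Ukrainian you also need `pymorphy2-dicts-uk`, as the test fixtures further down require):

```python
import spacy

# The same pattern works with spacy.blank("uk") if pymorphy2-dicts-uk is installed.
nlp = spacy.blank("ru")
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})
doc = nlp("мама мыла раму")
print([token.lemma_ for token in doc])
```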

View File

@@ -1,7 +1,6 @@
 from typing import cast, Any, Callable, Dict, Iterable, List, Optional
-from typing import Sequence, Tuple, Union
+from typing import Tuple
 from collections import Counter
-from copy import deepcopy
 from itertools import islice

 import numpy as np
@@ -149,9 +148,7 @@ class EditTreeLemmatizer(TrainablePipe):
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             n_labels = len(self.cfg["labels"])
-            guesses: List[Ints2d] = [
-                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
-            ]
+            guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
             assert len(guesses) == n_docs
             return guesses
         scores = self.model.predict(docs)
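`alloc2i` is Thinc's typed shorthand for allocating a two-dimensional integer array, so the rewritten line is equivalent to the older `alloc(..., dtype="i")` spelling. A quick sketch using `NumpyOps` directly (names are from Thinc's public API; shown only to illustrate the equivalence):

```python
from thinc.api import NumpyOps

ops = NumpyOps()
a = ops.alloc2i(0, 5)             # typed: int array with shape (0, 5)
b = ops.alloc((0, 5), dtype="i")  # the older, untyped spelling
print(a.shape, a.dtype, b.dtype)  # (0, 5) int32 int32
```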

View File

@@ -1,6 +1,5 @@
-import warnings
 from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
-from typing import cast
+import warnings
 from collections import defaultdict
 from pathlib import Path
 import srsly
@@ -317,7 +316,7 @@ class EntityRuler(Pipe):
                     phrase_pattern["id"] = ent_id
                 phrase_patterns.append(phrase_pattern)
         for entry in token_patterns + phrase_patterns:  # type: ignore[operator]
-            label = entry["label"]
+            label = entry["label"]  # type: ignore
             if "id" in entry:
                 ent_label = label
                 label = self._create_label(label, entry["id"])

View File

@@ -133,6 +133,9 @@ def make_spancat(
     spans_key (str): Key of the doc.spans dict to save the spans under. During
         initialization and training, the component will look for spans on the
         reference document under the same key.
+    scorer (Optional[Callable]): The scoring method. Defaults to
+        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
+        spans allowed.
     threshold (float): Minimum probability to consider a prediction positive.
         Spans with a positive prediction will be saved on the Doc. Defaults to
         0.5.

View File

@@ -96,8 +96,8 @@ def make_multilabel_textcat(
     model: Model[List[Doc], List[Floats2d]],
     threshold: float,
     scorer: Optional[Callable],
-) -> "TextCategorizer":
-    """Create a TextCategorizer component. The text categorizer predicts categories
+) -> "MultiLabel_TextCategorizer":
+    """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
     to be non-mutually exclusive, which means that there can be zero or more labels
     per doc).
@@ -105,6 +105,7 @@ def make_multilabel_textcat(
     model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
         scores for each category.
     threshold (float): Cutoff to consider a prediction "positive".
+    scorer (Optional[Callable]): The scoring method.
     """
     return MultiLabel_TextCategorizer(
         nlp.vocab, model, name, threshold=threshold, scorer=scorer
@@ -147,6 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
+        scorer (Optional[Callable]): The scoring method.

         DOCS: https://spacy.io/api/textcategorizer#init
         """

View File

@@ -181,12 +181,12 @@ class TokenPatternNumber(BaseModel):
     IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
-    EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
-    NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
-    GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
-    LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
-    GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
-    LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
+    EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
+    NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
+    GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
+    LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
+    GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
+    LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")

     class Config:
         extra = "forbid"
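Marking the comparison fields as `Optional` matches how token patterns are actually written: only the operators a pattern needs are set, and the rest stay `None`. For illustration, a `Matcher` pattern exercising two of these operators (standard spaCy API, not part of this diff):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Only ">=" and "<=" are given; the other comparison fields default to None.
matcher.add("SHORT_WORD", [[{"LENGTH": {">=": 2, "<=": 4}}]])
doc = nlp("a neat tokenization example")
print([doc[start:end].text for _, start, end in matcher(doc)])  # ['neat']
```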
@@ -430,7 +430,7 @@ class ProjectConfigAssetURL(BaseModel):
     # fmt: off
     dest: StrictStr = Field(..., title="Destination of downloaded asset")
     url: Optional[StrictStr] = Field(None, title="URL of asset")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: StrictStr = Field("", title="Description of asset")
     # fmt: on
@@ -438,7 +438,7 @@ class ProjectConfigAssetURL(BaseModel):
 class ProjectConfigAssetGit(BaseModel):
     # fmt: off
     git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: Optional[StrictStr] = Field(None, title="Description of asset")
     # fmt: on
@@ -508,9 +508,9 @@ class DocJSONSchema(BaseModel):
         None, title="Indices of sentences' start and end indices"
     )
     text: StrictStr = Field(..., title="Document text")
-    spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
-        None, title="Span information - end/start indices, label, KB ID"
-    )
+    spans: Optional[
+        Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
+    ] = Field(None, title="Span information - end/start indices, label, KB ID")
     tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
         ..., title="Token information - ID, start, annotations"
     )

View File

@@ -343,6 +343,14 @@ def ru_lemmatizer():
     return get_lang_class("ru")().add_pipe("lemmatizer")


+@pytest.fixture
+def ru_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    return get_lang_class("ru")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def sa_tokenizer():
     return get_lang_class("sa")().tokenizer
@@ -422,6 +430,15 @@ def uk_lemmatizer():
     return get_lang_class("uk")().add_pipe("lemmatizer")


+@pytest.fixture
+def uk_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy2_dicts_uk")
+    return get_lang_class("uk")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def ur_tokenizer():
     return get_lang_class("ur")().tokenizer

View File

@@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
     doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
+
+
+def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
+    words = ["мама", "мыла", "раму"]
+    pos = ["NOUN", "VERB", "NOUN"]
+    morphs = [
+        "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
+        "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
+        "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
+    ]
+    doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
+    doc = ru_lookup_lemmatizer(doc)
+    lemmas = [token.lemma_ for token in doc]
+    assert lemmas == ["мама", "мыла", "раму"]

View File

@@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
     uk_lemmatizer(doc)
+    assert [token.lemma for token in doc]
+
+
+def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
+    """Check that the lookup uk lemmatizer runs."""
+    doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
+    uk_lookup_lemmatizer(doc)
+    assert [token.lemma for token in doc]

View File

@@ -164,6 +164,9 @@ examples, see the
 Apply the pipeline to some text. The text can span multiple sentences, and can
 contain arbitrary whitespace. Alignment into the original string is preserved.

+Instead of text, a `Doc` can be passed as input, in which case tokenization is
+skipped, but the rest of the pipeline is run.
+
 > #### Example
 >
 > ```python
@@ -173,7 +176,7 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
 | Name | Description |
 | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
-| `text` | The text to be processed. ~~str~~ |
+| `text` | The text to be processed, or a Doc. ~~Union[str, Doc]~~ |
 | _keyword-only_ | |
 | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
 | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
@@ -184,6 +187,9 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
 Process texts as a stream, and yield `Doc` objects in order. This is usually
 more efficient than processing texts one-by-one.

+Instead of text, a `Doc` object can be passed as input. In this case
+tokenization is skipped but the rest of the pipeline is run.
+
 > #### Example
 >
 > ```python
@@ -194,7 +200,7 @@ more efficient than processing texts one-by-one.
 | Name | Description |
 | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `texts` | A sequence of strings. ~~Iterable[str]~~ |
+| `texts` | A sequence of strings (or `Doc` objects). ~~Iterable[Union[str, Doc]]~~ |
 | _keyword-only_ | |
 | `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
 | `batch_size` | The number of texts to buffer. ~~Optional[int]~~ |
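The two additions document the same behaviour for `nlp()` and `nlp.pipe()`. A short sketch of what that looks like in practice, assuming a spaCy version that includes this change (the model name is illustrative):

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.load("en_core_web_sm")  # any trained pipeline works here
words = ["Berlin", "is", "a", "city"]

# Passing a pre-built Doc skips tokenization; tagger, parser, ner etc. still run.
doc = nlp(Doc(nlp.vocab, words=words))
print([(t.text, t.pos_) for t in doc])

# The same applies to streams of Doc objects.
for doc in nlp.pipe([Doc(nlp.vocab, words=words)]):
    print(doc.ents)
```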

View File

@@ -1,5 +1,62 @@
 {
     "resources": [
+        {
+            "id": "Zshot",
+            "title": "Zshot",
+            "slogan": "Zero and Few shot named entity & relationships recognition",
+            "github": "ibm/zshot",
+            "pip": "zshot",
+            "code_example": [
+                "import spacy",
+                "from zshot import PipelineConfig, displacy",
+                "from zshot.linker import LinkerRegen",
+                "from zshot.mentions_extractor import MentionsExtractorSpacy",
+                "from zshot.utils.data_models import Entity",
+                "",
+                "nlp = spacy.load('en_core_web_sm')",
+                "# zero shot definition of entities",
+                "nlp_config = PipelineConfig(",
+                " mentions_extractor=MentionsExtractorSpacy(),",
+                " linker=LinkerRegen(),",
+                " entities=[",
+                " Entity(name='Paris',",
+                " description='Paris is located in northern central France, in a north-bending arc of the river Seine'),",
+                " Entity(name='IBM',",
+                " description='International Business Machines Corporation (IBM) is an American multinational technology corporation headquartered in Armonk, New York'),",
+                " Entity(name='New York', description='New York is a city in U.S. state'),",
+                " Entity(name='Florida', description='southeasternmost U.S. state'),",
+                " Entity(name='American',",
+                " description='American, something of, from, or related to the United States of America, commonly known as the United States or America'),",
+                " Entity(name='Chemical formula',",
+                " description='In chemistry, a chemical formula is a way of presenting information about the chemical proportions of atoms that constitute a particular chemical compound or molecul'),",
+                " Entity(name='Acetamide',",
+                " description='Acetamide (systematic name: ethanamide) is an organic compound with the formula CH3CONH2. It is the simplest amide derived from acetic acid. It finds some use as a plasticizer and as an industrial solvent.'),",
+                " Entity(name='Armonk',",
+                " description='Armonk is a hamlet and census-designated place (CDP) in the town of North Castle, located in Westchester County, New York, United States.'),",
+                " Entity(name='Acetic Acid',",
+                " description='Acetic acid, systematically named ethanoic acid, is an acidic, colourless liquid and organic compound with the chemical formula CH3COOH'),",
+                " Entity(name='Industrial solvent',",
+                " description='Acetamide (systematic name: ethanamide) is an organic compound with the formula CH3CONH2. It is the simplest amide derived from acetic acid. It finds some use as a plasticizer and as an industrial solvent.'),",
+                " ]",
+                ")",
+                "nlp.add_pipe('zshot', config=nlp_config, last=True)",
+                "",
+                "text = 'International Business Machines Corporation (IBM) is an American multinational technology corporation' \\",
+                " ' headquartered in Armonk, New York, with operations in over 171 countries.'",
+                "",
+                "doc = nlp(text)",
+                "displacy.serve(doc, style='ent')"
+            ],
+            "thumb": "https://ibm.github.io/zshot/img/graph.png",
+            "url": "https://ibm.github.io/zshot/",
+            "author": "IBM Research",
+            "author_links": {
+                "github": "ibm",
+                "twitter": "IBMResearch",
+                "website": "https://research.ibm.com/labs/ireland/"
+            },
+            "category": ["scientific", "models", "research"]
+        },
         {
             "id": "concepcy",
             "title": "concepCy",
@@ -2403,20 +2460,20 @@
                 "import spacy",
                 "from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
                 "",
-                "# Load an spacy model (supported models are \"es\" and \"en\") ",
-                "nlp = spacy.load('en')",
-                "# Spacy 3.x",
-                "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})",
-                "# Spacy 2.x",
+                "# Load a spaCy model (supported languages are \"es\" and \"en\") ",
+                "nlp = spacy.load('en_core_web_sm')",
+                "# spaCy 3.x",
+                "nlp.add_pipe(\"spacy_wordnet\", after='tagger')",
+                "# spaCy 2.x",
                 "# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
                 "token = nlp('prices')[0]",
                 "",
-                "# wordnet object link spacy token with nltk wordnet interface by giving acces to",
+                "# WordNet object links spaCy token with NLTK WordNet interface by giving access to",
                 "# synsets and lemmas ",
                 "token._.wordnet.synsets()",
                 "token._.wordnet.lemmas()",
                 "",
-                "# And automatically tags with wordnet domains",
+                "# And automatically add info about WordNet domains",
                 "token._.wordnet.wordnet_domains()"
             ],
             "author": "recognai",