Merge branch 'master' into fix/windows-quoting

Commit fb1d671ed4
.github/ISSUE_TEMPLATE/01_bugs.md

@@ -10,7 +10,7 @@ about: Use this template if you came across a bug or unexpected behaviour differ
 <!-- Include a code example or the steps that led to the problem. Please try to be as specific as possible. -->

 ## Your Environment

-<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
+<!-- Include details of your environment. You can also type `python -m spacy info --markdown` and copy-paste the result here.-->

 * Operating System:
 * Python Version Used:
 * spaCy Version Used:
.github/azure-steps.yml

@@ -27,7 +27,7 @@ steps:

  - script: python -m mypy spacy
    displayName: 'Run mypy'
-   condition: ne(variables['python_version'], '3.10')
+   condition: ne(variables['python_version'], '3.6')

  - task: DeleteFiles@1
    inputs:
@@ -6,7 +6,7 @@ repos:
        language_version: python3.7
        additional_dependencies: ['click==8.0.4']
  - repo: https://gitlab.com/pycqa/flake8
-   rev: 3.9.2
+   rev: 5.0.4
    hooks:
      - id: flake8
        args:
@@ -31,7 +31,7 @@ jobs:
      inputs:
        versionSpec: "3.7"
    - script: |
-       pip install flake8==3.9.2
+       pip install flake8==5.0.4
        python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
      displayName: "flake8"
@@ -15,7 +15,7 @@ pathy>=0.3.5
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
 jinja2
 langcodes>=3.2.0,<4.0.0
 # Official Python utilities
@@ -28,9 +28,9 @@ cython>=0.25,<3.0
 pytest>=5.2.0,!=7.1.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.8.0,<3.10.0
+flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.910,<0.970; platform_machine!='aarch64'
+mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
 types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
@@ -56,7 +56,7 @@ install_requires =
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
-    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
     jinja2
     # Official Python utilities
     setuptools
@@ -573,3 +573,12 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
         local_msg.info("Using CPU")
         if gpu_is_available():
             local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
+
+
+def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
+    """Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
+    as happens with `round(number, ndigits)`"""
+    if isinstance(number, float):
+        return f"{number:.{ndigits}f}"
+    else:
+        return str(number)
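For reference, a quick illustration (not part of the diff) of why the helper formats floats with an f-string rather than `round()`: once a rounded float is stringified, trailing zeros disappear and table columns lose their fixed width.

# Illustrative comparison only.
value = 2.0
print(str(round(value, 2)))   # "2.0"  -> trailing zero is lost
print(f"{value:.2f}")          # "2.00" -> fixed precision, as the helper returns
print(str(round(2.456, 2)))   # "2.46"
print(f"{2.456:.2f}")          # "2.46"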
@@ -9,7 +9,7 @@ import typer
 import math

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli
+from ._util import import_code, debug_cli, _format_number
 from ..training import Example, remove_bilu_prefix
 from ..training.initialize import get_sourced_components
 from ..schemas import ConfigSchemaTraining
@@ -989,7 +989,8 @@ def _get_kl_divergence(p: Counter, q: Counter) -> float:
 def _format_span_row(span_data: List[Dict], labels: List[str]) -> List[Any]:
     """Compile into one list for easier reporting"""
     d = {
-        label: [label] + list(round(d[label], 2) for d in span_data) for label in labels
+        label: [label] + list(_format_number(d[label]) for d in span_data)
+        for label in labels
     }
     return list(d.values())
@@ -1004,6 +1005,10 @@ def _get_span_characteristics(
         label: _gmean(l)
         for label, l in compiled_gold["spans_length"][spans_key].items()
     }
+    spans_per_type = {
+        label: len(spans)
+        for label, spans in compiled_gold["spans_per_type"][spans_key].items()
+    }
     min_lengths = [min(l) for l in compiled_gold["spans_length"][spans_key].values()]
     max_lengths = [max(l) for l in compiled_gold["spans_length"][spans_key].values()]
@@ -1031,6 +1036,7 @@ def _get_span_characteristics(
     return {
         "sd": span_distinctiveness,
         "bd": sb_distinctiveness,
+        "spans_per_type": spans_per_type,
         "lengths": span_length,
         "min_length": min(min_lengths),
         "max_length": max(max_lengths),
@@ -1045,12 +1051,15 @@ def _get_span_characteristics(


 def _print_span_characteristics(span_characteristics: Dict[str, Any]):
     """Print all span characteristics into a table"""
-    headers = ("Span Type", "Length", "SD", "BD")
+    headers = ("Span Type", "Length", "SD", "BD", "N")
+    # Wasabi has this at 30 by default, but we might have some long labels
+    max_col = max(30, max(len(label) for label in span_characteristics["labels"]))
     # Prepare table data with all span characteristics
     table_data = [
         span_characteristics["lengths"],
         span_characteristics["sd"],
         span_characteristics["bd"],
+        span_characteristics["spans_per_type"],
     ]
     table = _format_span_row(
         span_data=table_data, labels=span_characteristics["labels"]
@@ -1061,8 +1070,18 @@ def _print_span_characteristics(span_characteristics: Dict[str, Any]):
         span_characteristics["avg_sd"],
         span_characteristics["avg_bd"],
     ]
-    footer = ["Wgt. Average"] + [str(round(f, 2)) for f in footer_data]
-    msg.table(table, footer=footer, header=headers, divider=True)
+    footer = (
+        ["Wgt. Average"] + ["{:.2f}".format(round(f, 2)) for f in footer_data] + ["-"]
+    )
+    msg.table(
+        table,
+        footer=footer,
+        header=headers,
+        divider=True,
+        aligns=["l"] + ["r"] * (len(footer_data) + 1),
+        max_col=max_col,
+    )


 def _get_spans_length_freq_dist(
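A minimal sketch of how the new table call renders with wasabi; the row and footer values below are invented, and only the keyword arguments (`header`, `footer`, `divider`, `aligns`) mirror the code above.

# Illustrative only: a wasabi table with the new "N" column and
# right-aligned numeric columns. Values are made up for the example.
from wasabi import Printer

msg = Printer()
headers = ("Span Type", "Length", "SD", "BD", "N")
rows = [
    ["PERSON", "1.82", "0.91", "0.75", "120"],
    ["ORG", "2.10", "0.88", "0.71", "87"],
]
footer = ["Wgt. Average", "1.94", "0.90", "0.73", "-"]
msg.table(rows, footer=footer, header=headers, divider=True, aligns=["l", "r", "r", "r", "r"])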
@@ -299,8 +299,8 @@ def get_meta(
     }
     nlp = util.load_model_from_path(Path(model_path))
     meta.update(nlp.meta)
-    meta.update(existing_meta)
     meta["spacy_version"] = util.get_minor_version_range(about.__version__)
+    meta.update(existing_meta)
     meta["vectors"] = {
         "width": nlp.vocab.vectors_length,
         "vectors": len(nlp.vocab.vectors),
@@ -110,9 +110,6 @@ def update_dvc_config(
     config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}

     # some flags that apply to every command
-    if verbose and quiet:
-        # don't allow contradictions
-        msg.fail("Can't set both --verbose and --quiet", exits=1)
     flags = []
     if verbose:
         flags.append("--verbose")
@@ -23,7 +23,7 @@ class RussianLemmatizer(Lemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:
@@ -18,7 +18,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:
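Both lemmatizers now also accept the `pymorphy2_lookup` mode at this import check. A minimal usage sketch, assuming the `pymorphy2` package is installed (Ukrainian additionally needs `pymorphy2-dicts-uk`); it mirrors the test fixtures added further down.

# Illustrative usage of the "pymorphy2_lookup" mode covered by this change.
import spacy

nlp = spacy.blank("ru")
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})
doc = nlp("мама мыла раму")
print([token.lemma_ for token in doc])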
@@ -1,7 +1,6 @@
 from typing import cast, Any, Callable, Dict, Iterable, List, Optional
-from typing import Sequence, Tuple, Union
+from typing import Tuple
 from collections import Counter
-from copy import deepcopy
 from itertools import islice
 import numpy as np
@@ -149,9 +148,7 @@ class EditTreeLemmatizer(TrainablePipe):
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             n_labels = len(self.cfg["labels"])
-            guesses: List[Ints2d] = [
-                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
-            ]
+            guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
             assert len(guesses) == n_docs
             return guesses
         scores = self.model.predict(docs)
@@ -1,6 +1,5 @@
-import warnings
 from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
-from typing import cast
+import warnings
 from collections import defaultdict
 from pathlib import Path
 import srsly
@@ -317,7 +316,7 @@ class EntityRuler(Pipe):
                 phrase_pattern["id"] = ent_id
             phrase_patterns.append(phrase_pattern)
         for entry in token_patterns + phrase_patterns:  # type: ignore[operator]
-            label = entry["label"]
+            label = entry["label"]  # type: ignore
             if "id" in entry:
                 ent_label = label
                 label = self._create_label(label, entry["id"])
@@ -133,6 +133,9 @@ def make_spancat(
     spans_key (str): Key of the doc.spans dict to save the spans under. During
         initialization and training, the component will look for spans on the
         reference document under the same key.
+    scorer (Optional[Callable]): The scoring method. Defaults to
+        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
+        spans allowed.
     threshold (float): Minimum probability to consider a prediction positive.
         Spans with a positive prediction will be saved on the Doc. Defaults to
         0.5.
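A minimal configuration sketch for the component whose docstring gains the `scorer` entry above; the `spans_key` and `threshold` values are illustrative.

# Illustrative only: adding a span categorizer with an explicit spans_key
# and threshold (the parameters described in the docstring above).
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("spancat", config={"spans_key": "sc", "threshold": 0.5})
# After training, predicted spans are stored under doc.spans["sc"].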
@@ -96,8 +96,8 @@ def make_multilabel_textcat(
     model: Model[List[Doc], List[Floats2d]],
     threshold: float,
     scorer: Optional[Callable],
-) -> "TextCategorizer":
-    """Create a TextCategorizer component. The text categorizer predicts categories
+) -> "MultiLabel_TextCategorizer":
+    """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
     to be non-mutually exclusive, which means that there can be zero or more labels
     per doc).
@@ -105,6 +105,7 @@ def make_multilabel_textcat(
     model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
         scores for each category.
     threshold (float): Cutoff to consider a prediction "positive".
+    scorer (Optional[Callable]): The scoring method.
     """
     return MultiLabel_TextCategorizer(
         nlp.vocab, model, name, threshold=threshold, scorer=scorer
@@ -147,6 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
+        scorer (Optional[Callable]): The scoring method.

         DOCS: https://spacy.io/api/textcategorizer#init
         """
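For context, a small sketch of the factory these docstring and annotation fixes touch; the labels are invented for the example.

# Illustrative only: the multilabel textcat factory returns a
# MultiLabel_TextCategorizer, as the corrected annotation states.
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat_multilabel", config={"threshold": 0.5})
textcat.add_label("POSITIVE")
textcat.add_label("URGENT")
print(type(textcat).__name__)  # MultiLabel_TextCategorizer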
@@ -181,12 +181,12 @@ class TokenPatternNumber(BaseModel):
     IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
-    EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
-    NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
-    GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
-    LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
-    GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
-    LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
+    EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
+    NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
+    GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
+    LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
+    GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
+    LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")

     class Config:
         extra = "forbid"
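These fields back the numeric comparison operators in `Matcher` token patterns; a short sketch of the kind of pattern they validate (the pattern and text are illustrative).

# Illustrative only: the ==, >=, <= etc. aliases above validate numeric
# comparisons in Matcher token patterns like this one.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Match tokens whose length is at least 10 characters.
matcher.add("LONG_TOKEN", [[{"LENGTH": {">=": 10}}]])
doc = nlp("spaCy has configurable tokenization")
print([doc[start:end].text for _, start, end in matcher(doc)])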
@@ -430,7 +430,7 @@ class ProjectConfigAssetURL(BaseModel):
     # fmt: off
     dest: StrictStr = Field(..., title="Destination of downloaded asset")
     url: Optional[StrictStr] = Field(None, title="URL of asset")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: StrictStr = Field("", title="Description of asset")
     # fmt: on
@@ -438,7 +438,7 @@ class ProjectConfigAssetURL(BaseModel):
 class ProjectConfigAssetGit(BaseModel):
     # fmt: off
     git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: Optional[StrictStr] = Field(None, title="Description of asset")
     # fmt: on
@@ -508,9 +508,9 @@ class DocJSONSchema(BaseModel):
         None, title="Indices of sentences' start and end indices"
     )
     text: StrictStr = Field(..., title="Document text")
-    spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
-        None, title="Span information - end/start indices, label, KB ID"
-    )
+    spans: Optional[
+        Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
+    ] = Field(None, title="Span information - end/start indices, label, KB ID")
     tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
         ..., title="Token information - ID, start, annotations"
     )
@@ -343,6 +343,14 @@ def ru_lemmatizer():
     return get_lang_class("ru")().add_pipe("lemmatizer")


+@pytest.fixture
+def ru_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    return get_lang_class("ru")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def sa_tokenizer():
     return get_lang_class("sa")().tokenizer
@@ -422,6 +430,15 @@ def uk_lemmatizer():
     return get_lang_class("uk")().add_pipe("lemmatizer")


+@pytest.fixture
+def uk_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy2_dicts_uk")
+    return get_lang_class("uk")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def ur_tokenizer():
     return get_lang_class("ur")().tokenizer
@@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
     doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
+
+
+def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
+    words = ["мама", "мыла", "раму"]
+    pos = ["NOUN", "VERB", "NOUN"]
+    morphs = [
+        "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
+        "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
+        "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
+    ]
+    doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
+    doc = ru_lookup_lemmatizer(doc)
+    lemmas = [token.lemma_ for token in doc]
+    assert lemmas == ["мама", "мыла", "раму"]
@@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
     uk_lemmatizer(doc)
+    assert [token.lemma for token in doc]
+
+
+def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
+    """Check that the lookup uk lemmatizer runs."""
+    doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
+    uk_lookup_lemmatizer(doc)
+    assert [token.lemma for token in doc]
@@ -164,6 +164,9 @@ examples, see the
 Apply the pipeline to some text. The text can span multiple sentences, and can
 contain arbitrary whitespace. Alignment into the original string is preserved.

+Instead of text, a `Doc` can be passed as input, in which case tokenization is
+skipped, but the rest of the pipeline is run.
+
 > #### Example
 >
 > ```python
@@ -173,7 +176,7 @@ contain arbitrary whitespace. Alignment into the original string is preserved.

 | Name            | Description                                                                                                                                      |
 | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `text`          | The text to be processed. ~~str~~                                                                                                                |
+| `text`          | The text to be processed, or a Doc. ~~Union[str, Doc]~~                                                                                          |
 | _keyword-only_  |                                                                                                                                                  |
 | `disable`       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~                                                  |
 | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~  |
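A small sketch of the behaviour the updated row describes (assumes the `en_core_web_sm` package is installed; the words are illustrative).

# Illustrative only: calling the pipeline on a pre-built Doc skips
# tokenization but still applies the remaining pipeline components.
import spacy
from spacy.tokens import Doc

nlp = spacy.load("en_core_web_sm")
doc = Doc(nlp.vocab, words=["Berlin", "is", "a", "city"])
doc = nlp(doc)  # tagger, parser, ner etc. are applied to the existing tokens
print([(ent.text, ent.label_) for ent in doc.ents])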
@@ -184,6 +187,9 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
 Process texts as a stream, and yield `Doc` objects in order. This is usually
 more efficient than processing texts one-by-one.

+Instead of text, a `Doc` object can be passed as input. In this case
+tokenization is skipped but the rest of the pipeline is run.
+
 > #### Example
 >
 > ```python
@@ -194,7 +200,7 @@ more efficient than processing texts one-by-one.

 | Name           | Description                                                                                                                                                          |
 | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `texts`        | A sequence of strings. ~~Iterable[str]~~                                                                                                                             |
+| `texts`        | A sequence of strings (or `Doc` objects). ~~Iterable[Union[str, Doc]]~~                                                                                              |
 | _keyword-only_ |                                                                                                                                                                      |
 | `as_tuples`    | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~  |
 | `batch_size`   | The number of texts to buffer. ~~Optional[int]~~                                                                                                                     |
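And the streaming counterpart for `nlp.pipe` (same assumptions as above; the pre-built `Doc` objects are illustrative).

# Illustrative only: nlp.pipe also accepts pre-built Doc objects, for which
# tokenization is skipped while the rest of the pipeline still runs.
import spacy
from spacy.tokens import Doc

nlp = spacy.load("en_core_web_sm")
docs = [
    Doc(nlp.vocab, words=["Already", "tokenized", "text"]),
    Doc(nlp.vocab, words=["Another", "pre-built", "doc"]),
]
for doc in nlp.pipe(docs):
    print(doc.text, [t.pos_ for t in doc])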
@@ -1,5 +1,62 @@
 {
     "resources": [
+        {
+            "id": "Zshot",
+            "title": "Zshot",
+            "slogan": "Zero and Few shot named entity & relationships recognition",
+            "github": "ibm/zshot",
+            "pip": "zshot",
+            "code_example": [
+                "import spacy",
+                "from zshot import PipelineConfig, displacy",
+                "from zshot.linker import LinkerRegen",
+                "from zshot.mentions_extractor import MentionsExtractorSpacy",
+                "from zshot.utils.data_models import Entity",
+                "",
+                "nlp = spacy.load('en_core_web_sm')",
+                "# zero shot definition of entities",
+                "nlp_config = PipelineConfig(",
+                " mentions_extractor=MentionsExtractorSpacy(),",
+                " linker=LinkerRegen(),",
+                " entities=[",
+                " Entity(name='Paris',",
+                " description='Paris is located in northern central France, in a north-bending arc of the river Seine'),",
+                " Entity(name='IBM',",
+                " description='International Business Machines Corporation (IBM) is an American multinational technology corporation headquartered in Armonk, New York'),",
+                " Entity(name='New York', description='New York is a city in U.S. state'),",
+                " Entity(name='Florida', description='southeasternmost U.S. state'),",
+                " Entity(name='American',",
+                " description='American, something of, from, or related to the United States of America, commonly known as the United States or America'),",
+                " Entity(name='Chemical formula',",
+                " description='In chemistry, a chemical formula is a way of presenting information about the chemical proportions of atoms that constitute a particular chemical compound or molecul'),",
+                " Entity(name='Acetamide',",
+                " description='Acetamide (systematic name: ethanamide) is an organic compound with the formula CH3CONH2. It is the simplest amide derived from acetic acid. It finds some use as a plasticizer and as an industrial solvent.'),",
+                " Entity(name='Armonk',",
+                " description='Armonk is a hamlet and census-designated place (CDP) in the town of North Castle, located in Westchester County, New York, United States.'),",
+                " Entity(name='Acetic Acid',",
+                " description='Acetic acid, systematically named ethanoic acid, is an acidic, colourless liquid and organic compound with the chemical formula CH3COOH'),",
+                " Entity(name='Industrial solvent',",
+                " description='Acetamide (systematic name: ethanamide) is an organic compound with the formula CH3CONH2. It is the simplest amide derived from acetic acid. It finds some use as a plasticizer and as an industrial solvent.'),",
+                " ]",
+                ")",
+                "nlp.add_pipe('zshot', config=nlp_config, last=True)",
+                "",
+                "text = 'International Business Machines Corporation (IBM) is an American multinational technology corporation' \\",
+                " ' headquartered in Armonk, New York, with operations in over 171 countries.'",
+                "",
+                "doc = nlp(text)",
+                "displacy.serve(doc, style='ent')"
+            ],
+            "thumb": "https://ibm.github.io/zshot/img/graph.png",
+            "url": "https://ibm.github.io/zshot/",
+            "author": "IBM Research",
+            "author_links": {
+                "github": "ibm",
+                "twitter": "IBMResearch",
+                "website": "https://research.ibm.com/labs/ireland/"
+            },
+            "category": ["scientific", "models", "research"]
+        },
         {
             "id": "concepcy",
             "title": "concepCy",
@@ -2403,20 +2460,20 @@
             "import spacy",
             "from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
             "",
-            "# Load an spacy model (supported models are \"es\" and \"en\") ",
-            "nlp = spacy.load('en')",
-            "# Spacy 3.x",
-            "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})",
-            "# Spacy 2.x",
+            "# Load a spaCy model (supported languages are \"es\" and \"en\") ",
+            "nlp = spacy.load('en_core_web_sm')",
+            "# spaCy 3.x",
+            "nlp.add_pipe(\"spacy_wordnet\", after='tagger')",
+            "# spaCy 2.x",
             "# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
             "token = nlp('prices')[0]",
             "",
-            "# wordnet object link spacy token with nltk wordnet interface by giving acces to",
+            "# WordNet object links spaCy token with NLTK WordNet interface by giving access to",
             "# synsets and lemmas ",
             "token._.wordnet.synsets()",
            "token._.wordnet.lemmas()",
             "",
-            "# And automatically tags with wordnet domains",
+            "# And automatically add info about WordNet domains",
             "token._.wordnet.wordnet_domains()"
         ],
         "author": "recognai",