Mirror of https://github.com/explosion/spaCy.git

Commit 34e8bc620d: Merge branch 'master' into feature/etl
.github/azure-steps.yml (vendored) | 1

@@ -10,6 +10,7 @@ steps:
     inputs:
       versionSpec: ${{ parameters.python_version }}
       architecture: ${{ parameters.architecture }}
+      allowUnstable: true

   - bash: |
       echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
@@ -85,6 +85,15 @@ jobs:
         Python310Mac:
           imageName: "macos-latest"
           python.version: "3.10"
+        Python311Linux:
+          imageName: 'ubuntu-latest'
+          python.version: '3.11.0-rc.2'
+        Python311Windows:
+          imageName: 'windows-latest'
+          python.version: '3.11.0-rc.2'
+        Python311Mac:
+          imageName: 'macos-latest'
+          python.version: '3.11.0-rc.2'
       maxParallel: 4
     pool:
       vmImage: $(imageName)
@@ -15,7 +15,7 @@ pathy>=0.3.5
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
 jinja2
 langcodes>=3.2.0,<4.0.0
 # Official Python utilities
@@ -56,7 +56,7 @@ install_requires =
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
-    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
     jinja2
     # Official Python utilities
     setuptools
setup.py | 4

@@ -30,7 +30,9 @@ MOD_NAMES = [
     "spacy.lexeme",
     "spacy.vocab",
     "spacy.attrs",
-    "spacy.kb",
+    "spacy.kb.candidate",
+    "spacy.kb.kb",
+    "spacy.kb.kb_in_memory",
     "spacy.ml.parser_model",
     "spacy.morphology",
     "spacy.pipeline.dep_parser",
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.4.1"
+__version__ = "3.4.2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
@@ -25,6 +25,7 @@ def project_update_dvc_cli(
     project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
     workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
     force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
     # fmt: on
 ):
@@ -36,7 +37,7 @@ def project_update_dvc_cli(

     DOCS: https://spacy.io/api/cli#project-dvc
     """
-    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
+    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)


 def project_update_dvc(
@@ -44,6 +45,7 @@ def project_update_dvc(
     workflow: Optional[str] = None,
     *,
     verbose: bool = False,
+    quiet: bool = False,
     force: bool = False,
 ) -> None:
     """Update the auto-generated Data Version Control (DVC) config file. A DVC
@@ -54,11 +56,12 @@ def project_update_dvc(
     workflow (Optional[str]): Optional name of workflow defined in project.yml.
         If not set, the first workflow will be used.
     verbose (bool): Print more info.
+    quiet (bool): Print less info.
     force (bool): Force update DVC config.
     """
     config = load_project_config(project_dir)
     updated = update_dvc_config(
-        project_dir, config, workflow, verbose=verbose, force=force
+        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
     )
     help_msg = "To execute the workflow with DVC, run: dvc repro"
     if updated:
@@ -72,7 +75,7 @@ def update_dvc_config(
     config: Dict[str, Any],
     workflow: Optional[str] = None,
     verbose: bool = False,
-    silent: bool = False,
+    quiet: bool = False,
     force: bool = False,
 ) -> bool:
     """Re-run the DVC commands in dry mode and update dvc.yaml file in the
@@ -83,7 +86,7 @@ def update_dvc_config(
     path (Path): The path to the project directory.
     config (Dict[str, Any]): The loaded project.yml.
     verbose (bool): Whether to print additional info (via DVC).
-    silent (bool): Don't output anything (via DVC).
+    quiet (bool): Don't output anything (via DVC).
     force (bool): Force update, even if hashes match.
     RETURNS (bool): Whether the DVC config file was updated.
     """
@@ -105,6 +108,14 @@ def update_dvc_config(
         dvc_config_path.unlink()
     dvc_commands = []
     config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+
+    # some flags that apply to every command
+    flags = []
+    if verbose:
+        flags.append("--verbose")
+    if quiet:
+        flags.append("--quiet")
+
     for name in workflows[workflow]:
         command = config_commands[name]
         deps = command.get("deps", [])
@@ -118,14 +129,26 @@ def update_dvc_config(
         deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
         outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
         outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
-        dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
+        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
         if command.get("no_skip"):
             dvc_cmd.append("--always-changed")
         full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
         dvc_commands.append(join_command(full_cmd))
+
+    if not dvc_commands:
+        # If we don't check for this, then there will be an error when reading the
+        # config, since DVC wouldn't create it.
+        msg.fail(
+            "No usable commands for DVC found. This can happen if none of your "
+            "commands have dependencies or outputs.",
+            exits=1,
+        )
+
     with working_dir(path):
-        dvc_flags = {"--verbose": verbose, "--quiet": silent}
-        run_dvc_commands(dvc_commands, flags=dvc_flags)
+        for c in dvc_commands:
+            dvc_command = "dvc " + c
+            run_command(dvc_command)
     with dvc_config_path.open("r+", encoding="utf8") as f:
         content = f.read()
         f.seek(0, 0)
@@ -133,26 +156,6 @@ def update_dvc_config(
     return True


-def run_dvc_commands(
-    commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
-) -> None:
-    """Run a sequence of DVC commands in a subprocess, in order.
-
-    commands (List[str]): The string commands without the leading "dvc".
-    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
-        easier to pass flags like --quiet that depend on a variable or
-        command-line setting while avoiding lots of nested conditionals.
-    """
-    for c in commands:
-        command = split_command(c)
-        dvc_command = ["dvc", *command]
-        # Add the flags if they are set to True
-        for flag, is_active in flags.items():
-            if is_active:
-                dvc_command.append(flag)
-        run_command(dvc_command)
-
-
 def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
     """Validate workflows provided in project.yml and check that a given
     workflow can be used to generate a DVC config.
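A minimal usage sketch of the new option (not part of the diff); the module path and keyword arguments follow the signatures shown in the hunks above.

    # Assumes a spaCy install that already contains this change and a project.yml
    # in the current working directory.
    from pathlib import Path
    from spacy.cli.project.dvc import project_update_dvc

    # Roughly equivalent to the CLI call: python -m spacy project dvc . --quiet
    # --quiet is forwarded to every generated "dvc run" command via the new flags list.
    project_update_dvc(Path.cwd(), workflow=None, verbose=False, quiet=True, force=False)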
@@ -540,6 +540,8 @@ class Errors(metaclass=ErrorsWithCodes):
     E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
     E200 = ("Can't set {attr} from Span.")
     E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
+    E203 = ("If the {name} embedding layer is not updated "
+            "during training, make sure to include it in 'annotating components'")

     # New errors added in v3.x
     E853 = ("Unsupported component factory name '{name}'. The character '.' is "
@@ -711,9 +713,9 @@ class Errors(metaclass=ErrorsWithCodes):
             "`nlp.enable_pipe` instead.")
     E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
             "property or default function argument?")
-    E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
+    E928 = ("An InMemoryLookupKB can only be serialized to/from from a directory, "
             "but the provided argument {loc} points to a file.")
-    E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
+    E929 = ("Couldn't read InMemoryLookupKB from {loc}. The path does not seem to exist.")
     E930 = ("Received invalid get_examples callback in `{method}`. "
             "Expected function that returns an iterable of Example objects but "
             "got: {obj}")
@@ -944,8 +946,14 @@ class Errors(metaclass=ErrorsWithCodes):
             "case pass an empty list for the previously not specified argument to avoid this error.")
     E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
              "{value}.")
-    E1044 = ("Search characters for '{label}' may not contain upper-case chars where case_sensitive==False.")
-    E1045 = ("Invalid rich group config '{label}'.")
+    E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
+    E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
+             "method in '{name}'. If you want to use this method, make "
+             "sure it's overwritten on the subclass.")
+    E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
+             "knowledge base, use `InMemoryLookupKB`.")
+    E1047 = ("Search characters for '{label}' may not contain upper-case chars where case_sensitive==False.")
+    E1048 = ("Invalid rich group config '{label}'.")


     # Deprecated model shortcuts, only used in errors and warnings
spacy/kb/__init__.py (new file) | 3

@@ -0,0 +1,3 @@
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, get_candidates, get_candidates_batch
spacy/kb/candidate.pxd (new file) | 12

@@ -0,0 +1,12 @@
from .kb cimport KnowledgeBase
from libcpp.vector cimport vector
from ..typedefs cimport hash_t

# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
    cdef readonly KnowledgeBase kb
    cdef hash_t entity_hash
    cdef float entity_freq
    cdef vector[float] entity_vector
    cdef hash_t alias_hash
    cdef float prior_prob
spacy/kb/candidate.pyx (new file) | 74

@@ -0,0 +1,74 @@
# cython: infer_types=True, profile=True

from typing import Iterable
from .kb cimport KnowledgeBase
from ..tokens import Span

cdef class Candidate:
    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
    to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
    algorithm which will disambiguate the various candidates to the correct one.
    Each candidate (alias, entity) pair is assigned a certain prior probability.

    DOCS: https://spacy.io/api/kb/#candidate-init
    """

    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
        self.kb = kb
        self.entity_hash = entity_hash
        self.entity_freq = entity_freq
        self.entity_vector = entity_vector
        self.alias_hash = alias_hash
        self.prior_prob = prior_prob

    @property
    def entity(self) -> int:
        """RETURNS (uint64): hash of the entity's KB ID/name"""
        return self.entity_hash

    @property
    def entity_(self) -> str:
        """RETURNS (str): ID/name of this entity in the KB"""
        return self.kb.vocab.strings[self.entity_hash]

    @property
    def alias(self) -> int:
        """RETURNS (uint64): hash of the alias"""
        return self.alias_hash

    @property
    def alias_(self) -> str:
        """RETURNS (str): ID of the original alias"""
        return self.kb.vocab.strings[self.alias_hash]

    @property
    def entity_freq(self) -> float:
        return self.entity_freq

    @property
    def entity_vector(self) -> Iterable[float]:
        return self.entity_vector

    @property
    def prior_prob(self) -> float:
        return self.prior_prob


def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
    """
    Return candidate entities for a given mention and fetching appropriate entries from the index.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Span): Entity mention for which to identify candidates.
    RETURNS (Iterable[Candidate]): Identified candidates.
    """
    return kb.get_candidates(mention)


def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
    """
    Return candidate entities for the given mentions and fetching appropriate entries from the index.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Iterable[Span]): Entity mentions for which to identify candidates.
    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
    """
    return kb.get_candidates_batch(mentions)
spacy/kb/kb.pxd (new file) | 10

@@ -0,0 +1,10 @@
"""Knowledge-base for entity or concept linking."""

from cymem.cymem cimport Pool
from libc.stdint cimport int64_t
from ..vocab cimport Vocab

cdef class KnowledgeBase:
    cdef Pool mem
    cdef readonly Vocab vocab
    cdef readonly int64_t entity_vector_length
spacy/kb/kb.pyx (new file) | 108

@@ -0,0 +1,108 @@
# cython: infer_types=True, profile=True

from pathlib import Path
from typing import Iterable, Tuple, Union
from cymem.cymem cimport Pool

from .candidate import Candidate
from ..tokens import Span
from ..util import SimpleFrozenList
from ..errors import Errors


cdef class KnowledgeBase:
    """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
    to support entity linking of named entities to real-world concepts.
    This is an abstract class and requires its operations to be implemented.

    DOCS: https://spacy.io/api/kb
    """

    def __init__(self, vocab: Vocab, entity_vector_length: int):
        """Create a KnowledgeBase."""
        # Make sure abstract KB is not instantiated.
        if self.__class__ == KnowledgeBase:
            raise TypeError(
                Errors.E1046.format(cls_name=self.__class__.__name__)
            )

        self.vocab = vocab
        self.entity_vector_length = entity_vector_length
        self.mem = Pool()

    def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
        """
        Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
        If no candidate is found for a given text, an empty list is returned.
        mentions (Iterable[Span]): Mentions for which to get candidates.
        RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
        """
        return [self.get_candidates(span) for span in mentions]

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        """
        Return candidate entities for specified text. Each candidate defines the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
        If the no candidate is found for a given text, an empty list is returned.
        mention (Span): Mention for which to get candidates.
        RETURNS (Iterable[Candidate]): Identified candidates.
        """
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
        )

    def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
        """
        Return vectors for entities.
        entity (str): Entity name/ID.
        RETURNS (Iterable[Iterable[float]]): Vectors for specified entities.
        """
        return [self.get_vector(entity) for entity in entities]

    def get_vector(self, str entity) -> Iterable[float]:
        """
        Return vector for entity.
        entity (str): Entity name/ID.
        RETURNS (Iterable[float]): Vector for specified entity.
        """
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
        )

    def to_bytes(self, **kwargs) -> bytes:
        """Serialize the current state to a binary string.
        RETURNS (bytes): Current state as binary string.
        """
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
        )

    def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
        """Load state from a binary string.
        bytes_data (bytes): KB state.
        exclude (Tuple[str]): Properties to exclude when restoring KB.
        """
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
        )

    def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
        """
        Write KnowledgeBase content to disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        """
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
        )

    def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
        """
        Load KnowledgeBase content from disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        """
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
        )
@@ -1,14 +1,12 @@
 """Knowledge-base for entity or concept linking."""
-from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from libc.stdio cimport FILE

-from .vocab cimport Vocab
-from .typedefs cimport hash_t
-from .structs cimport KBEntryC, AliasC
+from ..typedefs cimport hash_t
+from ..structs cimport KBEntryC, AliasC
+from .kb cimport KnowledgeBase


 ctypedef vector[KBEntryC] entry_vec
 ctypedef vector[AliasC] alias_vec
@@ -16,21 +14,7 @@ ctypedef vector[float] float_vec
 ctypedef vector[float_vec] float_matrix


-# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
-cdef class Candidate:
-    cdef readonly KnowledgeBase kb
-    cdef hash_t entity_hash
-    cdef float entity_freq
-    cdef vector[float] entity_vector
-    cdef hash_t alias_hash
-    cdef float prior_prob
-
-
-cdef class KnowledgeBase:
-    cdef Pool mem
-    cdef readonly Vocab vocab
-    cdef int64_t entity_vector_length
-
+cdef class InMemoryLookupKB(KnowledgeBase):
     # This maps 64bit keys (hash of unique entity string)
     # to 64bit values (position of the _KBEntryC struct in the _entries vector).
     # The PreshMap is pretty space efficient, as it uses open addressing. So
@@ -1,8 +1,7 @@
 # cython: infer_types=True, profile=True
-from typing import Iterator, Iterable, Callable, Dict, Any
+from typing import Iterable, Callable, Dict, Any, Union

 import srsly
-from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from cpython.exc cimport PyErr_SetFromErrno
 from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
@@ -12,85 +11,28 @@ from libcpp.vector cimport vector
 from pathlib import Path
 import warnings

-from .typedefs cimport hash_t
-from .errors import Errors, Warnings
-from . import util
-from .util import SimpleFrozenList, ensure_path
-
-cdef class Candidate:
-    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
-    to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
-    algorithm which will disambiguate the various candidates to the correct one.
-    Each candidate (alias, entity) pair is assigned to a certain prior probability.
-
-    DOCS: https://spacy.io/api/kb/#candidate_init
-    """
-
-    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
-        self.kb = kb
-        self.entity_hash = entity_hash
-        self.entity_freq = entity_freq
-        self.entity_vector = entity_vector
-        self.alias_hash = alias_hash
-        self.prior_prob = prior_prob
-
-    @property
-    def entity(self):
-        """RETURNS (uint64): hash of the entity's KB ID/name"""
-        return self.entity_hash
-
-    @property
-    def entity_(self):
-        """RETURNS (str): ID/name of this entity in the KB"""
-        return self.kb.vocab.strings[self.entity_hash]
-
-    @property
-    def alias(self):
-        """RETURNS (uint64): hash of the alias"""
-        return self.alias_hash
-
-    @property
-    def alias_(self):
-        """RETURNS (str): ID of the original alias"""
-        return self.kb.vocab.strings[self.alias_hash]
-
-    @property
-    def entity_freq(self):
-        return self.entity_freq
-
-    @property
-    def entity_vector(self):
-        return self.entity_vector
-
-    @property
-    def prior_prob(self):
-        return self.prior_prob
-
-
-def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
-    """
-    Return candidate entities for a given span by using the text of the span as the alias
-    and fetching appropriate entries from the index.
-    This particular function is optimized to work with the built-in KB functionality,
-    but any other custom candidate generation method can be used in combination with the KB as well.
-    """
-    return kb.get_alias_candidates(span.text)
-
-
-cdef class KnowledgeBase:
-    """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
+from ..tokens import Span
+from ..typedefs cimport hash_t
+from ..errors import Errors, Warnings
+from .. import util
+from ..util import SimpleFrozenList, ensure_path
+from ..vocab cimport Vocab
+from .kb cimport KnowledgeBase
+from .candidate import Candidate as Candidate
+
+
+cdef class InMemoryLookupKB(KnowledgeBase):
+    """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
     to support entity linking of named entities to real-world concepts.

-    DOCS: https://spacy.io/api/kb
+    DOCS: https://spacy.io/api/kb_in_memory
     """

     def __init__(self, Vocab vocab, entity_vector_length):
-        """Create a KnowledgeBase."""
-        self.mem = Pool()
-        self.entity_vector_length = entity_vector_length
+        """Create an InMemoryLookupKB."""
+        super().__init__(vocab, entity_vector_length)
         self._entry_index = PreshMap()
         self._alias_index = PreshMap()
-        self.vocab = vocab
         self._create_empty_vectors(dummy_hash=self.vocab.strings[""])

     def _initialize_entities(self, int64_t nr_entities):
@@ -104,11 +46,6 @@
         self._alias_index = PreshMap(nr_aliases + 1)
         self._aliases_table = alias_vec(nr_aliases + 1)

-    @property
-    def entity_vector_length(self):
-        """RETURNS (uint64): length of the entity vectors"""
-        return self.entity_vector_length
-
     def __len__(self):
         return self.get_size_entities()

@@ -286,7 +223,10 @@
         alias_entry.probs = probs
         self._aliases_table[alias_index] = alias_entry

-    def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
+    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
+        return self.get_alias_candidates(mention.text)  # type: ignore
+
+    def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
         """
         Return candidate entities for an alias. Each candidate defines the entity, the original alias,
         and the prior probability of that alias resolving to that entity.
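A hedged usage sketch (not from the commit) of how the refactored classes fit together: `KnowledgeBase` is now an abstract base that refuses direct instantiation (error E1046), while `InMemoryLookupKB` keeps the previous lookup behaviour. The entity ID, alias and vectors below are invented for illustration.

    import spacy
    from spacy.kb import KnowledgeBase, InMemoryLookupKB

    nlp = spacy.blank("en")

    # The abstract base can no longer be instantiated directly.
    try:
        KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
    except TypeError as err:
        print(err)  # E1046: KnowledgeBase is an abstract class ...

    # The concrete in-memory implementation keeps the old behaviour.
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.8])

    doc = nlp("Douglas Adams wrote books.")
    # Span-based lookup; internally delegates to get_alias_candidates(mention.text)
    print([(c.entity_, c.prior_prob) for c in kb.get_candidates(doc[0:2])])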
@@ -72,10 +72,10 @@ class CatalanLemmatizer(Lemmatizer):
                         oov_forms.append(form)
         if not forms:
             forms.extend(oov_forms)
-        if not forms and string in lookup_table.keys():
-            forms.append(self.lookup_lemmatize(token)[0])
+
+        # use lookups, and fall back to the token itself
         if not forms:
-            forms.append(string)
+            forms.append(lookup_table.get(string, [string])[0])
         forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms
@@ -280,7 +280,7 @@ _currency = (
 _punct = (
     r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ۔ ؛ ٪"
 )
-_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
+_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
 _hyphens = "- – — -- --- —— ~"

 # Various symbols like dingbats, but also emoji
@@ -53,11 +53,16 @@ class FrenchLemmatizer(Lemmatizer):
         rules = rules_table.get(univ_pos, [])
         string = string.lower()
         forms = []
+        # first try lookup in table based on upos
         if string in index:
             forms.append(string)
             self.cache[cache_key] = forms
             return forms
+
+        # then add anything in the exceptions table
         forms.extend(exceptions.get(string, []))
+
+        # if nothing found yet, use the rules
         oov_forms = []
         if not forms:
             for old, new in rules:
@@ -69,12 +74,14 @@ class FrenchLemmatizer(Lemmatizer):
                         forms.append(form)
                     else:
                         oov_forms.append(form)
+
+        # if still nothing, add the oov forms from rules
         if not forms:
             forms.extend(oov_forms)
-        if not forms and string in lookup_table.keys():
-            forms.append(self.lookup_lemmatize(token)[0])
+
+        # use lookups, which fall back to the token itself
         if not forms:
-            forms.append(string)
+            forms.append(lookup_table.get(string, [string])[0])
         forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms
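The fallback added to both the Catalan and French lemmatizers above reduces to one dictionary idiom; a small standalone illustration, with a plain dict standing in for the spaCy lookups table and made-up words:

    # Plain-dict stand-in for the lemmatizer's lookup table (illustrative values only).
    lookup_table = {"chevaux": ["cheval"]}

    def lookup_or_self(string: str) -> str:
        # lookup_table.get(string, [string])[0]: take the first lookup lemma,
        # or fall back to the token text itself when the word is unknown.
        return lookup_table.get(string, [string])[0]

    print(lookup_or_self("chevaux"))  # -> "cheval"
    print(lookup_or_self("inconnu"))  # -> "inconnu" (falls back to the token itself)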
@@ -1,11 +1,15 @@
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from ...language import Language, BaseDefaults


 class AncientGreekDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

spacy/lang/grc/punctuation.py (new file) | 46

@@ -0,0 +1,46 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from ..char_classes import CONCAT_QUOTES

_prefixes = (
    [
        "†",
        "⸏",
    ]
    + LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_CURRENCY
    + LIST_ICONS
)

_suffixes = (
    LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_ICONS
    + [
        "†",
        "⸎",
        r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
    ]
)

_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])—",
    ]
)

TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes
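A quick, hedged check (not part of the diff) that the new Ancient Greek punctuation rules take effect once they are wired into `AncientGreekDefaults` as shown above; the sample text is arbitrary.

    import spacy

    # A blank "grc" pipeline now picks up the prefix/suffix/infix rules above,
    # e.g. the dagger prefix and the dash after Greek characters should be split off.
    nlp = spacy.blank("grc")
    print([t.text for t in nlp("†μῆνιν ἄειδε—θεά")])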
@@ -28,7 +28,7 @@ class Russian(Language):
     assigns=["token.lemma"],
     default_config={
         "model": None,
-        "mode": "pymorphy2",
+        "mode": "pymorphy3",
         "overwrite": False,
         "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
     },
@@ -19,11 +19,11 @@ class RussianLemmatizer(Lemmatizer):
         model: Optional[Model],
         name: str = "lemmatizer",
         *,
-        mode: str = "pymorphy2",
+        mode: str = "pymorphy3",
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:
@@ -33,6 +33,16 @@ class RussianLemmatizer(Lemmatizer):
                 ) from None
             if getattr(self, "_morph", None) is None:
                 self._morph = MorphAnalyzer()
+        elif mode == "pymorphy3":
+            try:
+                from pymorphy3 import MorphAnalyzer
+            except ImportError:
+                raise ImportError(
+                    "The Russian lemmatizer mode 'pymorphy3' requires the "
+                    "pymorphy3 library. Install it with: pip install pymorphy3"
+                ) from None
+            if getattr(self, "_morph", None) is None:
+                self._morph = MorphAnalyzer()
         super().__init__(
             vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
         )
@@ -104,6 +114,9 @@ class RussianLemmatizer(Lemmatizer):
                 return [analyses[0].normal_form]
         return [string]

+    def pymorphy3_lemmatize(self, token: Token) -> List[str]:
+        return self.pymorphy2_lemmatize(token)
+

 def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
     gram_map = {
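A hedged configuration sketch (not in the diff): with the default mode switched to "pymorphy3", an explicit config entry like the one below selects it, while the older backends stay available; pymorphy3 has to be installed separately.

    # Requires: pip install pymorphy3 (per the ImportError message added above)
    import spacy

    nlp = spacy.blank("ru")
    # Explicitly select the new backend; "pymorphy2" / "pymorphy2_lookup" remain supported.
    nlp.add_pipe("lemmatizer", config={"mode": "pymorphy3"})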
@@ -1,9 +1,17 @@
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ...language import Language, BaseDefaults


 class SlovenianDefaults(BaseDefaults):
     stop_words = STOP_WORDS
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    prefixes = TOKENIZER_PREFIXES
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS


 class Slovenian(Language):
spacy/lang/sl/lex_attrs.py (new file) | 145

@@ -0,0 +1,145 @@
from ...attrs import LIKE_NUM
from ...attrs import IS_CURRENCY
import unicodedata


_num_words = set(
    """
nula ničla nič ena dva tri štiri pet šest sedem osem
devet deset enajst dvanajst trinajst štirinajst petnajst
šestnajst sedemnajst osemnajst devetnajst dvajset trideset štirideset
petdeset šestdest sedemdeset osemdeset devedeset sto tisoč
milijon bilijon trilijon kvadrilijon nešteto

en eden enega enemu ennem enim enih enima enimi ene eni eno
dveh dvema dvem dvoje trije treh trem tremi troje štirje štirih štirim štirimi
petih petim petimi šestih šestim šestimi sedmih sedmim sedmimi osmih osmim osmimi
devetih devetim devetimi desetih desetim desetimi enajstih enajstim enajstimi
dvanajstih dvanajstim dvanajstimi trinajstih trinajstim trinajstimi
šestnajstih šestnajstim šestnajstimi petnajstih petnajstim petnajstimi
sedemnajstih sedemnajstim sedemnajstimi osemnajstih osemnajstim osemnajstimi
devetnajstih devetnajstim devetnajstimi dvajsetih dvajsetim dvajsetimi
""".split()
)

_ordinal_words = set(
    """
prvi drugi tretji četrti peti šesti sedmi osmi
deveti deseti enajsti dvanajsti trinajsti štirinajsti
petnajsti šestnajsti sedemnajsti osemnajsti devetnajsti
dvajseti trideseti štirideseti petdeseti šestdeseti sedemdeseti
osemdeseti devetdeseti stoti tisoči milijonti bilijonti
trilijonti kvadrilijonti nešteti

prva druga tretja četrta peta šesta sedma osma
deveta deseta enajsta dvanajsta trinajsta štirnajsta
petnajsta šestnajsta sedemnajsta osemnajsta devetnajsta
dvajseta trideseta štirideseta petdeseta šestdeseta sedemdeseta
osemdeseta devetdeseta stota tisoča milijonta bilijonta
trilijonta kvadrilijonta nešteta

prvo drugo tretje četrto peto šestro sedmo osmo
deveto deseto enajsto dvanajsto trinajsto štirnajsto
petnajsto šestnajsto sedemnajsto osemnajsto devetnajsto
dvajseto trideseto štirideseto petdeseto šestdeseto sedemdeseto
osemdeseto devetdeseto stoto tisočo milijonto bilijonto
trilijonto kvadrilijonto nešteto

prvega drugega tretjega četrtega petega šestega sedmega osmega
devega desetega enajstega dvanajstega trinajstega štirnajstega
petnajstega šestnajstega sedemnajstega osemnajstega devetnajstega
dvajsetega tridesetega štiridesetega petdesetega šestdesetega sedemdesetega
osemdesetega devetdesetega stotega tisočega milijontega bilijontega
trilijontega kvadrilijontega neštetega

prvemu drugemu tretjemu četrtemu petemu šestemu sedmemu osmemu devetemu desetemu
enajstemu dvanajstemu trinajstemu štirnajstemu petnajstemu šestnajstemu sedemnajstemu
osemnajstemu devetnajstemu dvajsetemu tridesetemu štiridesetemu petdesetemu šestdesetemu
sedemdesetemu osemdesetemu devetdesetemu stotemu tisočemu milijontemu bilijontemu
trilijontemu kvadrilijontemu neštetemu

prvem drugem tretjem četrtem petem šestem sedmem osmem devetem desetem
enajstem dvanajstem trinajstem štirnajstem petnajstem šestnajstem sedemnajstem
osemnajstem devetnajstem dvajsetem tridesetem štiridesetem petdesetem šestdesetem
sedemdesetem osemdesetem devetdesetem stotem tisočem milijontem bilijontem
trilijontem kvadrilijontem neštetem

prvim drugim tretjim četrtim petim šestim sedtim osmim devetim desetim
enajstim dvanajstim trinajstim štirnajstim petnajstim šestnajstim sedemnajstim
osemnajstim devetnajstim dvajsetim tridesetim štiridesetim petdesetim šestdesetim
sedemdesetim osemdesetim devetdesetim stotim tisočim milijontim bilijontim
trilijontim kvadrilijontim neštetim

prvih drugih tretjih četrthih petih šestih sedmih osmih deveth desetih
enajstih dvanajstih trinajstih štirnajstih petnajstih šestnajstih sedemnajstih
osemnajstih devetnajstih dvajsetih tridesetih štiridesetih petdesetih šestdesetih
sedemdesetih osemdesetih devetdesetih stotih tisočih milijontih bilijontih
trilijontih kvadrilijontih nešteth

prvima drugima tretjima četrtima petima šestima sedmima osmima devetima desetima
enajstima dvanajstima trinajstima štirnajstima petnajstima šestnajstima sedemnajstima
osemnajstima devetnajstima dvajsetima tridesetima štiridesetima petdesetima šestdesetima
sedemdesetima osemdesetima devetdesetima stotima tisočima milijontima bilijontima
trilijontima kvadrilijontima neštetima

prve druge četrte pete šeste sedme osme devete desete
enajste dvanajste trinajste štirnajste petnajste šestnajste sedemnajste
osemnajste devetnajste dvajsete tridesete štiridesete petdesete šestdesete
sedemdesete osemdesete devetdesete stote tisoče milijonte bilijonte
trilijonte kvadrilijonte neštete

prvimi drugimi tretjimi četrtimi petimi šestimi sedtimi osmimi devetimi desetimi
enajstimi dvanajstimi trinajstimi štirnajstimi petnajstimi šestnajstimi sedemnajstimi
osemnajstimi devetnajstimi dvajsetimi tridesetimi štiridesetimi petdesetimi šestdesetimi
sedemdesetimi osemdesetimi devetdesetimi stotimi tisočimi milijontimi bilijontimi
trilijontimi kvadrilijontimi neštetimi
""".split()
)

_currency_words = set(
    """
evro evra evru evrom evrov evroma evrih evrom evre evri evr eur
cent centa centu cenom centov centoma centih centom cente centi
dolar dolarja dolarji dolarju dolarjem dolarjev dolarjema dolarjih dolarje usd
tolar tolarja tolarji tolarju tolarjem tolarjev tolarjema tolarjih tolarje tol
dinar dinarja dinarji dinarju dinarjem dinarjev dinarjema dinarjih dinarje din
funt funta funti funtu funtom funtov funtoma funtih funte gpb
forint forinta forinti forintu forintom forintov forintoma forintih forinte
zlot zlota zloti zlotu zlotom zlotov zlotoma zlotih zlote
rupij rupija rupiji rupiju rupijem rupijev rupijema rupijih rupije
jen jena jeni jenu jenom jenov jenoma jenih jene
kuna kuni kune kuno kun kunama kunah kunam kunami
marka marki marke markama markah markami
""".split()
)


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    if text_lower in _ordinal_words:
        return True
    return False


def is_currency(text):
    text_lower = text.lower()
    if text in _currency_words:
        return True
    for char in text:
        if unicodedata.category(char) != "Sc":
            return False
    return True


LEX_ATTRS = {LIKE_NUM: like_num, IS_CURRENCY: is_currency}
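A small, hedged check (not part of the diff) of the new Slovenian lexical attribute getters registered above; the sentence is invented and the printed values depend on the word lists shown.

    import spacy

    nlp = spacy.blank("sl")
    doc = nlp("Plačal je sto evrov in 3/4 .")
    # LIKE_NUM should now fire for Slovenian number words such as "sto" and for fractions;
    # IS_CURRENCY relies on the _currency_words list plus the Unicode "Sc" category.
    print([(t.text, t.like_num) for t in doc])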
spacy/lang/sl/punctuation.py (new file) | 84

@@ -0,0 +1,84 @@
from ..char_classes import (
    LIST_ELLIPSES,
    LIST_ICONS,
    HYPHENS,
    LIST_PUNCT,
    LIST_QUOTES,
    CURRENCY,
    UNITS,
    PUNCT,
    LIST_CURRENCY,
    CONCAT_QUOTES,
)
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
from ..char_classes import merge_chars
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES


INCLUDE_SPECIAL = ["\\+", "\\/", "\\•", "\\¯", "\\=", "\\×"] + HYPHENS.split("|")

_prefixes = INCLUDE_SPECIAL + BASE_TOKENIZER_PREFIXES

_suffixes = (
    INCLUDE_SPECIAL
    + LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_ICONS
    + [
        r"(?<=°[FfCcKk])\.",
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
        ),
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
        # split initials like J.K. Rowling
        r"(?<=[A-Z]\.)(?:[A-Z].)",
    ]
)

# a list of all suffixes following a hyphen that are shouldn't split (eg. BTC-jev)
# source: Obeliks tokenizer - https://github.com/clarinsi/obeliks/blob/master/obeliks/res/TokRulesPart1.txt
CONCAT_QUOTES = CONCAT_QUOTES.replace("'", "")
HYPHENS_PERMITTED = (
    "((a)|(evemu)|(evskega)|(i)|(jevega)|(jevska)|(jevskimi)|(jinemu)|(oma)|(ovim)|"
    "(ovski)|(e)|(evi)|(evskem)|(ih)|(jevem)|(jevske)|(jevsko)|(jini)|(ov)|(ovima)|"
    "(ovskih)|(em)|(evih)|(evskemu)|(ja)|(jevemu)|(jevskega)|(ji)|(jinih)|(ova)|"
    "(ovimi)|(ovskim)|(ema)|(evim)|(evski)|(je)|(jevi)|(jevskem)|(jih)|(jinim)|"
    "(ove)|(ovo)|(ovskima)|(ev)|(evima)|(evskih)|(jem)|(jevih)|(jevskemu)|(jin)|"
    "(jinima)|(ovega)|(ovska)|(ovskimi)|(eva)|(evimi)|(evskim)|(jema)|(jevim)|"
    "(jevski)|(jina)|(jinimi)|(ovem)|(ovske)|(ovsko)|(eve)|(evo)|(evskima)|(jev)|"
    "(jevima)|(jevskih)|(jine)|(jino)|(ovemu)|(ovskega)|(u)|(evega)|(evska)|"
    "(evskimi)|(jeva)|(jevimi)|(jevskim)|(jinega)|(ju)|(ovi)|(ovskem)|(evem)|"
    "(evske)|(evsko)|(jeve)|(jevo)|(jevskima)|(jinem)|(om)|(ovih)|(ovskemu)|"
    "(ovec)|(ovca)|(ovcu)|(ovcem)|(ovcev)|(ovcema)|(ovcih)|(ovci)|(ovce)|(ovcimi)|"
    "(evec)|(evca)|(evcu)|(evcem)|(evcev)|(evcema)|(evcih)|(evci)|(evce)|(evcimi)|"
    "(jevec)|(jevca)|(jevcu)|(jevcem)|(jevcev)|(jevcema)|(jevcih)|(jevci)|(jevce)|"
    "(jevcimi)|(ovka)|(ovke)|(ovki)|(ovko)|(ovk)|(ovkama)|(ovkah)|(ovkam)|(ovkami)|"
    "(evka)|(evke)|(evki)|(evko)|(evk)|(evkama)|(evkah)|(evkam)|(evkami)|(jevka)|"
    "(jevke)|(jevki)|(jevko)|(jevk)|(jevkama)|(jevkah)|(jevkam)|(jevkami)|(timi)|"
    "(im)|(ima)|(a)|(imi)|(e)|(o)|(ega)|(ti)|(em)|(tih)|(emu)|(tim)|(i)|(tima)|"
    "(ih)|(ta)|(te)|(to)|(tega)|(tem)|(temu))"
)

_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9])(?:{h})(?!{hp}$)(?=[{a}])".format(
            a=ALPHA, h=HYPHENS, hp=HYPHENS_PERMITTED
        ),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)


TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes
@ -1,326 +1,84 @@
# Source: https://github.com/stopwords-iso/stopwords-sl
# Removed various words that are not normally considered stop words, such as months.

STOP_WORDS = set(
"""
a ali
b bi bil bila bile bili bilo biti blizu bo bodo bojo bolj bom bomo
boste bova boš brez
c cel cela celi celo
č če često četrta četrtek četrti četrto čez čigav
d da daleč dan danes datum deset deseta deseti deseto devet
deveta deveti deveto do dober dobra dobri dobro dokler dol dolg
dolga dolgi dovolj drug druga drugi drugo dva dve
e eden en ena ene eni enkrat eno etc.
f
g g. ga ga. gor gospa gospod
h halo
i idr. ii iii in iv ix iz
j jaz je ji jih jim jo jutri
k kadarkoli kaj kajti kako kakor kamor kamorkoli kar karkoli
katerikoli kdaj kdo kdorkoli ker ki kje kjer kjerkoli
ko koder koderkoli koga komu kot kratek kratka kratke kratki
l lahka lahke lahki lahko le lep lepa lepe lepi lepo leto
m majhen majhna majhni malce malo manj me med medtem mene
mesec mi midva midve mnogo moj moja moje mora morajo moram
moramo morate moraš morem mu
n na nad naj najina najino najmanj naju največ nam narobe
nas nato nazaj naš naša naše ne nedavno nedelja nek neka
nekaj nekatere nekateri nekatero nekdo neke nekega neki
nekje neko nekoga nekoč ni nikamor nikdar nikjer nikoli
nič nje njega njegov njegova njegovo njej njemu njen
njena njeno nji njih njihov njihova njihovo njiju njim
njo njun njuna njuno no nocoj npr.
o ob oba obe oboje od odprt odprta odprti okoli on
onadva one oni onidve osem osma osmi osmo oz.
p pa pet peta petek peti peto po pod pogosto poleg poln
polna polni polno ponavadi ponedeljek ponovno potem
povsod pozdravljen pozdravljeni prav prava prave pravi
pravo prazen prazna prazno prbl. precej pred prej preko
pri pribl. približno primer pripravljen pripravljena
pripravljeni proti prva prvi prvo
r ravno redko res reč
s saj sam sama same sami samo se sebe sebi sedaj sedem
sedma sedmi sedmo sem seveda si sicer skoraj skozi slab smo
so sobota spet sreda srednja srednji sta ste stran stvar sva
š šest šesta šesti šesto štiri
t ta tak taka take taki tako takoj tam te tebe tebi tega
težak težka težki težko ti tista tiste tisti tisto tj.
tja to toda torek tretja tretje tretji tri tu tudi tukaj
tvoj tvoja tvoje
u
v vaju vam vas vaš vaša vaše ve vedno velik velika veliki
veliko vendar ves več vi vidva vii viii visok visoka visoke
visoki vsa vsaj vsak vsaka vsakdo vsake vsaki vsakomur vse
vsega vsi vso včasih včeraj
x
z za zadaj zadnji zakaj zaprta zaprti zaprto zdaj zelo zunaj
ž že
""".split()
)
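Note: words listed in STOP_WORDS surface as the is_stop lexeme flag on tokens. A quick check, assuming the installed spaCy version ships these Slovenian defaults:

import spacy

nlp = spacy.blank("sl")
doc = nlp("ali je to res")
print([(t.text, t.is_stop) for t in doc])  # words from STOP_WORDS are flagged is_stop=True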
272
spacy/lang/sl/tokenizer_exceptions.py
Normal file

@ -0,0 +1,272 @@
from typing import Dict, List

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc


_exc: Dict[str, List[Dict]] = {}


_other_exc = {
    "t.i.": [{ORTH: "t.", NORM: "tako"}, {ORTH: "i.", NORM: "imenovano"}],
    "t.j.": [{ORTH: "t.", NORM: "to"}, {ORTH: "j.", NORM: "je"}],
    "T.j.": [{ORTH: "T.", NORM: "to"}, {ORTH: "j.", NORM: "je"}],
    "d.o.o.": [
        {ORTH: "d.", NORM: "družba"},
        {ORTH: "o.", NORM: "omejeno"},
        {ORTH: "o.", NORM: "odgovornostjo"},
    ],
    "D.O.O.": [
        {ORTH: "D.", NORM: "družba"},
        {ORTH: "O.", NORM: "omejeno"},
        {ORTH: "O.", NORM: "odgovornostjo"},
    ],
    "d.n.o.": [
        {ORTH: "d.", NORM: "družba"},
        {ORTH: "n.", NORM: "neomejeno"},
        {ORTH: "o.", NORM: "odgovornostjo"},
    ],
    "D.N.O.": [
        {ORTH: "D.", NORM: "družba"},
        {ORTH: "N.", NORM: "neomejeno"},
        {ORTH: "O.", NORM: "odgovornostjo"},
    ],
    "d.d.": [{ORTH: "d.", NORM: "delniška"}, {ORTH: "d.", NORM: "družba"}],
    "D.D.": [{ORTH: "D.", NORM: "delniška"}, {ORTH: "D.", NORM: "družba"}],
    "s.p.": [{ORTH: "s.", NORM: "samostojni"}, {ORTH: "p.", NORM: "podjetnik"}],
    "S.P.": [{ORTH: "S.", NORM: "samostojni"}, {ORTH: "P.", NORM: "podjetnik"}],
    "l.r.": [{ORTH: "l.", NORM: "lastno"}, {ORTH: "r.", NORM: "ročno"}],
    "le-te": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "te"}],
    "Le-te": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "te"}],
    "le-ti": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "ti"}],
    "Le-ti": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "ti"}],
    "le-to": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "to"}],
    "Le-to": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "to"}],
    "le-ta": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "ta"}],
    "Le-ta": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "ta"}],
    "le-tega": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "tega"}],
    "Le-tega": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "tega"}],
}

_exc.update(_other_exc)


for exc_data in [
{ORTH: "adm.", NORM: "administracija"},
|
||||||
|
{ORTH: "aer.", NORM: "aeronavtika"},
|
||||||
|
{ORTH: "agr.", NORM: "agronomija"},
|
||||||
|
{ORTH: "amer.", NORM: "ameriško"},
|
||||||
|
{ORTH: "anat.", NORM: "anatomija"},
|
||||||
|
{ORTH: "angl.", NORM: "angleški"},
|
||||||
|
{ORTH: "ant.", NORM: "antonim"},
|
||||||
|
{ORTH: "antr.", NORM: "antropologija"},
|
||||||
|
{ORTH: "apr.", NORM: "april"},
|
||||||
|
{ORTH: "arab.", NORM: "arabsko"},
|
||||||
|
{ORTH: "arheol.", NORM: "arheologija"},
|
||||||
|
{ORTH: "arhit.", NORM: "arhitektura"},
|
||||||
|
{ORTH: "avg.", NORM: "avgust"},
|
||||||
|
{ORTH: "avstr.", NORM: "avstrijsko"},
|
||||||
|
{ORTH: "avt.", NORM: "avtomobilizem"},
|
||||||
|
{ORTH: "bibl.", NORM: "biblijsko"},
|
||||||
|
{ORTH: "biokem.", NORM: "biokemija"},
|
||||||
|
{ORTH: "biol.", NORM: "biologija"},
|
||||||
|
{ORTH: "bolg.", NORM: "bolgarski"},
|
||||||
|
{ORTH: "bot.", NORM: "botanika"},
|
||||||
|
{ORTH: "cit.", NORM: "citat"},
|
||||||
|
{ORTH: "daj.", NORM: "dajalnik"},
|
||||||
|
{ORTH: "del.", NORM: "deležnik"},
|
||||||
|
{ORTH: "ed.", NORM: "ednina"},
|
||||||
|
{ORTH: "etn.", NORM: "etnografija"},
|
||||||
|
{ORTH: "farm.", NORM: "farmacija"},
|
||||||
|
{ORTH: "filat.", NORM: "filatelija"},
|
||||||
|
{ORTH: "filoz.", NORM: "filozofija"},
|
||||||
|
{ORTH: "fin.", NORM: "finančništvo"},
|
||||||
|
{ORTH: "fiz.", NORM: "fizika"},
|
||||||
|
{ORTH: "fot.", NORM: "fotografija"},
|
||||||
|
{ORTH: "fr.", NORM: "francoski"},
|
||||||
|
{ORTH: "friz.", NORM: "frizerstvo"},
|
||||||
|
{ORTH: "gastr.", NORM: "gastronomija"},
|
||||||
|
{ORTH: "geogr.", NORM: "geografija"},
|
||||||
|
{ORTH: "geol.", NORM: "geologija"},
|
||||||
|
{ORTH: "geom.", NORM: "geometrija"},
|
||||||
|
{ORTH: "germ.", NORM: "germanski"},
|
||||||
|
{ORTH: "gl.", NORM: "glej"},
|
||||||
|
{ORTH: "glag.", NORM: "glagolski"},
|
||||||
|
{ORTH: "glasb.", NORM: "glasba"},
|
||||||
|
{ORTH: "gled.", NORM: "gledališče"},
|
||||||
|
{ORTH: "gost.", NORM: "gostinstvo"},
|
||||||
|
{ORTH: "gozd.", NORM: "gozdarstvo"},
|
||||||
|
{ORTH: "gr.", NORM: "grški"},
|
||||||
|
{ORTH: "grad.", NORM: "gradbeništvo"},
|
||||||
|
{ORTH: "hebr.", NORM: "hebrejsko"},
|
||||||
|
{ORTH: "hrv.", NORM: "hrvaško"},
|
||||||
|
{ORTH: "ide.", NORM: "indoevropsko"},
|
||||||
|
{ORTH: "igr.", NORM: "igre"},
|
||||||
|
{ORTH: "im.", NORM: "imenovalnik"},
|
||||||
|
{ORTH: "iron.", NORM: "ironično"},
|
||||||
|
{ORTH: "it.", NORM: "italijanski"},
|
||||||
|
{ORTH: "itd.", NORM: "in tako dalje"},
|
||||||
|
{ORTH: "itn.", NORM: "in tako naprej"},
|
||||||
|
{ORTH: "ipd.", NORM: "in podobno"},
|
||||||
|
{ORTH: "jap.", NORM: "japonsko"},
|
||||||
|
{ORTH: "jul.", NORM: "julij"},
|
||||||
|
{ORTH: "jun.", NORM: "junij"},
|
||||||
|
{ORTH: "kit.", NORM: "kitajsko"},
|
||||||
|
{ORTH: "knj.", NORM: "knjižno"},
|
||||||
|
{ORTH: "knjiž.", NORM: "knjižno"},
|
||||||
|
{ORTH: "kor.", NORM: "koreografija"},
|
||||||
|
{ORTH: "lat.", NORM: "latinski"},
|
||||||
|
{ORTH: "les.", NORM: "lesna stroka"},
|
||||||
|
{ORTH: "lingv.", NORM: "lingvistika"},
|
||||||
|
{ORTH: "lit.", NORM: "literarni"},
|
||||||
|
{ORTH: "ljubk.", NORM: "ljubkovalno"},
|
||||||
|
{ORTH: "lov.", NORM: "lovstvo"},
|
||||||
|
{ORTH: "m.", NORM: "moški"},
|
||||||
|
{ORTH: "mak.", NORM: "makedonski"},
|
||||||
|
{ORTH: "mar.", NORM: "marec"},
|
||||||
|
{ORTH: "mat.", NORM: "matematika"},
|
||||||
|
{ORTH: "med.", NORM: "medicina"},
|
||||||
|
{ORTH: "meh.", NORM: "mehiško"},
|
||||||
|
{ORTH: "mest.", NORM: "mestnik"},
|
||||||
|
{ORTH: "mdr.", NORM: "med drugim"},
|
||||||
|
{ORTH: "min.", NORM: "mineralogija"},
|
||||||
|
{ORTH: "mitol.", NORM: "mitologija"},
|
||||||
|
{ORTH: "mn.", NORM: "množina"},
|
||||||
|
{ORTH: "mont.", NORM: "montanistika"},
|
||||||
|
{ORTH: "muz.", NORM: "muzikologija"},
|
||||||
|
{ORTH: "nam.", NORM: "namenilnik"},
|
||||||
|
{ORTH: "nar.", NORM: "narečno"},
|
||||||
|
{ORTH: "nav.", NORM: "navadno"},
|
||||||
|
{ORTH: "nedol.", NORM: "nedoločnik"},
|
||||||
|
{ORTH: "nedov.", NORM: "nedovršni"},
|
||||||
|
{ORTH: "neprav.", NORM: "nepravilno"},
|
||||||
|
{ORTH: "nepreh.", NORM: "neprehodno"},
|
||||||
|
{ORTH: "neskl.", NORM: "nesklonljiv(o)"},
|
||||||
|
{ORTH: "nestrok.", NORM: "nestrokovno"},
|
||||||
|
{ORTH: "num.", NORM: "numizmatika"},
|
||||||
|
{ORTH: "npr.", NORM: "na primer"},
|
||||||
|
{ORTH: "obrt.", NORM: "obrtništvo"},
|
||||||
|
{ORTH: "okt.", NORM: "oktober"},
|
||||||
|
{ORTH: "or.", NORM: "orodnik"},
|
||||||
|
{ORTH: "os.", NORM: "oseba"},
|
||||||
|
{ORTH: "otr.", NORM: "otroško"},
|
||||||
|
{ORTH: "oz.", NORM: "oziroma"},
|
||||||
|
{ORTH: "pal.", NORM: "paleontologija"},
|
||||||
|
{ORTH: "papir.", NORM: "papirništvo"},
|
||||||
|
{ORTH: "ped.", NORM: "pedagogika"},
|
||||||
|
{ORTH: "pisar.", NORM: "pisarniško"},
|
||||||
|
{ORTH: "pog.", NORM: "pogovorno"},
|
||||||
|
{ORTH: "polit.", NORM: "politika"},
|
||||||
|
{ORTH: "polj.", NORM: "poljsko"},
|
||||||
|
{ORTH: "poljud.", NORM: "poljudno"},
|
||||||
|
{ORTH: "preg.", NORM: "pregovor"},
|
||||||
|
{ORTH: "preh.", NORM: "prehodno"},
|
||||||
|
{ORTH: "pren.", NORM: "preneseno"},
|
||||||
|
{ORTH: "prid.", NORM: "pridevnik"},
|
||||||
|
{ORTH: "prim.", NORM: "primerjaj"},
|
||||||
|
{ORTH: "prisl.", NORM: "prislov"},
|
||||||
|
{ORTH: "psih.", NORM: "psihologija"},
|
||||||
|
{ORTH: "psiht.", NORM: "psihiatrija"},
|
||||||
|
{ORTH: "rad.", NORM: "radiotehnika"},
|
||||||
|
{ORTH: "rač.", NORM: "računalništvo"},
|
||||||
|
{ORTH: "rib.", NORM: "ribištvo"},
|
||||||
|
{ORTH: "rod.", NORM: "rodilnik"},
|
||||||
|
{ORTH: "rus.", NORM: "rusko"},
|
||||||
|
{ORTH: "s.", NORM: "srednji"},
|
||||||
|
{ORTH: "sam.", NORM: "samostalniški"},
|
||||||
|
{ORTH: "sed.", NORM: "sedanjik"},
|
||||||
|
{ORTH: "sep.", NORM: "september"},
|
||||||
|
{ORTH: "slabš.", NORM: "slabšalno"},
|
||||||
|
{ORTH: "slovan.", NORM: "slovansko"},
|
||||||
|
{ORTH: "slovaš.", NORM: "slovaško"},
|
||||||
|
{ORTH: "srb.", NORM: "srbsko"},
|
||||||
|
{ORTH: "star.", NORM: "starinsko"},
|
||||||
|
{ORTH: "stil.", NORM: "stilno"},
|
||||||
|
{ORTH: "sv.", NORM: "svet(i)"},
|
||||||
|
{ORTH: "teh.", NORM: "tehnika"},
|
||||||
|
{ORTH: "tisk.", NORM: "tiskarstvo"},
|
||||||
|
{ORTH: "tj.", NORM: "to je"},
|
||||||
|
{ORTH: "tož.", NORM: "tožilnik"},
|
||||||
|
{ORTH: "trg.", NORM: "trgovina"},
|
||||||
|
{ORTH: "ukr.", NORM: "ukrajinski"},
|
||||||
|
{ORTH: "um.", NORM: "umetnost"},
|
||||||
|
{ORTH: "vel.", NORM: "velelnik"},
|
||||||
|
{ORTH: "vet.", NORM: "veterina"},
|
||||||
|
{ORTH: "vez.", NORM: "veznik"},
|
||||||
|
{ORTH: "vn.", NORM: "visokonemško"},
|
||||||
|
{ORTH: "voj.", NORM: "vojska"},
|
||||||
|
{ORTH: "vrtn.", NORM: "vrtnarstvo"},
|
||||||
|
{ORTH: "vulg.", NORM: "vulgarno"},
|
||||||
|
{ORTH: "vznes.", NORM: "vzneseno"},
|
||||||
|
{ORTH: "zal.", NORM: "založništvo"},
|
||||||
|
{ORTH: "zastar.", NORM: "zastarelo"},
|
||||||
|
{ORTH: "zgod.", NORM: "zgodovina"},
|
||||||
|
{ORTH: "zool.", NORM: "zoologija"},
|
||||||
|
{ORTH: "čeb.", NORM: "čebelarstvo"},
|
||||||
|
{ORTH: "češ.", NORM: "češki"},
|
||||||
|
{ORTH: "člov.", NORM: "človeškost"},
|
||||||
|
{ORTH: "šah.", NORM: "šahovski"},
|
||||||
|
{ORTH: "šalj.", NORM: "šaljivo"},
|
||||||
|
{ORTH: "šp.", NORM: "španski"},
|
||||||
|
{ORTH: "špan.", NORM: "špansko"},
|
||||||
|
{ORTH: "šport.", NORM: "športni"},
|
||||||
|
{ORTH: "štev.", NORM: "števnik"},
|
||||||
|
{ORTH: "šved.", NORM: "švedsko"},
|
||||||
|
{ORTH: "švic.", NORM: "švicarsko"},
|
||||||
|
{ORTH: "ž.", NORM: "ženski"},
|
||||||
|
{ORTH: "žarg.", NORM: "žargonsko"},
|
||||||
|
{ORTH: "žel.", NORM: "železnica"},
|
||||||
|
{ORTH: "živ.", NORM: "živost"},
]:
    _exc[exc_data[ORTH]] = [exc_data]


abbrv = """
Co. Ch. DIPL. DR. Dr. Ev. Inc. Jr. Kr. Mag. M. MR. Mr. Mt. Murr. Npr. OZ.
|
||||||
|
Opr. Osn. Prim. Roj. ST. Sim. Sp. Sred. St. Sv. Škofl. Tel. UR. Zb.
|
||||||
|
a. aa. ab. abc. abit. abl. abs. abt. acc. accel. add. adj. adv. aet. afr. akad. al. alban. all. alleg.
|
||||||
|
alp. alt. alter. alžir. am. an. andr. ang. anh. anon. ans. antrop. apoc. app. approx. apt. ar. arc. arch.
|
||||||
|
arh. arr. as. asist. assist. assoc. asst. astr. attn. aug. avstral. az. b. bab. bal. bbl. bd. belg. bioinf.
|
||||||
|
biomed. bk. bl. bn. borg. bp. br. braz. brit. bros. broš. bt. bu. c. ca. cal. can. cand. cantab. cap. capt.
|
||||||
|
cat. cath. cc. cca. cd. cdr. cdre. cent. cerkv. cert. cf. cfr. ch. chap. chem. chr. chs. cic. circ. civ. cl.
|
||||||
|
cm. cmd. cnr. co. cod. col. coll. colo. com. comp. con. conc. cond. conn. cons. cont. coop. corr. cost. cp.
|
||||||
|
cpl. cr. crd. cres. cresc. ct. cu. d. dan. dat. davč. ddr. dec. ded. def. dem. dent. dept. dia. dip. dipl.
|
||||||
|
dir. disp. diss. div. do. doc. dok. dol. doo. dop. dott. dr. dram. druž. družb. drž. dt. duh. dur. dvr. dwt. e.
|
||||||
|
ea. ecc. eccl. eccles. econ. edn. egipt. egr. ekon. eksp. el. em. enc. eng. eo. ep. err. esp. esq. est.
|
||||||
|
et. etc. etnogr. etnol. ev. evfem. evr. ex. exc. excl. exp. expl. ext. exx. f. fa. facs. fak. faks. fas.
|
||||||
|
fasc. fco. fcp. feb. febr. fec. fed. fem. ff. fff. fid. fig. fil. film. fiziol. fiziot. flam. fm. fo. fol. folk.
|
||||||
|
frag. fran. franc. fsc. g. ga. gal. gdč. ge. gen. geod. geog. geotehnol. gg. gimn. glas. glav. gnr. go. gor.
|
||||||
|
gosp. gp. graf. gram. gren. grš. gs. h. hab. hf. hist. ho. hort. i. ia. ib. ibid. id. idr. idridr. ill. imen.
|
||||||
|
imp. impf. impr. in. inc. incl. ind. indus. inf. inform. ing. init. ins. int. inv. inšp. inštr. inž. is. islam.
|
||||||
|
ist. ital. iur. iz. izbr. izd. izg. izgr. izr. izv. j. jak. jam. jan. jav. je. jez. jr. jsl. jud. jug.
|
||||||
|
jugoslovan. jur. juž. jv. jz. k. kal. kan. kand. kat. kdo. kem. kip. kmet. kol. kom. komp. konf. kont. kost. kov.
|
||||||
|
kp. kpfw. kr. kraj. krat. kub. kult. kv. kval. l. la. lab. lb. ld. let. lib. lik. litt. lj. ljud. ll. loc. log.
|
||||||
|
loč. lt. ma. madž. mag. manag. manjš. masc. mass. mater. max. maxmax. mb. md. mech. medic. medij. medn.
|
||||||
|
mehč. mem. menedž. mes. mess. metal. meteor. meteorol. mex. mi. mikr. mil. minn. mio. misc. miss. mit. mk.
|
||||||
|
mkt. ml. mlad. mlle. mlr. mm. mme. množ. mo. moj. moš. možn. mr. mrd. mrs. ms. msc. msgr. mt. murr. mus. mut.
|
||||||
|
n. na. nad. nadalj. nadom. nagl. nakl. namer. nan. naniz. nasl. nat. navt. nač. ned. nem. nik. nizoz. nm. nn.
|
||||||
|
no. nom. norv. notr. nov. novogr. ns. o. ob. obd. obj. oblač. obl. oblik. obr. obraz. obs. obst. obt. obč. oc.
|
||||||
|
oct. od. odd. odg. odn. odst. odv. oec. off. ok. okla. okr. ont. oo. op. opis. opp. opr. orch. ord. ore. oreg.
|
||||||
|
org. orient. orig. ork. ort. oseb. osn. ot. ozir. ošk. p. pag. par. para. parc. parl. part. past. pat. pdk.
|
||||||
|
pen. perf. pert. perz. pesn. pet. pev. pf. pfc. ph. pharm. phil. pis. pl. po. pod. podr. podaljš. pogl. pogoj. pojm.
|
||||||
|
pok. pokr. pol. poljed. poljub. polu. pom. pomen. pon. ponov. pop. por. port. pos. posl. posn. pov. pp. ppl. pr.
|
||||||
|
praet. prav. pravopis. pravosl. preb. pred. predl. predm. predp. preds. pref. pregib. prel. prem. premen. prep.
|
||||||
|
pres. pret. prev. pribl. prih. pril. primerj. primor. prip. pripor. prir. prist. priv. proc. prof. prog. proiz.
|
||||||
|
prom. pron. prop. prot. protest. prov. ps. pss. pt. publ. pz. q. qld. qu. quad. que. r. racc. rastl. razgl.
|
||||||
|
razl. razv. rd. red. ref. reg. rel. relig. rep. repr. rer. resp. rest. ret. rev. revol. rež. rim. rist. rkp. rm.
|
||||||
|
roj. rom. romun. rp. rr. rt. rud. ruš. ry. sal. samogl. san. sc. scen. sci. scr. sdv. seg. sek. sen. sept. ser.
|
||||||
|
sev. sg. sgt. sh. sig. sigg. sign. sim. sin. sing. sinh. skand. skl. sklad. sklanj. sklep. skr. sl. slik. slov.
|
||||||
|
slovak. slovn. sn. so. sob. soc. sociol. sod. sopomen. sopr. sor. sov. sovj. sp. spec. spl. spr. spreg. sq. sr.
|
||||||
|
sre. sred. sredoz. srh. ss. ssp. st. sta. stan. stanstar. stcsl. ste. stim. stol. stom. str. stroj. strok. stsl.
|
||||||
|
stud. sup. supl. suppl. svet. sz. t. tab. tech. ted. tehn. tehnol. tek. teks. tekst. tel. temp. ten. teol. ter.
|
||||||
|
term. test. th. theol. tim. tip. tisočl. tit. tl. tol. tolmač. tom. tor. tov. tr. trad. traj. trans. tren.
|
||||||
|
trib. tril. trop. trp. trž. ts. tt. tu. tur. turiz. tvor. tvorb. tč. u. ul. umet. un. univ. up. upr. ur. urad.
|
||||||
|
us. ust. utr. v. va. val. var. varn. ven. ver. verb. vest. vezal. vic. vis. viv. viz. viš. vod. vok. vol. vpr.
|
||||||
|
vrst. vrstil. vs. vv. vzd. vzg. vzh. vzor. w. wed. wg. wk. x. y. z. zah. zaim. zak. zap. zasl. zavar. zač. zb.
|
||||||
|
združ. zg. zn. znan. znanstv. zoot. zun. zv. zvd. á. é. ć. č. čas. čet. čl. člen. čustv. đ. ľ. ł. ş. ŠT. š. šir.
|
||||||
|
škofl. škot. šol. št. števil. štud. ů. ű. žen. žival.
""".split()


for orth in abbrv:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
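Note: each exception above keeps an abbreviation together as a fixed sequence of subtokens with NORM values, instead of letting the punctuation rules split on every period. A small illustration, assuming a spaCy build that includes these Slovenian rules:

import spacy

nlp = spacy.blank("sl")
doc = nlp("Podjetje d.o.o. je bilo ustanovljeno leta 2020.")
# "d.o.o." is kept as the tokens "d.", "o.", "o." with the norms
# "družba", "omejeno", "odgovornostjo" from the exception table.
print([(t.text, t.norm_) for t in doc])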
@ -29,7 +29,7 @@ class Ukrainian(Language):
    assigns=["token.lemma"],
    default_config={
        "model": None,
        "mode": "pymorphy2",
        "mode": "pymorphy3",
        "overwrite": False,
        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
    },
@ -14,11 +14,11 @@ class UkrainianLemmatizer(RussianLemmatizer):
        model: Optional[Model],
        name: str = "lemmatizer",
        *,
        mode: str = "pymorphy2",
        mode: str = "pymorphy3",
        overwrite: bool = False,
        scorer: Optional[Callable] = lemmatizer_score,
    ) -> None:
        if mode == "pymorphy2":
        if mode in {"pymorphy2", "pymorphy2_lookup"}:
            try:
                from pymorphy2 import MorphAnalyzer
            except ImportError:

@ -29,6 +29,17 @@ class UkrainianLemmatizer(RussianLemmatizer):
                ) from None
            if getattr(self, "_morph", None) is None:
                self._morph = MorphAnalyzer(lang="uk")
        elif mode == "pymorphy3":
            try:
                from pymorphy3 import MorphAnalyzer
            except ImportError:
                raise ImportError(
                    "The Ukrainian lemmatizer mode 'pymorphy3' requires the "
                    "pymorphy3 library and dictionaries. Install them with: "
                    "pip install pymorphy3 pymorphy3-dicts-uk"
                ) from None
            if getattr(self, "_morph", None) is None:
                self._morph = MorphAnalyzer(lang="uk")
        super().__init__(
            vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
        )
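Note: with this change "pymorphy3" becomes the default mode for the Ukrainian lemmatizer. A hedged usage sketch, assuming pymorphy3 and pymorphy3-dicts-uk are installed:

import spacy

nlp = spacy.blank("uk")
# "pymorphy3" is the default mode after this change; spelled out here for clarity.
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy3"})
nlp.initialize()
doc = nlp("книги")
# Without a tagger the analyzer has less context, so some lemmas fall back to the surface form.
print([t.lemma_ for t in doc])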
@ -1,11 +1,12 @@
from pathlib import Path
from typing import Optional, Callable, Iterable, List, Tuple
from thinc.types import Floats2d
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
from thinc.api import chain, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged
from thinc.api import Model, Maxout, Linear, tuplify, Ragged

from ...util import registry
from ...kb import KnowledgeBase, Candidate, get_candidates
from ...kb import KnowledgeBase, InMemoryLookupKB
from ...kb import Candidate, get_candidates, get_candidates_batch
from ...vocab import Vocab
from ...tokens import Span, Doc
from ..extract_spans import extract_spans

@ -78,9 +79,11 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab


@registry.misc("spacy.KBFromFile.v1")
def load_kb(
    kb_path: Path,
) -> Callable[[Vocab], KnowledgeBase]:
    def kb_from_file(vocab: Vocab):
        kb = InMemoryLookupKB(vocab, entity_vector_length=1)
        kb.from_disk(kb_path)
        return kb

@ -88,9 +91,11 @@ def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:


@registry.misc("spacy.EmptyKB.v1")
def empty_kb(
    entity_vector_length: int,
) -> Callable[[Vocab], KnowledgeBase]:
    def empty_kb_factory(vocab: Vocab):
        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)

    return empty_kb_factory

@ -98,3 +103,10 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
@registry.misc("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
    return get_candidates


@registry.misc("spacy.CandidateBatchGenerator.v1")
def create_candidates_batch() -> Callable[
    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
]:
    return get_candidates_batch
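Note: spacy.CandidateBatchGenerator.v1 simply returns get_candidates_batch, which maps the per-mention lookup over a batch of mentions. A sketch of a custom replacement registered the same way; the registry name custom_candidate_batch.v1 is made up for illustration:

from typing import Callable, Iterable
from spacy.kb import Candidate, KnowledgeBase, get_candidates
from spacy.tokens import Span
from spacy.util import registry


@registry.misc("custom_candidate_batch.v1")
def create_custom_batch() -> Callable[
    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
]:
    def get_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
        # Fall back to the per-mention lookup; a real implementation could
        # batch the KB queries instead of looping.
        return [get_candidates(kb, mention) for mention in mentions]

    return get_batch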
@ -1,7 +1,6 @@
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
from typing import Sequence, Tuple, Union
from typing import Tuple
from collections import Counter
from copy import deepcopy
from itertools import islice
import numpy as np

@ -149,9 +148,7 @@ class EditTreeLemmatizer(TrainablePipe):
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            n_labels = len(self.cfg["labels"])
            guesses: List[Ints2d] = [
                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
            ]
            guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
            assert len(guesses) == n_docs
            return guesses
        scores = self.model.predict(docs)
@ -53,9 +53,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
        "incl_context": True,
        "entity_vector_length": 64,
        "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
        "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
        "overwrite": True,
        "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
        "use_gold_ents": True,
        "candidates_batch_size": 1,
        "threshold": None,
    },
    default_score_weights={

@ -75,9 +77,13 @@ def make_entity_linker(
    incl_context: bool,
    entity_vector_length: int,
    get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
    get_candidates_batch: Callable[
        [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
    ],
    overwrite: bool,
    scorer: Optional[Callable],
    use_gold_ents: bool,
    candidates_batch_size: int,
    threshold: Optional[float] = None,
):
    """Construct an EntityLinker component.

@ -90,17 +96,21 @@ def make_entity_linker(
    incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
    incl_context (bool): Whether or not to include the local context in the model.
    entity_vector_length (int): Size of encoding vectors in the KB.
    get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
        produces a list of candidates, given a certain knowledge base and a textual mention.
    get_candidates_batch (
        Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
    ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
    scorer (Optional[Callable]): The scoring method.
    use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
        component must provide entity annotations.
    candidates_batch_size (int): Size of batches for entity candidate generation.
    threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
        prediction is discarded. If None, predictions are not filtered by any threshold.
    """

    if not model.attrs.get("include_span_maker", False):
        # The only difference in arguments here is that use_gold_ents and threshold aren't available.
        return EntityLinker_v1(
            nlp.vocab,
            model,

@ -124,9 +134,11 @@ def make_entity_linker(
        incl_context=incl_context,
        entity_vector_length=entity_vector_length,
        get_candidates=get_candidates,
        get_candidates_batch=get_candidates_batch,
        overwrite=overwrite,
        scorer=scorer,
        use_gold_ents=use_gold_ents,
        candidates_batch_size=candidates_batch_size,
        threshold=threshold,
    )
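Note: the two new settings surface as ordinary config keys when the pipe is added. A hedged sketch; the batch size of 4 is an arbitrary example value:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "entity_linker",
    config={
        # Look up candidates for 4 mentions at a time; the default of 1 keeps
        # the previous per-mention behaviour.
        "candidates_batch_size": 4,
        "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
    },
)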
@ -160,9 +172,13 @@ class EntityLinker(TrainablePipe):
        incl_context: bool,
        entity_vector_length: int,
        get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
        get_candidates_batch: Callable[
            [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
        ],
        overwrite: bool = BACKWARD_OVERWRITE,
        scorer: Optional[Callable] = entity_linker_score,
        use_gold_ents: bool,
        candidates_batch_size: int,
        threshold: Optional[float] = None,
    ) -> None:
        """Initialize an entity linker.

@ -178,10 +194,14 @@ class EntityLinker(TrainablePipe):
        entity_vector_length (int): Size of encoding vectors in the KB.
        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
            produces a list of candidates, given a certain knowledge base and a textual mention.
        get_candidates_batch (
            Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
            Iterable[Candidate]]
        ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
        scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
        use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
            component must provide entity annotations.
        candidates_batch_size (int): Size of batches for entity candidate generation.
        threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
            threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
        DOCS: https://spacy.io/api/entitylinker#init

@ -204,22 +224,27 @@ class EntityLinker(TrainablePipe):
        self.incl_prior = incl_prior
        self.incl_context = incl_context
        self.get_candidates = get_candidates
        self.get_candidates_batch = get_candidates_batch
        self.cfg: Dict[str, Any] = {"overwrite": overwrite}
        self.distance = CosineDistance(normalize=False)
        # how many neighbour sentences to take into account
        # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
        # create an empty KB by default
        self.kb = empty_kb(entity_vector_length)(self.vocab)
        self.scorer = scorer
        self.use_gold_ents = use_gold_ents
        self.candidates_batch_size = candidates_batch_size
        self.threshold = threshold

        if candidates_batch_size < 1:
            raise ValueError(Errors.E1044)

    def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
        """Define the KB of this pipe by providing a function that will
        create it using this object's vocab."""
        if not callable(kb_loader):
            raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))

        self.kb = kb_loader(self.vocab)  # type: ignore

    def validate_kb(self) -> None:
        # Raise an error if the knowledge base is not initialized.

@ -241,8 +266,8 @@ class EntityLinker(TrainablePipe):
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab
            instance. Note that providing this argument will overwrite all data accumulated in the current KB.
            Use this only when loading a KB as-such from file.

        DOCS: https://spacy.io/api/entitylinker#initialize
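Note: a KB created by spacy.EmptyKB.v1 is empty; a populated one is usually supplied through set_kb or initialize. A hedged sketch using the new InMemoryLookupKB directly; the entity ID, alias and vector are toy values:

import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe("entity_linker")


def create_kb(vocab):
    # entity_vector_length must match the component's setting (64 by default).
    kb = InMemoryLookupKB(vocab, entity_vector_length=64)
    kb.add_entity(entity="Q42", freq=10, entity_vector=[0.0] * 64)
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])
    return kb


entity_linker.set_kb(create_kb)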
@ -419,66 +444,93 @@ class EntityLinker(TrainablePipe):
            if len(doc) == 0:
                continue
            sentences = [s for s in doc.sents]

            # Loop over entities in batches.
            for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
                ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]

                # Look up candidate entities.
                valid_ent_idx = [
                    idx
                    for idx in range(len(ent_batch))
                    if ent_batch[idx].label_ not in self.labels_discard
                ]

                batch_candidates = list(
                    self.get_candidates_batch(
                        self.kb, [ent_batch[idx] for idx in valid_ent_idx]
                    )
                    if self.candidates_batch_size > 1
                    else [
                        self.get_candidates(self.kb, ent_batch[idx])
                        for idx in valid_ent_idx
                    ]
                )

                # Looping through each entity in batch (TODO: rewrite)
                for j, ent in enumerate(ent_batch):
                    sent_index = sentences.index(ent.sent)
                    assert sent_index >= 0

                    if self.incl_context:
                        # get n_neighbour sentences, clipped to the length of the document
                        start_sentence = max(0, sent_index - self.n_sents)
                        end_sentence = min(
                            len(sentences) - 1, sent_index + self.n_sents
                        )
                        start_token = sentences[start_sentence].start
                        end_token = sentences[end_sentence].end
                        sent_doc = doc[start_token:end_token].as_doc()
                        # currently, the context is the same for each entity in a sentence (should be refined)
                        sentence_encoding = self.model.predict([sent_doc])[0]
                        sentence_encoding_t = sentence_encoding.T
                        sentence_norm = xp.linalg.norm(sentence_encoding_t)
                    entity_count += 1
                    if ent.label_ in self.labels_discard:
                        # ignoring this entity - setting to NIL
                        final_kb_ids.append(self.NIL)
                    else:
                        candidates = list(batch_candidates[j])
                        if not candidates:
                            # no prediction possible for this entity - setting to NIL
                            final_kb_ids.append(self.NIL)
                        elif len(candidates) == 1 and self.threshold is None:
                            # shortcut for efficiency reasons: take the 1 candidate
                            final_kb_ids.append(candidates[0].entity_)
                        else:
                            random.shuffle(candidates)
                            # set all prior probabilities to 0 if incl_prior=False
                            prior_probs = xp.asarray([c.prior_prob for c in candidates])
                            if not self.incl_prior:
                                prior_probs = xp.asarray([0.0 for _ in candidates])
                            scores = prior_probs
                            # add in similarity from the context
                            if self.incl_context:
                                entity_encodings = xp.asarray(
                                    [c.entity_vector for c in candidates]
                                )
                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
                                if len(entity_encodings) != len(prior_probs):
                                    raise RuntimeError(
                                        Errors.E147.format(
                                            method="predict",
                                            msg="vectors not of equal length",
                                        )
                                    )
                                # cosine similarity
                                sims = xp.dot(entity_encodings, sentence_encoding_t) / (
                                    sentence_norm * entity_norm
                                )
                                if sims.shape != prior_probs.shape:
                                    raise ValueError(Errors.E161)
                                scores = prior_probs + sims - (prior_probs * sims)
                            final_kb_ids.append(
                                candidates[scores.argmax().item()].entity_
                                if self.threshold is None
                                or scores.max() >= self.threshold
                                else EntityLinker.NIL
                            )

        if not (len(final_kb_ids) == entity_count):
            err = Errors.E147.format(
                method="predict", msg="result variables not of equal length"
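Note: the candidate score combines the prior probability and the context similarity as a probabilistic OR, scores = prior_probs + sims - prior_probs * sims. A small numeric check of that formula with arbitrary numbers:

import numpy as np

prior_probs = np.asarray([0.1, 0.6])
sims = np.asarray([0.9, 0.2])
scores = prior_probs + sims - (prior_probs * sims)
print(scores)  # [0.91 0.68] - a strong context similarity can rescue a low prior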
@ -68,8 +68,7 @@ class EntityLinker_v1(TrainablePipe):
        entity_vector_length (int): Size of encoding vectors in the KB.
        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
            produces a list of candidates, given a certain knowledge base and a textual mention.
        scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
        DOCS: https://spacy.io/api/entitylinker#init
        """
        self.vocab = vocab

@ -115,7 +114,7 @@ class EntityLinker_v1(TrainablePipe):
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
            Note that providing this argument, will overwrite all data accumulated in the current KB.
            Use this only when loading a KB as-such from file.
@ -26,17 +26,17 @@ scorer = {"@layers": "spacy.LinearLogistic.v1"}
hidden_size = 128

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"

[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
rows = [5000, 2000, 1000, 1000]
attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3

@ -133,6 +133,9 @@ def make_spancat(
    spans_key (str): Key of the doc.spans dict to save the spans under. During
        initialization and training, the component will look for spans on the
        reference document under the same key.
    scorer (Optional[Callable]): The scoring method. Defaults to
        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
        spans allowed.
    threshold (float): Minimum probability to consider a prediction positive.
        Spans with a positive prediction will be saved on the Doc. Defaults to
        0.5.
@ -19,7 +19,7 @@ multi_label_default_config = """
@architectures = "spacy.TextCatEnsemble.v2"

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"

[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"

@ -29,7 +29,7 @@ attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3

@ -96,8 +96,8 @@ def make_multilabel_textcat(
    model: Model[List[Doc], List[Floats2d]],
    threshold: float,
    scorer: Optional[Callable],
) -> "TextCategorizer":
) -> "MultiLabel_TextCategorizer":
    """Create a TextCategorizer component. The text categorizer predicts categories
    """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
    over a whole document. It can learn one or more labels, and the labels are considered
    to be non-mutually exclusive, which means that there can be zero or more labels
    per doc).

@ -105,6 +105,7 @@ def make_multilabel_textcat(
    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
        scores for each category.
    threshold (float): Cutoff to consider a prediction "positive".
    scorer (Optional[Callable]): The scoring method.
    """
    return MultiLabel_TextCategorizer(
        nlp.vocab, model, name, threshold=threshold, scorer=scorer

@ -147,6 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
    name (str): The component instance name, used to add entries to the
        losses during training.
    threshold (float): Cutoff to consider a prediction "positive".
    scorer (Optional[Callable]): The scoring method.

    DOCS: https://spacy.io/api/textcategorizer#init
    """
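Note: the threshold and scorer arguments are plumbed through make_multilabel_textcat into the component config. A hedged usage sketch with made-up labels and a single toy training example:

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat_multilabel", config={"threshold": 0.5})
doc = nlp.make_doc("This is a test.")
example = Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "URGENT": 0.0}})
nlp.initialize(lambda: [example])
# Each label gets an independent score; values from an untrained model are arbitrary.
print(nlp("Another test.").cats)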
@ -123,9 +123,6 @@ class Tok2Vec(TrainablePipe):
            width = self.model.get_dim("nO")
            return [self.model.ops.alloc((0, width)) for doc in docs]
        tokvecs = self.model.predict(docs)
        batch_id = Tok2VecListener.get_batch_id(docs)
        for listener in self.listeners:
            listener.receive(batch_id, tokvecs, _empty_backprop)
        return tokvecs

    def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:

@ -286,8 +283,19 @@ class Tok2VecListener(Model):
def forward(model: Tok2VecListener, inputs, is_train: bool):
    """Supply the outputs from the upstream Tok2Vec component."""
    if is_train:
        # This might occur during training when the tok2vec layer is frozen / hasn't been updated.
        # In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc.
        if model._batch_id is None:
            outputs = []
            for doc in inputs:
                if doc.tensor.size == 0:
                    raise ValueError(Errors.E203.format(name="tok2vec"))
                else:
                    outputs.append(doc.tensor)
            return outputs, _empty_backprop
        else:
            model.verify_inputs(inputs)
            return model._outputs, model._backprop
    else:
        # This is pretty grim, but it's hard to do better :(.
        # It's hard to avoid relying on the doc.tensor attribute, because the

@ -306,7 +314,7 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
            outputs.append(model.ops.alloc2f(len(doc), width))
        else:
            outputs.append(doc.tensor)
    return outputs, lambda dX: []
    return outputs, _empty_backprop


def _empty_backprop(dX):  # for pickling
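Note: the listener fallback above relies on the upstream Tok2Vec component having written its output to doc.tensor. A hedged sketch showing where those embeddings end up after the component runs:

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")
doc = nlp.make_doc("The embeddings end up on the doc.")
nlp.initialize(lambda: [Example(doc, doc)])

doc = nlp("The embeddings end up on the doc.")
# One row per token; a listener with no live batch can read these back from doc.tensor.
print(doc.tensor.shape)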
@@ -181,12 +181,12 @@ class TokenPatternNumber(BaseModel):
     IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
-    EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
-    NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
-    GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
-    LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
-    GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
-    LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
+    EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
+    NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
+    GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
+    LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
+    GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
+    LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")

     class Config:
         extra = "forbid"

@@ -430,7 +430,7 @@ class ProjectConfigAssetURL(BaseModel):
     # fmt: off
     dest: StrictStr = Field(..., title="Destination of downloaded asset")
     url: Optional[StrictStr] = Field(None, title="URL of asset")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: StrictStr = Field("", title="Description of asset")
     # fmt: on

@@ -438,7 +438,7 @@ class ProjectConfigAssetURL(BaseModel):
 class ProjectConfigAssetGit(BaseModel):
     # fmt: off
     git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: Optional[StrictStr] = Field(None, title="Description of asset")
     # fmt: on
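These numeric fields back the validation of `Matcher` token patterns. For reference, a small sketch of the kind of pattern they describe (the attribute and threshold are illustrative):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# LENGTH with a ">=" comparison is validated by TokenPatternNumber.
matcher.add("LONG_WORD", [[{"LENGTH": {">=": 10}}]])

doc = nlp("A remarkably serendipitous encounter")
print([doc[start:end].text for _, start, end in matcher(doc)])
```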
@@ -508,9 +508,9 @@ class DocJSONSchema(BaseModel):
         None, title="Indices of sentences' start and end indices"
     )
     text: StrictStr = Field(..., title="Document text")
-    spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
-        None, title="Span information - end/start indices, label, KB ID"
-    )
+    spans: Optional[
+        Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
+    ] = Field(None, title="Span information - end/start indices, label, KB ID")
     tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
         ..., title="Token information - ID, start, annotations"
     )

@@ -519,9 +519,9 @@ class DocJSONSchema(BaseModel):
         title="Any custom data stored in the document's _ attribute",
         alias="_",
     )
-    underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
+    underscore_token: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
         None, title="Any custom data stored in the token's _ attribute"
     )
-    underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
+    underscore_span: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
         None, title="Any custom data stored in the span's _ attribute"
     )
@@ -333,16 +333,24 @@ def ro_tokenizer():

 @pytest.fixture(scope="session")
 def ru_tokenizer():
-    pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy3")
     return get_lang_class("ru")().tokenizer


 @pytest.fixture
 def ru_lemmatizer():
-    pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy3")
     return get_lang_class("ru")().add_pipe("lemmatizer")


+@pytest.fixture
+def ru_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    return get_lang_class("ru")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def sa_tokenizer():
     return get_lang_class("sa")().tokenizer

@@ -411,15 +419,24 @@ def ky_tokenizer():

 @pytest.fixture(scope="session")
 def uk_tokenizer():
-    pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy3")
     return get_lang_class("uk")().tokenizer


 @pytest.fixture
 def uk_lemmatizer():
+    pytest.importorskip("pymorphy3")
+    pytest.importorskip("pymorphy3_dicts_uk")
+    return get_lang_class("uk")().add_pipe("lemmatizer")
+
+
+@pytest.fixture
+def uk_lookup_lemmatizer():
     pytest.importorskip("pymorphy2")
     pytest.importorskip("pymorphy2_dicts_uk")
-    return get_lang_class("uk")().add_pipe("lemmatizer")
+    return get_lang_class("uk")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )


 @pytest.fixture(scope="session")
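Outside the test suite, the same lookup-mode lemmatizer can be added directly to a pipeline. A minimal sketch, assuming the `pymorphy2` and `pymorphy2-dicts-uk` packages are installed (as the fixtures above require):

```python
import spacy

nlp = spacy.blank("uk")
# Same configuration as the uk_lookup_lemmatizer fixture above.
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})
doc = nlp("проект")
print([token.lemma_ for token in doc])
```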
@@ -128,7 +128,9 @@ def test_doc_to_json_with_token_span_attributes(doc):
     doc._.json_test1 = "hello world"
     doc._.json_test2 = [1, 2, 3]
     doc[0:1]._.span_test = "span_attribute"
+    doc[0:2]._.span_test = "span_attribute_2"
     doc[0]._.token_test = 117
+    doc[1]._.token_test = 118
     doc.spans["span_group"] = [doc[0:1]]
     json_doc = doc.to_json(
         underscore=["json_test1", "json_test2", "token_test", "span_test"]

@@ -139,8 +141,10 @@ def test_doc_to_json_with_token_span_attributes(doc):
     assert json_doc["_"]["json_test2"] == [1, 2, 3]
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["token_test"]["value"] == 117
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
+    assert json_doc["underscore_token"]["token_test"][1]["value"] == 118
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
+    assert json_doc["underscore_span"]["span_test"][1]["value"] == "span_attribute_2"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc

@@ -161,8 +165,8 @@ def test_doc_to_json_with_custom_user_data(doc):
     assert json_doc["_"]["json_test"] == "hello world"
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["token_test"]["value"] == 117
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc

@@ -181,8 +185,8 @@ def test_doc_to_json_with_token_span_same_identifier(doc):
     assert json_doc["_"]["my_ext"] == "hello world"
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["my_ext"]["value"] == 117
-    assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["my_ext"][0]["value"] == 117
+    assert json_doc["underscore_span"]["my_ext"][0]["value"] == "span_attribute"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc

@@ -195,10 +199,9 @@ def test_doc_to_json_with_token_attributes_missing(doc):
     doc[0]._.token_test = 117
     json_doc = doc.to_json(underscore=["span_test"])

-    assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
-    assert "token_test" not in json_doc["underscore_token"]
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
+    assert "underscore_token" not in json_doc
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0


@@ -283,7 +286,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
     doc._.json_test1 = "hello world"
     doc._.json_test2 = [1, 2, 3]
     doc[0:1]._.span_test = "span_attribute"
+    doc[0:2]._.span_test = "span_attribute_2"
     doc[0]._.token_test = 117
+    doc[1]._.token_test = 118

     json_doc = doc.to_json(
         underscore=["json_test1", "json_test2", "token_test", "span_test"]

@@ -295,7 +300,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
     assert new_doc._.json_test1 == "hello world"
     assert new_doc._.json_test2 == [1, 2, 3]
     assert new_doc[0]._.token_test == 117
+    assert new_doc[1]._.token_test == 118
     assert new_doc[0:1]._.span_test == "span_attribute"
+    assert new_doc[0:2]._.span_test == "span_attribute_2"
     assert new_doc.user_data == doc.user_data
     assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
         exclude=["user_data"]
spacy/tests/lang/grc/test_tokenizer.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+import pytest
+
+
+# fmt: off
+GRC_TOKEN_EXCEPTION_TESTS = [
+    ("τὸ 〈τῆς〉 φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ ⟦βαρβάρων⟧ ἄρξαι.", ["τὸ", "〈", "τῆς", "〉", "φιλοσοφίας", "ἔργον", "ἔνιοί", "φασιν", "ἀπὸ", "⟦", "βαρβάρων", "⟧", "ἄρξαι", "."]),
+    ("τὴν δὲ τῶν Αἰγυπτίων φιλοσοφίαν εἶναι τοιαύτην περί τε †θεῶν† καὶ ὑπὲρ δικαιοσύνης.", ["τὴν", "δὲ", "τῶν", "Αἰγυπτίων", "φιλοσοφίαν", "εἶναι", "τοιαύτην", "περί", "τε", "†", "θεῶν", "†", "καὶ", "ὑπὲρ", "δικαιοσύνης", "."]),
+    ("⸏πόσις δ' Ἐρεχθεύς ἐστί μοι σεσωσμένος⸏", ["⸏", "πόσις", "δ'", "Ἐρεχθεύς", "ἐστί", "μοι", "σεσωσμένος", "⸏"]),
+    ("⸏ὔπνον ἴδωμεν⸎", ["⸏", "ὔπνον", "ἴδωμεν", "⸎"]),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", GRC_TOKEN_EXCEPTION_TESTS)
+def test_grc_tokenizer(grc_tokenizer, text, expected_tokens):
+    tokens = grc_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
@@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
     doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
+
+
+def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
+    words = ["мама", "мыла", "раму"]
+    pos = ["NOUN", "VERB", "NOUN"]
+    morphs = [
+        "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
+        "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
+        "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
+    ]
+    doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
+    doc = ru_lookup_lemmatizer(doc)
+    lemmas = [token.lemma_ for token in doc]
+    assert lemmas == ["мама", "мыла", "раму"]
@@ -20,7 +20,6 @@ od katerih so te svoboščine odvisne,
     assert len(tokens) == 116


-@pytest.mark.xfail
 def test_ordinal_number(sl_tokenizer):
     text = "10. decembra 1948"
     tokens = sl_tokenizer(text)
@@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
     uk_lemmatizer(doc)
+    assert [token.lemma for token in doc]
+
+
+def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
+    """Check that the lookup uk lemmatizer runs."""
+    doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
+    uk_lookup_lemmatizer(doc)
+    assert [token.lemma for token in doc]
@@ -6,7 +6,7 @@ from numpy.testing import assert_equal
 from spacy import registry, util
 from spacy.attrs import ENT_KB_ID
 from spacy.compat import pickle
-from spacy.kb import Candidate, KnowledgeBase, get_candidates
+from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
 from spacy.pipeline import EntityLinker

@@ -34,7 +34,7 @@ def assert_almost_equal(a, b):
 def test_issue4674():
     """Test that setting entities with overlapping identifiers does not mess up IO"""
     nlp = English()
-    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+    kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
     vector1 = [0.9, 1.1, 1.01]
     vector2 = [1.8, 2.25, 2.01]
     with pytest.warns(UserWarning):

@@ -51,7 +51,7 @@ def test_issue4674():
         dir_path.mkdir()
         file_path = dir_path / "kb"
         kb.to_disk(str(file_path))
-        kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+        kb2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
         kb2.from_disk(str(file_path))
         assert kb2.get_size_entities() == 1

@@ -59,9 +59,9 @@ def test_issue4674():
 @pytest.mark.issue(6730)
 def test_issue6730(en_vocab):
     """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
-    from spacy.kb import KnowledgeBase
+    from spacy.kb.kb_in_memory import InMemoryLookupKB

-    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
+    kb = InMemoryLookupKB(en_vocab, entity_vector_length=3)
     kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

     with pytest.raises(ValueError):

@@ -127,7 +127,7 @@ def test_issue7065_b():

     def create_kb(vocab):
         # create artificial KB
-        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
         mykb.add_alias(
             alias="No. 8",

@@ -190,7 +190,7 @@ def test_no_entities():

     def create_kb(vocab):
         # create artificial KB
-        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
         return mykb

@@ -231,7 +231,7 @@ def test_partial_links():

     def create_kb(vocab):
         # create artificial KB
-        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
         return mykb

@@ -263,7 +263,7 @@ def test_partial_links():

 def test_kb_valid_entities(nlp):
     """Test the valid construction of a KB with 3 entities and two aliases"""
-    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)

     # adding entities
     mykb.add_entity(entity="Q1", freq=19, entity_vector=[8, 4, 3])

@@ -292,7 +292,7 @@ def test_kb_valid_entities(nlp):

 def test_kb_invalid_entities(nlp):
     """Test the invalid construction of a KB with an alias linked to a non-existing entity"""
-    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)

     # adding entities
     mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])

@@ -308,7 +308,7 @@ def test_kb_invalid_entities(nlp):

 def test_kb_invalid_probabilities(nlp):
     """Test the invalid construction of a KB with wrong prior probabilities"""
-    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)

     # adding entities
     mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])

@@ -322,7 +322,7 @@ def test_kb_invalid_probabilities(nlp):

 def test_kb_invalid_combination(nlp):
     """Test the invalid construction of a KB with non-matching entity and probability lists"""
-    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)

     # adding entities
     mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])

@@ -338,7 +338,7 @@ def test_kb_invalid_combination(nlp):

 def test_kb_invalid_entity_vector(nlp):
     """Test the invalid construction of a KB with non-matching entity vector lengths"""
-    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)

     # adding entities
     mykb.add_entity(entity="Q1", freq=19, entity_vector=[1, 2, 3])

@@ -376,7 +376,7 @@ def test_kb_initialize_empty(nlp):

 def test_kb_serialize(nlp):
     """Test serialization of the KB"""
-    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
     with make_tempdir() as d:
         # normal read-write behaviour
         mykb.to_disk(d / "kb")

@@ -393,12 +393,12 @@ def test_kb_serialize(nlp):
 @pytest.mark.issue(9137)
 def test_kb_serialize_2(nlp):
     v = [5, 6, 7, 8]
-    kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+    kb1 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
     kb1.set_entities(["E1"], [1], [v])
     assert kb1.get_vector("E1") == v
     with make_tempdir() as d:
         kb1.to_disk(d / "kb")
-        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+        kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
         kb2.from_disk(d / "kb")
         assert kb2.get_vector("E1") == v

@@ -408,7 +408,7 @@ def test_kb_set_entities(nlp):
     v = [5, 6, 7, 8]
     v1 = [1, 1, 1, 0]
     v2 = [2, 2, 2, 3]
-    kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+    kb1 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
     kb1.set_entities(["E0"], [1], [v])
     assert kb1.get_entity_strings() == ["E0"]
     kb1.set_entities(["E1", "E2"], [1, 9], [v1, v2])

@@ -417,7 +417,7 @@ def test_kb_set_entities(nlp):
     assert kb1.get_vector("E2") == v2
     with make_tempdir() as d:
         kb1.to_disk(d / "kb")
-        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+        kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
         kb2.from_disk(d / "kb")
         assert set(kb2.get_entity_strings()) == {"E1", "E2"}
         assert kb2.get_vector("E1") == v1

@@ -428,7 +428,7 @@ def test_kb_serialize_vocab(nlp):
     """Test serialization of the KB and custom strings"""
     entity = "MyFunnyID"
     assert entity not in nlp.vocab.strings
-    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
     assert not mykb.contains_entity(entity)
     mykb.add_entity(entity, freq=342, entity_vector=[3])
     assert mykb.contains_entity(entity)

@@ -436,14 +436,14 @@ def test_kb_serialize_vocab(nlp):
     with make_tempdir() as d:
         # normal read-write behaviour
         mykb.to_disk(d / "kb")
-        mykb_new = KnowledgeBase(Vocab(), entity_vector_length=1)
+        mykb_new = InMemoryLookupKB(Vocab(), entity_vector_length=1)
         mykb_new.from_disk(d / "kb")
         assert entity in mykb_new.vocab.strings


 def test_candidate_generation(nlp):
     """Test correct candidate generation"""
-    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
     doc = nlp("douglas adam Adam shrubbery")

     douglas_ent = doc[0:1]

@@ -481,7 +481,7 @@ def test_el_pipe_configuration(nlp):
     ruler.add_patterns([pattern])

     def create_kb(vocab):
-        kb = KnowledgeBase(vocab, entity_vector_length=1)
+        kb = InMemoryLookupKB(vocab, entity_vector_length=1)
         kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
         kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
         kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])

@@ -500,10 +500,21 @@ def test_el_pipe_configuration(nlp):
     def get_lowercased_candidates(kb, span):
         return kb.get_alias_candidates(span.text.lower())

+    def get_lowercased_candidates_batch(kb, spans):
+        return [get_lowercased_candidates(kb, span) for span in spans]
+
     @registry.misc("spacy.LowercaseCandidateGenerator.v1")
-    def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
+    def create_candidates() -> Callable[
+        [InMemoryLookupKB, "Span"], Iterable[Candidate]
+    ]:
         return get_lowercased_candidates

+    @registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
+    def create_candidates_batch() -> Callable[
+        [InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]
+    ]:
+        return get_lowercased_candidates_batch
+
     # replace the pipe with a new one with with a different candidate generator
     entity_linker = nlp.replace_pipe(
         "entity_linker",

@@ -511,6 +522,9 @@ def test_el_pipe_configuration(nlp):
         config={
             "incl_context": False,
             "get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
+            "get_candidates_batch": {
+                "@misc": "spacy.LowercaseCandidateBatchGenerator.v1"
+            },
         },
     )
     entity_linker.set_kb(create_kb)

@@ -532,7 +546,7 @@ def test_nel_nsents(nlp):

 def test_vocab_serialization(nlp):
     """Test that string information is retained across storage"""
-    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)

     # adding entities
     mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])

@@ -552,7 +566,7 @@ def test_vocab_serialization(nlp):

     with make_tempdir() as d:
         mykb.to_disk(d / "kb")
-        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
+        kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
         kb_new_vocab.from_disk(d / "kb")

         candidates = kb_new_vocab.get_alias_candidates("adam")

@@ -568,7 +582,7 @@ def test_vocab_serialization(nlp):

 def test_append_alias(nlp):
     """Test that we can append additional alias-entity pairs"""
-    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)

     # adding entities
     mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])

@@ -599,7 +613,7 @@ def test_append_alias(nlp):
 @pytest.mark.filterwarnings("ignore:\\[W036")
 def test_append_invalid_alias(nlp):
     """Test that append an alias will throw an error if prior probs are exceeding 1"""
-    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)

     # adding entities
     mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])

@@ -621,7 +635,7 @@ def test_preserving_links_asdoc(nlp):
     vector_length = 1

     def create_kb(vocab):
-        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
         # adding entities
         mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
         mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])

@@ -723,7 +737,7 @@ def test_overfitting_IO():
        # create artificial KB - assign same prior weight to the two russ cochran's
        # Q2146908 (Russ Cochran): American golfer
        # Q7381115 (Russ Cochran): publisher
-        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
         mykb.add_alias(

@@ -805,7 +819,7 @@ def test_kb_serialization():
         kb_dir = tmp_dir / "kb"
         nlp1 = English()
         assert "Q2146908" not in nlp1.vocab.strings
-        mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(nlp1.vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
         assert "Q2146908" in nlp1.vocab.strings

@@ -828,7 +842,7 @@ def test_kb_serialization():
 def test_kb_pickle():
     # Test that the KB can be pickled
     nlp = English()
-    kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+    kb_1 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
     kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
     assert not kb_1.contains_alias("Russ Cochran")
     kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])

@@ -842,7 +856,7 @@ def test_kb_pickle():
 def test_nel_pickle():
     # Test that a pipeline with an EL component can be pickled
     def create_kb(vocab):
-        kb = KnowledgeBase(vocab, entity_vector_length=3)
+        kb = InMemoryLookupKB(vocab, entity_vector_length=3)
         kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
         return kb

@@ -864,7 +878,7 @@ def test_nel_pickle():
 def test_kb_to_bytes():
     # Test that the KB's to_bytes method works correctly
     nlp = English()
-    kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+    kb_1 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
     kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
     kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
     kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])

@@ -874,7 +888,7 @@ def test_kb_to_bytes():
     )
     assert kb_1.contains_alias("Russ Cochran")
     kb_bytes = kb_1.to_bytes()
-    kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+    kb_2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
     assert not kb_2.contains_alias("Russ Cochran")
     kb_2 = kb_2.from_bytes(kb_bytes)
     # check that both KBs are exactly the same

@@ -897,7 +911,7 @@ def test_kb_to_bytes():
 def test_nel_to_bytes():
     # Test that a pipeline with an EL component can be converted to bytes
     def create_kb(vocab):
-        kb = KnowledgeBase(vocab, entity_vector_length=3)
+        kb = InMemoryLookupKB(vocab, entity_vector_length=3)
         kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
         return kb

@@ -987,7 +1001,7 @@ def test_legacy_architectures(name, config):
         train_examples.append(Example.from_dict(doc, annotation))

     def create_kb(vocab):
-        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
         mykb.add_alias(

@@ -1054,7 +1068,7 @@ def test_no_gold_ents(patterns):

     def create_kb(vocab):
         # create artificial KB
-        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
         mykb.add_alias("Kirby", ["Q613241"], [0.9])
         # Placeholder

@@ -1104,7 +1118,7 @@ def test_tokenization_mismatch():

     def create_kb(vocab):
         # create placeholder KB
-        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
         mykb.add_alias("Kirby", ["Q613241"], [0.9])
         return mykb

@@ -1121,6 +1135,12 @@ def test_tokenization_mismatch():
     nlp.evaluate(train_examples)


+def test_abstract_kb_instantiation():
+    """Test whether instantiation of abstract KB base class fails."""
+    with pytest.raises(TypeError):
+        KnowledgeBase(None, 3)
+
+
 # fmt: off
 @pytest.mark.parametrize(
     "meet_threshold,config",

@@ -1151,7 +1171,7 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):

     def create_kb(vocab):
         # create artificial KB
-        mykb = KnowledgeBase(vocab, entity_vector_length=3)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=3)
         mykb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
         mykb.add_alias(
             alias="Mahler",
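The renamed `InMemoryLookupKB` is used exactly like the old `KnowledgeBase` throughout these tests. A small self-contained sketch of the pattern they exercise (entity IDs, frequencies and vectors are placeholders taken from the tests):

```python
from spacy.kb import InMemoryLookupKB
from spacy.lang.en import English

nlp = English()
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])

# Candidate lookup goes through the same alias API as before the rename.
print([c.entity_ for c in kb.get_alias_candidates("Russ Cochran")])
```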
@@ -230,6 +230,97 @@ def test_tok2vec_listener_callback():
     assert get_dX(Y) is not None


+def test_tok2vec_listener_overfitting():
+    """Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components"""
+    orig_config = Config().from_str(cfg_string)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses, annotates=["tok2vec"])
+    assert losses["tagger"] < 0.00001
+
+    # test the trained model
+    test_text = "I like blue eggs"
+    doc = nlp(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        doc2 = nlp2(test_text)
+        assert doc2[0].tag_ == "N"
+        assert doc2[1].tag_ == "V"
+        assert doc2[2].tag_ == "J"
+        assert doc2[3].tag_ == "N"
+
+
+def test_tok2vec_frozen_not_annotating():
+    """Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating"""
+    orig_config = Config().from_str(cfg_string)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    for i in range(2):
+        losses = {}
+        with pytest.raises(
+            ValueError, match=r"the tok2vec embedding layer is not updated"
+        ):
+            nlp.update(
+                train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"]
+            )
+
+
+def test_tok2vec_frozen_overfitting():
+    """Test that a pipeline with a frozen & annotating tok2vec can still overfit"""
+    orig_config = Config().from_str(cfg_string)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    for i in range(100):
+        losses = {}
+        nlp.update(
+            train_examples,
+            sgd=optimizer,
+            losses=losses,
+            exclude=["tok2vec"],
+            annotates=["tok2vec"],
+        )
+    assert losses["tagger"] < 0.0001
+
+    # test the trained model
+    test_text = "I like blue eggs"
+    doc = nlp(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        doc2 = nlp2(test_text)
+        assert doc2[0].tag_ == "N"
+        assert doc2[1].tag_ == "V"
+        assert doc2[2].tag_ == "J"
+        assert doc2[3].tag_ == "N"
+
+
 def test_replace_listeners():
     orig_config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
@@ -3,7 +3,7 @@ from unittest import TestCase
 import pytest
 import srsly
 from numpy import zeros
-from spacy.kb import KnowledgeBase, Writer
+from spacy.kb.kb_in_memory import InMemoryLookupKB, Writer
 from spacy.vectors import Vectors
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe

@@ -71,7 +71,7 @@ def entity_linker():
     nlp = Language()

     def create_kb(vocab):
-        kb = KnowledgeBase(vocab, entity_vector_length=1)
+        kb = InMemoryLookupKB(vocab, entity_vector_length=1)
         kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
         return kb

@@ -120,7 +120,7 @@ def test_writer_with_path_py35():

 def test_save_and_load_knowledge_base():
     nlp = Language()
-    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
     with make_tempdir() as d:
         path = d / "kb"
         try:

@@ -129,7 +129,7 @@ def test_save_and_load_knowledge_base():
             pytest.fail(str(e))

         try:
-            kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+            kb_loaded = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
             kb_loaded.from_disk(path)
         except Exception as e:
             pytest.fail(str(e))
@@ -2,7 +2,7 @@ from typing import Callable

 from spacy import util
 from spacy.util import ensure_path, registry, load_model_from_config
-from spacy.kb import KnowledgeBase
+from spacy.kb.kb_in_memory import InMemoryLookupKB
 from spacy.vocab import Vocab
 from thinc.api import Config

@@ -22,7 +22,7 @@ def test_serialize_kb_disk(en_vocab):
     dir_path.mkdir()
     file_path = dir_path / "kb"
     kb1.to_disk(str(file_path))
-    kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
+    kb2 = InMemoryLookupKB(vocab=en_vocab, entity_vector_length=3)
     kb2.from_disk(str(file_path))

     # final assertions

@@ -30,7 +30,7 @@ def test_serialize_kb_disk(en_vocab):


 def _get_dummy_kb(vocab):
-    kb = KnowledgeBase(vocab, entity_vector_length=3)
+    kb = InMemoryLookupKB(vocab, entity_vector_length=3)
     kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
     kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
     kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])

@@ -104,7 +104,7 @@ def test_serialize_subclassed_kb():
         custom_field = 666
     """

-    class SubKnowledgeBase(KnowledgeBase):
+    class SubInMemoryLookupKB(InMemoryLookupKB):
         def __init__(self, vocab, entity_vector_length, custom_field):
             super().__init__(vocab, entity_vector_length)
             self.custom_field = custom_field

@@ -112,9 +112,9 @@ def test_serialize_subclassed_kb():
     @registry.misc("spacy.CustomKB.v1")
     def custom_kb(
         entity_vector_length: int, custom_field: int
-    ) -> Callable[[Vocab], KnowledgeBase]:
+    ) -> Callable[[Vocab], InMemoryLookupKB]:
         def custom_kb_factory(vocab):
-            kb = SubKnowledgeBase(
+            kb = SubInMemoryLookupKB(
                 vocab=vocab,
                 entity_vector_length=entity_vector_length,
                 custom_field=custom_field,

@@ -129,7 +129,7 @@ def test_serialize_subclassed_kb():
     nlp.initialize()

     entity_linker = nlp.get_pipe("entity_linker")
-    assert type(entity_linker.kb) == SubKnowledgeBase
+    assert type(entity_linker.kb) == SubInMemoryLookupKB
     assert entity_linker.kb.entity_vector_length == 342
     assert entity_linker.kb.custom_field == 666

@@ -139,6 +139,6 @@ def test_serialize_subclassed_kb():
         nlp2 = util.load_model_from_path(tmp_dir)
         entity_linker2 = nlp2.get_pipe("entity_linker")
         # After IO, the KB is the standard one
-        assert type(entity_linker2.kb) == KnowledgeBase
+        assert type(entity_linker2.kb) == InMemoryLookupKB
         assert entity_linker2.kb.entity_vector_length == 342
         assert not hasattr(entity_linker2.kb, "custom_field")
@@ -42,7 +42,8 @@ class SpanGroups(UserDict):
     def copy(self, doc: Optional["Doc"] = None) -> "SpanGroups":
         if doc is None:
             doc = self._ensure_doc()
-        return SpanGroups(doc).from_bytes(self.to_bytes())
+        data_copy = ((k, v.copy(doc=doc)) for k, v in self.items())
+        return SpanGroups(doc, items=data_copy)

     def setdefault(self, key, default=None):
         if not isinstance(default, SpanGroup):
@@ -1609,24 +1609,20 @@ cdef class Doc:
                     Doc.set_extension(attr)
                 self._.set(attr, doc_json["_"][attr])

-        if doc_json.get("underscore_token", {}):
-            for token_attr in doc_json["underscore_token"]:
-                token_start = doc_json["underscore_token"][token_attr]["token_start"]
-                value = doc_json["underscore_token"][token_attr]["value"]
-
-                if not Token.has_extension(token_attr):
-                    Token.set_extension(token_attr)
-                self[token_start]._.set(token_attr, value)
+        for token_attr in doc_json.get("underscore_token", {}):
+            if not Token.has_extension(token_attr):
+                Token.set_extension(token_attr)
+            for token_data in doc_json["underscore_token"][token_attr]:
+                start = token_by_char(self.c, self.length, token_data["start"])
+                value = token_data["value"]
+                self[start]._.set(token_attr, value)

-        if doc_json.get("underscore_span", {}):
-            for span_attr in doc_json["underscore_span"]:
-                token_start = doc_json["underscore_span"][span_attr]["token_start"]
-                token_end = doc_json["underscore_span"][span_attr]["token_end"]
-                value = doc_json["underscore_span"][span_attr]["value"]
-
-                if not Span.has_extension(span_attr):
-                    Span.set_extension(span_attr)
-                self[token_start:token_end]._.set(span_attr, value)
+        for span_attr in doc_json.get("underscore_span", {}):
+            if not Span.has_extension(span_attr):
+                Span.set_extension(span_attr)
+            for span_data in doc_json["underscore_span"][span_attr]:
+                value = span_data["value"]
+                self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
         return self

     def to_json(self, underscore=None):

@@ -1674,30 +1670,34 @@ cdef class Doc:
         if underscore:
             user_keys = set()
             if self.user_data:
-                data["_"] = {}
-                data["underscore_token"] = {}
-                data["underscore_span"] = {}
-                for data_key in self.user_data:
+                for data_key, value in self.user_data.copy().items():
                     if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
                         attr = data_key[1]
                         start = data_key[2]
                         end = data_key[3]
                         if attr in underscore:
                             user_keys.add(attr)
-                            value = self.user_data[data_key]
                             if not srsly.is_json_serializable(value):
                                 raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
                             # Check if doc attribute
                             if start is None:
+                                if "_" not in data:
+                                    data["_"] = {}
                                 data["_"][attr] = value
                             # Check if token attribute
                             elif end is None:
+                                if "underscore_token" not in data:
+                                    data["underscore_token"] = {}
                                 if attr not in data["underscore_token"]:
-                                    data["underscore_token"][attr] = {"token_start": start, "value": value}
+                                    data["underscore_token"][attr] = []
+                                data["underscore_token"][attr].append({"start": start, "value": value})
                             # Else span attribute
                             else:
+                                if "underscore_span" not in data:
+                                    data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
-                                    data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
+                                    data["underscore_span"][attr] = []
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value})

             for attr in underscore:
                 if attr not in user_keys:
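With this change, each extension attribute in `underscore_token` / `underscore_span` now maps to a list of records keyed by character offsets, instead of a single dict that could only hold one value per attribute. Schematically (the attribute names and values below are illustrative, matching the tests earlier in this commit):

```python
# Sketch of the new to_json() output for custom attributes; "start"/"end"
# are character offsets, which from_json() maps back to tokens and spans.
doc_json_fragment = {
    "underscore_token": {
        "token_test": [
            {"start": 0, "value": 117},
            {"start": 2, "value": 118},
        ]
    },
    "underscore_span": {
        "span_test": [
            {"start": 0, "end": 1, "value": "span_attribute"},
        ]
    },
}
```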
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Iterable
+from typing import Any, Dict, Iterable, Optional
 from .doc import Doc
 from .span import Span

@@ -24,4 +24,4 @@ class SpanGroup:
     def __getitem__(self, i: int) -> Span: ...
     def to_bytes(self) -> bytes: ...
     def from_bytes(self, bytes_data: bytes) -> SpanGroup: ...
-    def copy(self) -> SpanGroup: ...
+    def copy(self, doc: Optional[Doc] = ...) -> SpanGroup: ...
@ -241,15 +241,18 @@ cdef class SpanGroup:
|
||||||
cdef void push_back(self, SpanC span) nogil:
|
cdef void push_back(self, SpanC span) nogil:
|
||||||
self.c.push_back(span)
|
self.c.push_back(span)
|
||||||
|
|
||||||
def copy(self) -> SpanGroup:
|
def copy(self, doc: Optional["Doc"] = None) -> SpanGroup:
|
||||||
"""Clones the span group.
|
"""Clones the span group.
|
||||||
|
|
||||||
|
doc (Doc): New reference document to which the copy is bound.
|
||||||
RETURNS (SpanGroup): A copy of the span group.
|
RETURNS (SpanGroup): A copy of the span group.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/spangroup#copy
|
DOCS: https://spacy.io/api/spangroup#copy
|
||||||
"""
|
"""
|
||||||
|
if doc is None:
|
||||||
|
doc = self.doc
|
||||||
return SpanGroup(
|
return SpanGroup(
|
||||||
self.doc,
|
doc,
|
||||||
name=self.name,
|
name=self.name,
|
||||||
attrs=deepcopy(self.attrs),
|
attrs=deepcopy(self.attrs),
|
||||||
spans=list(self),
|
spans=list(self),
|
||||||
|
|
|
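As a quick illustration of the new optional `doc` argument to `SpanGroup.copy` (a sketch, assuming a spaCy build that includes this change):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Their goi ng home")
doc.spans["errors"] = [doc[0:1], doc[1:3]]

# Bind the copied group to a different Doc instead of the original one.
other_doc = nlp("Their goi ng home")
copied = doc.spans["errors"].copy(doc=other_doc)
assert copied.doc is other_doc
```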
@ -1482,7 +1482,7 @@ You'll also need to add the assets you want to track with
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
|
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
|
||||||
```
|
```
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
@ -1499,6 +1499,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
|
||||||
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
|
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
|
||||||
| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
|
| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
|
||||||
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
|
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
|
||||||
|
| `--quiet`, `-q` | Print no output generated by DVC. ~~bool (flag)~~ |
|
||||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
|
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,8 @@ entities) to unique identifiers, grounding the named entities into the "real
|
||||||
world". It requires a `KnowledgeBase`, as well as a function to generate
|
world". It requires a `KnowledgeBase`, as well as a function to generate
|
||||||
plausible candidates from that `KnowledgeBase` given a certain textual mention,
|
plausible candidates from that `KnowledgeBase` given a certain textual mention,
|
||||||
and a machine learning model to pick the right candidate, given the local
|
and a machine learning model to pick the right candidate, given the local
|
||||||
context of the mention.
|
context of the mention. `EntityLinker` defaults to using the
|
||||||
|
[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
|
||||||
|
|
||||||
## Assigned Attributes {#assigned-attributes}
|
## Assigned Attributes {#assigned-attributes}
|
||||||
|
|
||||||
|
@ -170,7 +171,7 @@ with the current vocab.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> def create_kb(vocab):
|
> def create_kb(vocab):
|
||||||
> kb = KnowledgeBase(vocab, entity_vector_length=128)
|
> kb = InMemoryLookupKB(vocab, entity_vector_length=128)
|
||||||
> kb.add_entity(...)
|
> kb.add_entity(...)
|
||||||
> kb.add_alias(...)
|
> kb.add_alias(...)
|
||||||
> return kb
|
> return kb
|
||||||
|
|
|
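A hedged sketch of how a custom knowledge base is typically wired into the component via `EntityLinker.set_kb`; the entity IDs, frequencies and vector values below are made-up placeholders:

```python
from spacy.lang.en import English
from spacy.kb import InMemoryLookupKB  # importable from spacy.kb as of 3.5

def create_kb(vocab):
    kb = InMemoryLookupKB(vocab, entity_vector_length=64)
    kb.add_entity(entity="Q42", freq=32, entity_vector=[0.0] * 64)
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])
    return kb

nlp = English()
entity_linker = nlp.add_pipe("entity_linker")
entity_linker.set_kb(create_kb)
```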
@ -4,27 +4,45 @@ teaser:
|
||||||
A storage class for entities and aliases of a specific knowledge base
|
A storage class for entities and aliases of a specific knowledge base
|
||||||
(ontology)
|
(ontology)
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/kb.pyx
|
source: spacy/kb/kb.pyx
|
||||||
new: 2.2
|
new: 2.2
|
||||||
---
|
---
|
||||||
|
|
||||||
The `KnowledgeBase` object provides a method to generate
|
The `KnowledgeBase` object is an abstract class providing a method to generate
|
||||||
[`Candidate`](/api/kb/#candidate) objects, which are plausible external
|
[`Candidate`](/api/kb#candidate) objects, which are plausible external
|
||||||
identifiers given a certain textual mention. Each such `Candidate` holds
|
identifiers given a certain textual mention. Each such `Candidate` holds
|
||||||
information from the relevant KB entities, such as its frequency in text and
|
information from the relevant KB entities, such as its frequency in text and
|
||||||
possible aliases. Each entity in the knowledge base also has a pretrained entity
|
possible aliases. Each entity in the knowledge base also has a pretrained entity
|
||||||
vector of a fixed size.
|
vector of a fixed size.
|
||||||
|
|
||||||
|
Beyond that, `KnowledgeBase` classes have to implement a number of utility
|
||||||
|
functions called by the [`EntityLinker`](/api/entitylinker) component.
|
||||||
|
|
||||||
|
<Infobox variant="warning">
|
||||||
|
|
||||||
|
This class was not abstract up to spaCy version 3.5. The `KnowledgeBase`
|
||||||
|
implementation up to that point is available as `InMemoryLookupKB` from 3.5
|
||||||
|
onwards.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
|
## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
Create the knowledge base.
|
`KnowledgeBase` is an abstract class and cannot be instantiated. Its child
|
||||||
|
classes should call `__init__()` to set up some necessary attributes.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.kb import KnowledgeBase
|
> from spacy.kb import KnowledgeBase
|
||||||
|
> from spacy.vocab import Vocab
|
||||||
|
>
|
||||||
|
> class FullyImplementedKB(KnowledgeBase):
|
||||||
|
> def __init__(self, vocab: Vocab, entity_vector_length: int):
|
||||||
|
> super().__init__(vocab, entity_vector_length)
|
||||||
|
> ...
|
||||||
> vocab = nlp.vocab
|
> vocab = nlp.vocab
|
||||||
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
|
> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -40,133 +58,66 @@ The length of the fixed-size entity vectors in the knowledge base.
|
||||||
| ----------- | ------------------------------------------------ |
|
| ----------- | ------------------------------------------------ |
|
||||||
| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
|
| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
|
||||||
|
|
||||||
## KnowledgeBase.add_entity {#add_entity tag="method"}
|
## KnowledgeBase.get_candidates {#get_candidates tag="method"}
|
||||||
|
|
||||||
Add an entity to the knowledge base, specifying its corpus frequency and entity
|
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||||
vector, which should be of length
|
of type [`Candidate`](/api/kb#candidate).
|
||||||
[`entity_vector_length`](/api/kb#entity_vector_length).
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1)
|
> from spacy.lang.en import English
|
||||||
> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
|
> nlp = English()
|
||||||
|
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||||
|
> candidates = kb.get_candidates(doc[0:2])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------- | ---------------------------------------------------------- |
|
| ----------- | -------------------------------------------------------------------- |
|
||||||
| `entity` | The unique entity identifier. ~~str~~ |
|
| `mention` | The textual mention or alias. ~~Span~~ |
|
||||||
| `freq` | The frequency of the entity in a typical corpus. ~~float~~ |
|
| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
|
||||||
| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.set_entities {#set_entities tag="method"}
|
## KnowledgeBase.get_candidates_batch {#get_candidates_batch tag="method"}
|
||||||
|
|
||||||
Define the full list of entities in the knowledge base, specifying the corpus
|
Same as [`get_candidates()`](/api/kb#get_candidates), but for an arbitrary
|
||||||
frequency and entity vector for each entity.
|
number of mentions. The [`EntityLinker`](/api/entitylinker) component will call
|
||||||
|
`get_candidates_batch()` instead of `get_candidates()`, if the config parameter
|
||||||
|
`candidates_batch_size` is greater or equal than 1.
|
||||||
|
|
||||||
|
The default implementation of `get_candidates_batch()` executes
|
||||||
|
`get_candidates()` in a loop. We recommend implementing a more efficient way to
|
||||||
|
retrieve candidates for multiple mentions at once, if performance is of concern
|
||||||
|
to you.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
|
> from spacy.lang.en import English
|
||||||
|
> nlp = English()
|
||||||
|
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||||
|
> candidates = kb.get_candidates_batch((doc[0:2], doc[3:]))
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------- | ---------------------------------------------------------------- |
|
| ----------- | -------------------------------------------------------------------------------------------- |
|
||||||
| `entity_list` | List of unique entity identifiers. ~~Iterable[Union[str, int]]~~ |
|
| `mentions` | The textual mentions or aliases. ~~Iterable[Span]~~ |
|
||||||
| `freq_list` | List of entity frequencies. ~~Iterable[int]~~ |
|
| **RETURNS** | An iterable of iterables of relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
||||||
| `vector_list` | List of entity vectors. ~~Iterable[numpy.ndarray]~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.add_alias {#add_alias tag="method"}
|
|
||||||
|
|
||||||
Add an alias or mention to the knowledge base, specifying its potential KB
|
|
||||||
identifiers and their prior probabilities. The entity identifiers should refer
|
|
||||||
to entities previously added with [`add_entity`](/api/kb#add_entity) or
|
|
||||||
[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
|
|
||||||
should not exceed 1. Note that an empty string can not be used as alias.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| --------------- | --------------------------------------------------------------------------------- |
|
|
||||||
| `alias` | The textual mention or alias. Can not be the empty string. ~~str~~ |
|
|
||||||
| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
|
|
||||||
| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
|
|
||||||
|
|
||||||
Get the total number of entities in the knowledge base.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> total_entities = len(kb)
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | ----------------------------------------------------- |
|
|
||||||
| **RETURNS** | The number of entities in the knowledge base. ~~int~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.get_entity_strings {#get_entity_strings tag="method"}
|
|
||||||
|
|
||||||
Get a list of all entity IDs in the knowledge base.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> all_entities = kb.get_entity_strings()
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | --------------------------------------------------------- |
|
|
||||||
| **RETURNS** | The list of entities in the knowledge base. ~~List[str]~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
|
|
||||||
|
|
||||||
Get the total number of aliases in the knowledge base.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> total_aliases = kb.get_size_aliases()
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | ---------------------------------------------------- |
|
|
||||||
| **RETURNS** | The number of aliases in the knowledge base. ~~int~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
|
|
||||||
|
|
||||||
Get a list of all aliases in the knowledge base.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> all_aliases = kb.get_alias_strings()
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | -------------------------------------------------------- |
|
|
||||||
| **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.get_alias_candidates {#get_alias_candidates tag="method"}
|
## KnowledgeBase.get_alias_candidates {#get_alias_candidates tag="method"}
|
||||||
|
|
||||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
<Infobox variant="warning">
|
||||||
of type [`Candidate`](/api/kb/#candidate).
|
This method is _not_ available from spaCy 3.5 onwards.
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
> #### Example
|
From spaCy 3.5 on, `KnowledgeBase` is an abstract class (with
|
||||||
>
|
[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow
|
||||||
> ```python
|
more flexibility in customizing knowledge bases. Some of its methods were moved
|
||||||
> candidates = kb.get_alias_candidates("Douglas")
|
to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those
|
||||||
> ```
|
being `get_alias_candidates()`. This method is now available as
|
||||||
|
[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
|
||||||
| Name | Description |
|
Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
|
||||||
| ----------- | ------------------------------------------------------------- |
|
defaults to
|
||||||
| `alias` | The textual mention or alias. ~~str~~ |
|
[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
|
||||||
| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.get_vector {#get_vector tag="method"}
|
## KnowledgeBase.get_vector {#get_vector tag="method"}
|
||||||
|
|
||||||
|
@ -178,27 +129,30 @@ Given a certain entity ID, retrieve its pretrained entity vector.
|
||||||
> vector = kb.get_vector("Q42")
|
> vector = kb.get_vector("Q42")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ------------------------------------ |
|
| ----------- | -------------------------------------- |
|
||||||
| `entity` | The entity ID. ~~str~~ |
|
| `entity` | The entity ID. ~~str~~ |
|
||||||
| **RETURNS** | The entity vector. ~~numpy.ndarray~~ |
|
| **RETURNS** | The entity vector. ~~Iterable[float]~~ |
|
||||||
|
|
||||||
## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
|
## KnowledgeBase.get_vectors {#get_vectors tag="method"}
|
||||||
|
|
||||||
Given a certain entity ID and a certain textual mention, retrieve the prior
|
Same as [`get_vector()`](/api/kb#get_vector), but for an arbitrary number of
|
||||||
probability of the fact that the mention links to the entity ID.
|
entity IDs.
|
||||||
|
|
||||||
|
The default implementation of `get_vectors()` executes `get_vector()` in a loop.
|
||||||
|
We recommend implementing a more efficient way to retrieve vectors for multiple
|
||||||
|
entities at once, if performance is of concern to you.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> probability = kb.get_prior_prob("Q42", "Douglas")
|
> vectors = kb.get_vectors(("Q42", "Q3107329"))
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ------------------------------------------------------------------------- |
|
| ----------- | --------------------------------------------------------- |
|
||||||
| `entity` | The entity ID. ~~str~~ |
|
| `entities` | The entity IDs. ~~Iterable[str]~~ |
|
||||||
| `alias` | The textual mention or alias. ~~str~~ |
|
| **RETURNS** | The entity vectors. ~~Iterable[Iterable[numpy.ndarray]]~~ |
|
||||||
| **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.to_disk {#to_disk tag="method"}
|
## KnowledgeBase.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
|
@ -207,12 +161,13 @@ Save the current state of the knowledge base to a directory.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> kb.to_disk(loc)
|
> kb.to_disk(path)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `loc` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||||
|
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
|
||||||
|
|
||||||
## KnowledgeBase.from_disk {#from_disk tag="method"}
|
## KnowledgeBase.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -222,16 +177,16 @@ Restore the state of the knowledge base from a given directory. Note that the
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.kb import KnowledgeBase
|
|
||||||
> from spacy.vocab import Vocab
|
> from spacy.vocab import Vocab
|
||||||
> vocab = Vocab().from_disk("/path/to/vocab")
|
> vocab = Vocab().from_disk("/path/to/vocab")
|
||||||
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
|
> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
|
||||||
> kb.from_disk("/path/to/kb")
|
> kb.from_disk("/path/to/kb")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ----------------------------------------------------------------------------------------------- |
|
| ----------- | ----------------------------------------------------------------------------------------------- |
|
||||||
| `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
| `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||||
|
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
|
||||||
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
|
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
|
||||||
|
|
||||||
## Candidate {#candidate tag="class"}
|
## Candidate {#candidate tag="class"}
|
||||||
|
|
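To make the batching hook described in the `kb.md` changes above concrete, here is a hedged sketch of a subclass overriding `get_candidates_batch()`; a real implementation would presumably query its backend once for all mentions rather than looping:

```python
from typing import Iterable

from spacy.kb import Candidate, InMemoryLookupKB
from spacy.tokens import Span

class BatchedKB(InMemoryLookupKB):
    def get_candidates_batch(
        self, mentions: Iterable[Span]
    ) -> Iterable[Iterable[Candidate]]:
        # Placeholder: a production version might issue a single batched
        # query to an external index instead of one lookup per mention.
        return [self.get_candidates(mention) for mention in mentions]
```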
302
website/docs/api/kb_in_memory.md
Normal file
302
website/docs/api/kb_in_memory.md
Normal file
|
@ -0,0 +1,302 @@
|
||||||
|
---
|
||||||
|
title: InMemoryLookupKB
|
||||||
|
teaser:
|
||||||
|
The default implementation of the KnowledgeBase interface. Stores all
|
||||||
|
information in-memory.
|
||||||
|
tag: class
|
||||||
|
source: spacy/kb/kb_in_memory.pyx
|
||||||
|
new: 3.5
|
||||||
|
---
|
||||||
|
|
||||||
|
The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
|
||||||
|
implements all of its methods. It stores all KB data in-memory and generates
|
||||||
|
[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
|
||||||
|
entity names. It's highly optimized for both a low memory footprint and speed of
|
||||||
|
retrieval.
|
||||||
|
|
||||||
|
## InMemoryLookupKB.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
Create the knowledge base.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.kb import InMemoryLookupKB
|
||||||
|
> vocab = nlp.vocab
|
||||||
|
> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------------------- | ------------------------------------------------ |
|
||||||
|
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||||
|
| `entity_vector_length` | Length of the fixed-size entity vectors. ~~int~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.entity_vector_length {#entity_vector_length tag="property"}
|
||||||
|
|
||||||
|
The length of the fixed-size entity vectors in the knowledge base.
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------ |
|
||||||
|
| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.add_entity {#add_entity tag="method"}
|
||||||
|
|
||||||
|
Add an entity to the knowledge base, specifying its corpus frequency and entity
|
||||||
|
vector, which should be of length
|
||||||
|
[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1)
|
||||||
|
> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| --------------- | ---------------------------------------------------------- |
|
||||||
|
| `entity` | The unique entity identifier. ~~str~~ |
|
||||||
|
| `freq` | The frequency of the entity in a typical corpus. ~~float~~ |
|
||||||
|
| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.set_entities {#set_entities tag="method"}
|
||||||
|
|
||||||
|
Define the full list of entities in the knowledge base, specifying the corpus
|
||||||
|
frequency and entity vector for each entity.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------------- | ---------------------------------------------------------------- |
|
||||||
|
| `entity_list` | List of unique entity identifiers. ~~Iterable[Union[str, int]]~~ |
|
||||||
|
| `freq_list` | List of entity frequencies. ~~Iterable[int]~~ |
|
||||||
|
| `vector_list` | List of entity vectors. ~~Iterable[numpy.ndarray]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.add_alias {#add_alias tag="method"}
|
||||||
|
|
||||||
|
Add an alias or mention to the knowledge base, specifying its potential KB
|
||||||
|
identifiers and their prior probabilities. The entity identifiers should refer
|
||||||
|
to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
|
||||||
|
or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
|
||||||
|
probabilities should not exceed 1. Note that an empty string cannot be used as
|
||||||
|
an alias.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| --------------- | --------------------------------------------------------------------------------- |
|
||||||
|
| `alias` | The textual mention or alias. Cannot be the empty string. ~~str~~ |
|
||||||
|
| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
|
||||||
|
| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.\_\_len\_\_ {#len tag="method"}
|
||||||
|
|
||||||
|
Get the total number of entities in the knowledge base.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> total_entities = len(kb)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ----------------------------------------------------- |
|
||||||
|
| **RETURNS** | The number of entities in the knowledge base. ~~int~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_entity_strings {#get_entity_strings tag="method"}
|
||||||
|
|
||||||
|
Get a list of all entity IDs in the knowledge base.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> all_entities = kb.get_entity_strings()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | --------------------------------------------------------- |
|
||||||
|
| **RETURNS** | The list of entities in the knowledge base. ~~List[str]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_size_aliases {#get_size_aliases tag="method"}
|
||||||
|
|
||||||
|
Get the total number of aliases in the knowledge base.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> total_aliases = kb.get_size_aliases()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ---------------------------------------------------- |
|
||||||
|
| **RETURNS** | The number of aliases in the knowledge base. ~~int~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_alias_strings {#get_alias_strings tag="method"}
|
||||||
|
|
||||||
|
Get a list of all aliases in the knowledge base.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> all_aliases = kb.get_alias_strings()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | -------------------------------------------------------- |
|
||||||
|
| **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_candidates {#get_candidates tag="method"}
|
||||||
|
|
||||||
|
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||||
|
of type [`Candidate`](/api/kb#candidate). Wraps
|
||||||
|
[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.lang.en import English
|
||||||
|
> nlp = English()
|
||||||
|
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||||
|
> candidates = kb.get_candidates(doc[0:2])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | -------------------------------------------------------------------- |
|
||||||
|
| `mention` | The textual mention or alias. ~~Span~~ |
|
||||||
|
| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_candidates_batch {#get_candidates_batch tag="method"}
|
||||||
|
|
||||||
|
Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
|
||||||
|
arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
|
||||||
|
will call `get_candidates_batch()` instead of `get_candidates()`, if the config
|
||||||
|
parameter `candidates_batch_size` is greater than or equal to 1.
|
||||||
|
|
||||||
|
The default implementation of `get_candidates_batch()` executes
|
||||||
|
`get_candidates()` in a loop. We recommend implementing a more efficient way to
|
||||||
|
retrieve candidates for multiple mentions at once, if performance is of concern
|
||||||
|
to you.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.lang.en import English
|
||||||
|
> nlp = English()
|
||||||
|
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||||
|
> candidates = kb.get_candidates_batch((doc[0:2], doc[3:]))
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | -------------------------------------------------------------------------------------------- |
|
||||||
|
| `mentions` | The textual mentions or aliases. ~~Iterable[Span]~~ |
|
||||||
|
| **RETURNS** | An iterable of iterables of relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_alias_candidates {#get_alias_candidates tag="method"}
|
||||||
|
|
||||||
|
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||||
|
of type [`Candidate`](/api/kb#candidate).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> candidates = kb.get_alias_candidates("Douglas")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------- |
|
||||||
|
| `alias` | The textual mention or alias. ~~str~~ |
|
||||||
|
| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_vector {#get_vector tag="method"}
|
||||||
|
|
||||||
|
Given a certain entity ID, retrieve its pretrained entity vector.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> vector = kb.get_vector("Q42")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------ |
|
||||||
|
| `entity` | The entity ID. ~~str~~ |
|
||||||
|
| **RETURNS** | The entity vector. ~~numpy.ndarray~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_vectors {#get_vectors tag="method"}
|
||||||
|
|
||||||
|
Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
|
||||||
|
number of entity IDs.
|
||||||
|
|
||||||
|
The default implementation of `get_vectors()` executes `get_vector()` in a loop.
|
||||||
|
We recommend implementing a more efficient way to retrieve vectors for multiple
|
||||||
|
entities at once, if performance is of concern to you.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> vectors = kb.get_vectors(("Q42", "Q3107329"))
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | --------------------------------------------------------- |
|
||||||
|
| `entities` | The entity IDs. ~~Iterable[str]~~ |
|
||||||
|
| **RETURNS** | The entity vectors. ~~Iterable[Iterable[numpy.ndarray]]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_prior_prob {#get_prior_prob tag="method"}
|
||||||
|
|
||||||
|
Given a certain entity ID and a certain textual mention, retrieve the prior
|
||||||
|
probability of the fact that the mention links to the entity ID.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> probability = kb.get_prior_prob("Q42", "Douglas")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------------------- |
|
||||||
|
| `entity` | The entity ID. ~~str~~ |
|
||||||
|
| `alias` | The textual mention or alias. ~~str~~ |
|
||||||
|
| **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
|
Save the current state of the knowledge base to a directory.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> kb.to_disk(path)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
|
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||||
|
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
Restore the state of the knowledge base from a given directory. Note that the
|
||||||
|
[`Vocab`](/api/vocab) should also be the same as the one used to create the KB.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.vocab import Vocab
|
||||||
|
> vocab = Vocab().from_disk("/path/to/vocab")
|
||||||
|
> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
|
||||||
|
> kb.from_disk("/path/to/kb")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ----------------------------------------------------------------------------------------------- |
|
||||||
|
| `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||||
|
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
|
||||||
|
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
|
|
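A short end-to-end sketch tying together the methods documented in the new file (entity IDs, frequencies, vectors and the `/tmp/kb` path are illustrative placeholders):

```python
from spacy.kb import InMemoryLookupKB
from spacy.vocab import Vocab

vocab = Vocab()
kb = InMemoryLookupKB(vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=32, entity_vector=[1.0, 0.0, 0.0])
kb.add_entity(entity="Q463035", freq=111, entity_vector=[0.0, 1.0, 0.0])
kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])

candidates = kb.get_alias_candidates("Douglas")
print([c.entity_ for c in candidates])  # e.g. ["Q42", "Q463035"]

kb.to_disk("/tmp/kb")                   # persist ...
kb2 = InMemoryLookupKB(vocab, entity_vector_length=3)
kb2.from_disk("/tmp/kb")                # ... and restore
```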
@ -70,7 +70,7 @@ lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require
|
||||||
[`token.pos`](/api/token) from a previous pipeline component (see example
|
[`token.pos`](/api/token) from a previous pipeline component (see example
|
||||||
pipeline configurations in the
|
pipeline configurations in the
|
||||||
[pretrained pipeline design details](/models#design-cnn)) or rely on third-party
|
[pretrained pipeline design details](/models#design-cnn)) or rely on third-party
|
||||||
libraries (`pymorphy2`).
|
libraries (`pymorphy3`).
|
||||||
|
|
||||||
| Language | Default Mode |
|
| Language | Default Mode |
|
||||||
| -------- | ------------ |
|
| -------- | ------------ |
|
||||||
|
@ -86,9 +86,9 @@ libraries (`pymorphy2`).
|
||||||
| `nb` | `rule` |
|
| `nb` | `rule` |
|
||||||
| `nl` | `rule` |
|
| `nl` | `rule` |
|
||||||
| `pl` | `pos_lookup` |
|
| `pl` | `pos_lookup` |
|
||||||
| `ru` | `pymorphy2` |
|
| `ru` | `pymorphy3` |
|
||||||
| `sv` | `rule` |
|
| `sv` | `rule` |
|
||||||
| `uk` | `pymorphy2` |
|
| `uk` | `pymorphy3` |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
|
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
|
||||||
|
|
|
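A hedged sketch of switching a blank Russian pipeline to the pymorphy3-backed mode referenced above (assumes a spaCy version where this mode is available and `pymorphy3` is installed):

```python
import spacy

nlp = spacy.blank("ru")
# Shown only to make the mode name explicit; trained pipelines ship their own config.
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy3"})
nlp.initialize()
```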
@ -255,9 +255,10 @@ Return a copy of the span group.
|
||||||
> new_group = doc.spans["errors"].copy()
|
> new_group = doc.spans["errors"].copy()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ----------------------------------------------- |
|
| ----------- | -------------------------------------------------------------------------------------------------- |
|
||||||
| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ |
|
| `doc` | The document to which the copy is bound. Defaults to `None` for the current doc. ~~Optional[Doc]~~ |
|
||||||
|
| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ |
|
||||||
|
|
||||||
## SpanGroup.to_bytes {#to_bytes tag="method"}
|
## SpanGroup.to_bytes {#to_bytes tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -78,7 +78,9 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
|
| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
|
||||||
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
|
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
|
||||||
| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
|
| [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. |
|
||||||
|
| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. |
|
||||||
|
| [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. |
|
||||||
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
|
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
|
||||||
| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
|
| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
|
||||||
| [`Morphology`](/api/morphology) | Store morphological analyses and map them to and from hash values. |
|
| [`Morphology`](/api/morphology) | Store morphological analyses and map them to and from hash values. |
|
||||||
|
|
|
@ -243,6 +243,27 @@ pipelines.
|
||||||
> python -m spacy project run test . --vars.foo bar
|
> python -m spacy project run test . --vars.foo bar
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
> #### Tip: Environment Variables
|
||||||
|
>
|
||||||
|
> Commands in a project file are not executed in a shell, so they don't have
|
||||||
|
> direct access to environment variables. But you can insert environment
|
||||||
|
> variables using the `env` dictionary to make values available for
|
||||||
|
> interpolation, just like values in `vars`. Here's an example `env` dict that
|
||||||
|
> makes `$PATH` available as `ENV_PATH`:
|
||||||
|
>
|
||||||
|
> ```yaml
|
||||||
|
> env:
|
||||||
|
> ENV_PATH: PATH
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> This can be used in a project command like so:
|
||||||
|
>
|
||||||
|
> ```yaml
|
||||||
|
> - name: "echo-path"
|
||||||
|
> script:
|
||||||
|
> - "echo ${env.ENV_PATH}"
|
||||||
|
> ```
|
||||||
|
|
||||||
| Section | Description |
|
| Section | Description |
|
||||||
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
||||||
|
|
|
@ -480,7 +480,7 @@ as-is. They are also excluded when calling
|
||||||
> parse. So the evaluation results should always reflect what your pipeline will
|
> parse. So the evaluation results should always reflect what your pipeline will
|
||||||
> produce at runtime. If you want a frozen component to run (without updating)
|
> produce at runtime. If you want a frozen component to run (without updating)
|
||||||
> during training as well, so that downstream components can use its
|
> during training as well, so that downstream components can use its
|
||||||
> **predictions**, you can add it to the list of
|
> **predictions**, you should add it to the list of
|
||||||
> [`annotating_components`](/usage/training#annotating-components).
|
> [`annotating_components`](/usage/training#annotating-components).
|
||||||
|
|
||||||
```ini
|
```ini
|
||||||
|
|
|
@ -374,8 +374,8 @@
|
||||||
"has_examples": true,
|
"has_examples": true,
|
||||||
"dependencies": [
|
"dependencies": [
|
||||||
{
|
{
|
||||||
"name": "pymorphy2",
|
"name": "pymorphy3",
|
||||||
"url": "https://github.com/kmike/pymorphy2"
|
"url": "https://github.com/no-plagiarism/pymorphy3"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"models": [
|
"models": [
|
||||||
|
@ -480,12 +480,12 @@
|
||||||
],
|
],
|
||||||
"dependencies": [
|
"dependencies": [
|
||||||
{
|
{
|
||||||
"name": "pymorphy2",
|
"name": "pymorphy3",
|
||||||
"url": "https://github.com/kmike/pymorphy2"
|
"url": "https://github.com/no-plagiarism/pymorphy3"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "pymorphy2-dicts-uk",
|
"name": "pymorphy3-dicts-uk",
|
||||||
"url": "https://github.com/kmike/pymorphy2-dicts/"
|
"url": "https://github.com/no-plagiarism/pymorphy3-dicts"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
|
@ -1,5 +1,46 @@
|
||||||
{
|
{
|
||||||
"resources": [
|
"resources": [
|
||||||
|
{
|
||||||
|
"id": "spacy-cleaner",
|
||||||
|
"title": "spacy-cleaner",
|
||||||
|
"slogan": "Easily clean text with spaCy!",
|
||||||
|
"description": "**spacy-cleaner** utilises spaCy `Language` models to replace, remove, and \n mutate spaCy tokens. Cleaning actions available are:\n\n* Remove/replace stopwords.\n* Remove/replace punctuation.\n* Remove/replace numbers.\n* Remove/replace emails.\n* Remove/replace URLs.\n* Perform lemmatisation.\n\nSee our [docs](https://ce11an.github.io/spacy-cleaner/) for more information.",
|
||||||
|
"github": "Ce11an/spacy-cleaner",
|
||||||
|
"pip": "spacy-cleaner",
|
||||||
|
"code_example": [
|
||||||
|
"import spacy",
|
||||||
|
"import spacy_cleaner",
|
||||||
|
"from spacy_cleaner.processing import removers, replacers, mutators",
|
||||||
|
"",
|
||||||
|
"model = spacy.load(\"en_core_web_sm\")",
|
||||||
|
"pipeline = spacy_cleaner.Pipeline(",
|
||||||
|
" model,",
|
||||||
|
" removers.remove_stopword_token,",
|
||||||
|
" replacers.replace_punctuation_token,",
|
||||||
|
" mutators.mutate_lemma_token,",
|
||||||
|
")",
|
||||||
|
"",
|
||||||
|
"texts = [\"Hello, my name is Cellan! I love to swim!\"]",
|
||||||
|
"",
|
||||||
|
"pipeline.clean(texts)",
|
||||||
|
"# ['hello _IS_PUNCT_ Cellan _IS_PUNCT_ love swim _IS_PUNCT_']"
|
||||||
|
],
|
||||||
|
"code_language": "python",
|
||||||
|
"url": "https://ce11an.github.io/spacy-cleaner/",
|
||||||
|
"image": "https://raw.githubusercontent.com/Ce11an/spacy-cleaner/main/docs/assets/images/spacemen.png",
|
||||||
|
"author": "Cellan Hall",
|
||||||
|
"author_links": {
|
||||||
|
"twitter": "Ce11an",
|
||||||
|
"github": "Ce11an",
|
||||||
|
"website": "https://www.linkedin.com/in/cellan-hall/"
|
||||||
|
},
|
||||||
|
"category": [
|
||||||
|
"extension"
|
||||||
|
],
|
||||||
|
"tags": [
|
||||||
|
"text-processing"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": "Zshot",
|
"id": "Zshot",
|
||||||
"title": "Zshot",
|
"title": "Zshot",
|
||||||
|
@ -2460,20 +2501,20 @@
|
||||||
"import spacy",
|
"import spacy",
|
||||||
"from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
|
"from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
|
||||||
"",
|
"",
|
||||||
"# Load an spacy model (supported models are \"es\" and \"en\") ",
|
"# Load a spaCy model (supported languages are \"es\" and \"en\") ",
|
||||||
"nlp = spacy.load('en')",
|
"nlp = spacy.load('en_core_web_sm')",
|
||||||
"# Spacy 3.x",
|
"# spaCy 3.x",
|
||||||
"nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})",
|
"nlp.add_pipe(\"spacy_wordnet\", after='tagger')",
|
||||||
"# Spacy 2.x",
|
"# spaCy 2.x",
|
||||||
"# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
|
"# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
|
||||||
"token = nlp('prices')[0]",
|
"token = nlp('prices')[0]",
|
||||||
"",
|
"",
|
||||||
"# wordnet object link spacy token with nltk wordnet interface by giving acces to",
|
"# WordNet object links spaCy token with NLTK WordNet interface by giving access to",
|
||||||
"# synsets and lemmas ",
|
"# synsets and lemmas ",
|
||||||
"token._.wordnet.synsets()",
|
"token._.wordnet.synsets()",
|
||||||
"token._.wordnet.lemmas()",
|
"token._.wordnet.lemmas()",
|
||||||
"",
|
"",
|
||||||
"# And automatically tags with wordnet domains",
|
"# And automatically add info about WordNet domains",
|
||||||
"token._.wordnet.wordnet_domains()"
|
"token._.wordnet.wordnet_domains()"
|
||||||
],
|
],
|
||||||
"author": "recognai",
|
"author": "recognai",
|
||||||
|
|