Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-20 21:40:35 +03:00)

Merge pull request #13490 from svlandeg/feat/update_v4
Update v4 branch with latest from master

Commit 818fdb537e

.github/workflows/explosionbot.yml (2 changed lines)

@@ -15,7 +15,7 @@ jobs:
         env:
           GITHUB_CONTEXT: ${{ toJson(github) }}
         run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
       - name: Install and run explosion-bot
         run: |

.github/workflows/gputests.yml (2 changed lines)

@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        branch: [master, main]
+        branch: [master, v4]
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:

.github/workflows/lock.yml (2 changed lines)

@@ -16,7 +16,7 @@ jobs:
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
-      - uses: dessant/lock-threads@v4
+      - uses: dessant/lock-threads@v5
        with:
          process-only: 'issues'
          issue-inactive-days: '30'

.github/workflows/slowtests.yml (4 changed lines)

@@ -9,12 +9,12 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        branch: [master, main]
+        branch: [master, v4]
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          ref: ${{ matrix.branch }}
       - name: Get commits from past 24 hours

.github/workflows/spacy_universe_alert.yml (2 changed lines)

@@ -18,7 +18,7 @@ jobs:
         run: |
           echo "$GITHUB_CONTEXT"

-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
           python-version: '3.10'

.github/workflows/tests.yml (6 changed lines)

@@ -25,13 +25,12 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
           python-version: "3.9"
-          architecture: x64

       - name: black
         run: |

@@ -71,13 +70,12 @@ jobs:

     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python_version }}
-          architecture: x64

       - name: Install dependencies
         run: |

.github/workflows/universe_validation.yml (3 changed lines)

@@ -20,13 +20,12 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
           python-version: "3.9"
-          architecture: x64

       - name: Validate website/meta/universe.json
         run: |

LICENSE (2 changed lines)

@@ -1,6 +1,6 @@
 The MIT License (MIT)

-Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

Runtime dependency pins — relax the typer ceiling, bump weasel, drop the smart-open pin:

@@ -9,9 +9,8 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.10.0
-smart-open>=5.2.1,<7.0.0
-weasel>=0.1.0,<0.4.0
+typer>=0.3.0,<1.0.0
+weasel>=0.1.0,<0.5.0
 # Third party dependencies
 numpy>=1.15.0; python_version < "3.9"
 numpy>=1.19.0; python_version >= "3.9"

Package metadata — the same pin updates in install_requires:

@@ -41,10 +41,9 @@ install_requires =
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
-    weasel>=0.1.0,<0.4.0
+    weasel>=0.1.0,<0.5.0
     # Third-party dependencies
-    typer>=0.3.0,<0.10.0
-    smart-open>=5.2.1,<7.0.0
+    typer>=0.3.0,<1.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0; python_version < "3.9"
     numpy>=1.19.0; python_version >= "3.9"

CLI package __init__ — expose the download module for the new tests:

@@ -1,5 +1,7 @@
 from wasabi import msg

+# Needed for testing
+from . import download as download_module  # noqa: F401
 from ._util import app, setup_cli  # noqa: F401
 from .apply import apply  # noqa: F401
 from .assemble import assemble_cli  # noqa: F401

CLI download command — reject suspicious model names and build the download URL defensively:

@@ -1,5 +1,6 @@
 import sys
 from typing import Optional, Sequence
+from urllib.parse import urljoin

 import requests
 import typer

@@ -64,6 +65,13 @@ def download(
         )
     pip_args = pip_args + ("--no-deps",)
     if direct:
+        # Reject model names with '/', in order to prevent shenanigans.
+        if "/" in model:
+            msg.fail(
+                title="Model download rejected",
+                text=f"Cannot download model '{model}'. Models are expected to be file names, not URLs or fragments",
+                exits=True,
+            )
         components = model.split("-")
         model_name = "".join(components[:-1])
         version = components[-1]

@@ -156,7 +164,16 @@ def get_latest_version(model: str) -> str:
 def download_model(
     filename: str, user_pip_args: Optional[Sequence[str]] = None
 ) -> None:
-    download_url = about.__download_url__ + "/" + filename
+    # Construct the download URL carefully. We need to make sure we don't
+    # allow relative paths or other shenanigans to trick us into download
+    # from outside our own repo.
+    base_url = about.__download_url__
+    # urljoin requires that the path ends with /, or the last path part will be dropped
+    if not base_url.endswith("/"):
+        base_url = about.__download_url__ + "/"
+    download_url = urljoin(base_url, filename)
+    if not download_url.startswith(about.__download_url__):
+        raise ValueError(f"Download from {filename} rejected. Was it a relative path?")
     pip_args = list(user_pip_args) if user_pip_args is not None else []
     cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
     run_command(cmd)

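Note: a brief aside on why the patched download_model adds a trailing slash before calling urljoin and then re-checks the prefix. urljoin drops the last path segment of a base URL that lacks a trailing slash, and a relative filename can climb out of the expected directory. A minimal standalone sketch; the base URL below is illustrative and hard-coded rather than read from spacy.about:

from urllib.parse import urljoin

base = "https://github.com/explosion/spacy-models/releases/download"

# Without a trailing slash, urljoin replaces the last path segment.
print(urljoin(base, "en_core_web_sm-3.7.1"))
# -> .../releases/en_core_web_sm-3.7.1  (the "download" segment is dropped)

# With a trailing slash, the filename is appended as intended.
print(urljoin(base + "/", "en_core_web_sm-3.7.1"))
# -> .../releases/download/en_core_web_sm-3.7.1

# A relative filename escapes the expected prefix; the startswith() check
# in download_model() turns this case into a ValueError.
candidate = urljoin(base + "/", "../../evil/model.whl")
print(candidate.startswith(base))  # False
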
find_threshold CLI — docstring typo fixes (tresholds → thresholds):

@@ -39,7 +39,7 @@ def find_threshold_cli(
     # fmt: on
 ):
     """
-    Runs prediction trials for a trained model with varying tresholds to maximize
+    Runs prediction trials for a trained model with varying thresholds to maximize
     the specified metric. The search space for the threshold is traversed linearly
     from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
     (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`

@@ -81,7 +81,7 @@ def find_threshold(
     silent: bool = True,
 ) -> Tuple[float, float, Dict[float, float]]:
     """
-    Runs prediction trials for models with varying tresholds to maximize the specified metric.
+    Runs prediction trials for models with varying thresholds to maximize the specified metric.
     model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
     data_path (Path): Path to file with DocBin with docs to use for threshold search.
     pipe_name (str): Name of pipe to examine thresholds for.

Warnings catalogue — new W127 for multiprocessing workers that exit abnormally:

@@ -215,6 +215,7 @@ class Warnings(metaclass=ErrorsWithCodes):
            "key attribute for vectors, configure it through Vectors(attr=) or "
            "'spacy init vectors --attr'")
    W126 = ("These keys are unsupported: {unsupported}")
+   W127 = ("Not all `Language.pipe` worker processes completed successfully")

    # v4 warning strings
    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "

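Note: callers that handle worker failures themselves can silence the new warning like any other spaCy warning. A small sketch, assuming spaCy's usual "[Wxxx]" prefix on emitted warning messages:

import warnings

# Suppress W127 ("worker processes did not complete successfully") when exit
# codes are already checked elsewhere in the calling application.
warnings.filterwarnings("ignore", message=r"\[W127\]")
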
Language.pipe multiprocessing — warn when workers exit abnormally, and return after closing the channels:

@@ -1844,6 +1844,9 @@ class Language:
             for proc in procs:
                 proc.join()

+            if not all(proc.exitcode == 0 for proc in procs):
+                warnings.warn(Warnings.W127)
+
     def _link_components(self) -> None:
         """Register 'listeners' within pipeline components, to allow them to
         effectively share weights.

@@ -2467,6 +2470,7 @@ def _apply_pipes(
         if isinstance(texts_with_ctx, _WorkDoneSentinel):
             sender.close()
             receiver.close()
+            return

         docs = (
             ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx

@@ -2492,6 +2496,7 @@ def _apply_pipes(
             # stop processing.
             sender.close()
             receiver.close()
+            return


 class _Sender:

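Note: the added check relies on multiprocessing.Process.exitcode being 0 only for a clean exit. A minimal standalone illustration of the same pattern; the worker function here is hypothetical, not spaCy code:

import multiprocessing as mp
import warnings

def worker(n):
    # Crash on purpose for one worker so its exitcode is non-zero.
    if n == 2:
        raise RuntimeError("boom")

if __name__ == "__main__":
    procs = [mp.Process(target=worker, args=(i,)) for i in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    # Same condition the patch adds after joining the pipe() workers.
    if not all(p.exitcode == 0 for p in procs):
        warnings.warn("Not all worker processes completed successfully")
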
spacy/lexeme.pyx (406 changed lines)

@@ -164,32 +164,34 @@ cdef class Lexeme:
@@ -203,306 +205,338 @@ cdef class Lexeme:

The legacy Cython `property name:` blocks with nested __get__/__set__ methods are rewritten as `@property` getters with matching `@name.setter` setters; the getter and setter bodies themselves are unchanged. The `vector` attribute illustrates the pattern:

-    property vector:
-        """A real-valued meaning representation.
-
-        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
-            representing the lexeme's semantics.
-        """
-        def __get__(self):
-            cdef int length = self.vocab.vectors_length
-            if length == 0:
-                raise ValueError(Errors.E010)
-            return self.vocab.get_vector(self.c.orth)
-
-        def __set__(self, vector):
-            if len(vector) != self.vocab.vectors_length:
-                raise ValueError(Errors.E073.format(new_length=len(vector),
-                                                    length=self.vocab.vectors_length))
-            self.vocab.set_vector(self.c.orth, vector)
+    @property
+    def vector(self):
+        """A real-valued meaning representation.
+
+        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
+            representing the lexeme's semantics.
+        """
+        cdef int length = self.vocab.vectors_length
+        if length == 0:
+            raise ValueError(Errors.E010)
+        return self.vocab.get_vector(self.c.orth)
+
+    @vector.setter
+    def vector(self, vector):
+        if len(vector) != self.vocab.vectors_length:
+            raise ValueError(Errors.E073.format(new_length=len(vector),
+                                                length=self.vocab.vectors_length))
+        self.vocab.set_vector(self.c.orth, vector)

The same conversion is applied to rank, lower, norm, shape, prefix, suffix, cluster, lang, prob, lower_, norm_, shape_, prefix_, suffix_, lang_, flags, is_stop, is_alpha, is_ascii, is_digit, is_lower, is_upper, is_title, is_punct, is_space, is_bracket, is_quote, is_left_punct, is_right_punct, is_currency, like_url, like_num and like_email. orth_ and is_oov were already plain @property getters and are untouched.

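Note: the refactor is purely syntactic; the public Lexeme attributes read and write exactly as before. A small usage sketch (assumes a spaCy install, no trained pipeline needed):

import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["apple"]          # looking up a string returns a Lexeme

print(lex.is_alpha, lex.is_digit, lex.lower_)   # getters
lex.is_stop = True                              # setter, now via @is_stop.setter
print(lex.is_stop)
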
Parametric-attention textcat model builder — register the key_transform ref and initialize a lazily-sized tok2vec before sizing the sibling layers:

@@ -241,6 +241,7 @@ def _build_parametric_attention_with_residual_nonlinear(

     parametric_attention.set_ref("tok2vec", tok2vec)
     parametric_attention.set_ref("attention_layer", attention_layer)
+    parametric_attention.set_ref("key_transform", key_transform)
     parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
     parametric_attention.set_ref("norm_layer", norm_layer)

@@ -248,10 +249,19 @@ def _build_parametric_attention_with_residual_nonlinear(


 def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
+    # When tok2vec is lazily initialized, we need to initialize it before
+    # the rest of the chain to ensure that we can get its width.
+    tok2vec = model.get_ref("tok2vec")
+    tok2vec.initialize(X)
+
     tok2vec_width = get_tok2vec_width(model)
     model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
+    if model.get_ref("key_transform").has_dim("nI") is None:
+        model.get_ref("key_transform").set_dim("nI", tok2vec_width)
+    if model.get_ref("key_transform").has_dim("nO") is None:
+        model.get_ref("key_transform").set_dim("nO", tok2vec_width)
     model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
+    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
     model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
     model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
     init_chain(model, X, Y)

EntityLinker component — copy gold entities via a new _ensure_ents helper, wrap the scorer so evaluation works without an NER component, and drop the unused empty_kb import:

@@ -24,7 +24,6 @@ from .. import util
 from ..errors import Errors, Warnings
 from ..kb import Candidate, KnowledgeBase
 from ..language import Language
-from ..ml import empty_kb
 from ..scorer import Scorer
 from ..tokens import Doc, Span, SpanGroup
 from ..training import Example, validate_examples, validate_get_examples

@@ -114,7 +113,7 @@ def make_entity_linker(
     documents with textual mentions.
     generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
     scorer (Optional[Callable]): The scoring method.
-    use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
+    use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another
         component must provide entity annotations.
     threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
         prediction is discarded. If None, predictions are not filtered by any threshold.

@@ -227,7 +226,6 @@ class EntityLinker(TrainablePipe):
         self.cfg: Dict[str, Any] = {"overwrite": overwrite}
         self.distance = CosineDistance(normalize=False)
         self.kb = generate_empty_kb(self.vocab, entity_vector_length)
-        self.scorer = scorer
         self.use_gold_ents = use_gold_ents
         self.threshold = threshold
         self.save_activations = save_activations

@@ -235,6 +233,37 @@ class EntityLinker(TrainablePipe):
         if self.incl_prior and not self.kb.supports_prior_probs:
             warnings.warn(Warnings.W401)

+        def _score_with_ents_set(examples: Iterable[Example], **kwargs):
+            # Because of how spaCy works, we can't just score immediately, because Language.evaluate
+            # calls pipe() on the predicted docs, which won't have entities if there is no NER in the pipeline.
+            if not scorer:
+                return scorer
+            if not self.use_gold_ents:
+                return scorer(examples, **kwargs)
+            else:
+                examples = self._ensure_ents(examples)
+                docs = self.pipe(
+                    (eg.predicted for eg in examples),
+                )
+                for eg, doc in zip(examples, docs):
+                    eg.predicted = doc
+                return scorer(examples, **kwargs)
+
+        self.scorer = _score_with_ents_set
+
+    def _ensure_ents(self, examples: Iterable[Example]) -> Iterable[Example]:
+        """If use_gold_ents is true, set the gold entities to (a copy of) eg.predicted."""
+        if not self.use_gold_ents:
+            return examples
+
+        new_examples = []
+        for eg in examples:
+            ents, _ = eg.get_aligned_ents_and_ner()
+            new_eg = eg.copy()
+            new_eg.predicted.ents = ents
+            new_examples.append(new_eg)
+        return new_examples
+
     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
         """Define the KB of this pipe by providing a function that will
         create it using this object's vocab."""

@@ -276,11 +305,9 @@ class EntityLinker(TrainablePipe):
         nO = self.kb.entity_vector_length
         doc_sample = []
         vector_sample = []
-        for eg in islice(get_examples(), 10):
+        examples = self._ensure_ents(islice(get_examples(), 10))
+        for eg in examples:
             doc = eg.x
-            if self.use_gold_ents:
-                ents, _ = eg.get_aligned_ents_and_ner()
-                doc.ents = ents
             doc_sample.append(doc)
             vector_sample.append(self.model.ops.alloc1f(nO))
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)

@@ -347,31 +374,17 @@ class EntityLinker(TrainablePipe):
         losses.setdefault(self.name, 0.0)
         if not examples:
             return losses
+        examples = self._ensure_ents(examples)
         validate_examples(examples, "EntityLinker.update")
-
-        set_dropout_rate(self.model, drop)
-        docs = [eg.predicted for eg in examples]
-        # save to restore later
-        old_ents = [doc.ents for doc in docs]
-
-        for doc, ex in zip(docs, examples):
-            if self.use_gold_ents:
-                ents, _ = ex.get_aligned_ents_and_ner()
-                doc.ents = ents
-            else:
-                # only keep matching ents
-                doc.ents = ex.get_matching_ents()
-
         # make sure we have something to learn from, if not, short-circuit
         if not self.batch_has_learnable_example(examples):
             return losses
+
+        set_dropout_rate(self.model, drop)
+        docs = [eg.predicted for eg in examples]
         sentence_encodings, bp_context = self.model.begin_update(docs)
-
-        # now restore the ents
-        for doc, old in zip(docs, old_ents):
-            doc.ents = old
-
         loss, d_scores = self.get_loss(
             sentence_encodings=sentence_encodings, examples=examples
         )

@@ -379,11 +392,13 @@ class EntityLinker(TrainablePipe):
         if sgd is not None:
             self.finish_update(sgd)
         losses[self.name] += loss
         return losses

     def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
         validate_examples(examples, "EntityLinker.get_loss")
         entity_encodings = []
+        # We assume that get_loss is called with gold ents set in the examples if need be
         eidx = 0  # indices in gold entities to keep
         keep_ents = []  # indices in sentence_encodings to keep

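Note: the behaviour the docstring tweak and the scorer wrapper address is controlled from the pipe config. A hedged setup sketch; KB creation and training data are omitted here:

import spacy

nlp = spacy.blank("en")
# With use_gold_ents=True the gold entities are copied onto the predicted docs
# (via the new _ensure_ents helper), so the linker can be trained and evaluated
# without an NER component earlier in the pipeline.
entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": True})
# entity_linker.set_kb(create_kb)  # a KB loader is still required before initialize()
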
Entity linker tests — the overfitting test now sets use_gold_ents explicitly, gains evaluation assertions, and is complemented by a combined NER + NEL test:

@@ -799,7 +799,7 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on


-def test_overfitting_IO():
+def test_overfitting_IO_gold_entities():
     # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
     nlp = English()
     vector_length = 3

@@ -826,7 +826,9 @@
         return mykb

     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.add_pipe("entity_linker", last=True)
+    entity_linker = nlp.add_pipe(
+        "entity_linker", last=True, config={"use_gold_ents": True}
+    )
     assert isinstance(entity_linker, EntityLinker)
     entity_linker.set_kb(create_kb)
     assert "Q2146908" in entity_linker.vocab.strings

@@ -889,6 +891,107 @@
     assert_equal(batch_deps_1, batch_deps_2)
     assert_equal(batch_deps_1, no_batch_deps)

+    eval = nlp.evaluate(train_examples)
+    assert "nel_macro_p" in eval
+    assert "nel_macro_r" in eval
+    assert "nel_macro_f" in eval
+    assert "nel_micro_p" in eval
+    assert "nel_micro_r" in eval
+    assert "nel_micro_f" in eval
+    assert "nel_f_per_type" in eval
+    assert "PERSON" in eval["nel_f_per_type"]
+
+    assert eval["nel_macro_f"] > 0
+    assert eval["nel_micro_f"] > 0
+
+
+def test_overfitting_IO_with_ner():
+    # Simple test to try and overfit the NER and NEL component in combination - ensuring the ML models work correctly
+    nlp = English()
+    vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
+
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # create artificial KB - assign same prior weight to the two russ cochran's
+        # Q2146908 (Russ Cochran): American golfer
+        # Q7381115 (Russ Cochran): publisher
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="Russ Cochran",
+            entities=["Q2146908", "Q7381115"],
+            probabilities=[0.5, 0.5],
+        )
+        return mykb
+
+    # Create the NER and EL components and add them to the pipeline
+    ner = nlp.add_pipe("ner", first=True)
+    entity_linker = nlp.add_pipe(
+        "entity_linker", last=True, config={"use_gold_ents": False}
+    )
+    entity_linker.set_kb(create_kb)
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for ent in annotations.get("entities"):
+            ner.add_label(ent[2])
+    optimizer = nlp.initialize()
+
+    # train the NER and NEL pipes
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.001
+    assert losses["entity_linker"] < 0.001
+
+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)
+
+    # test the trained model
+    test_text = "Russ Cochran captured his first major title with his son as caddie."
+    doc = nlp(test_text)
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "Russ Cochran"
+    assert ents[0].label_ == "PERSON"
+    assert ents[0].kb_id_ != "NIL"
+
+    # TODO: below assert is still flaky - EL doesn't properly overfit quite yet
+    # assert ents[0].kb_id_ == "Q2146908"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        assert nlp2.pipe_names == nlp.pipe_names
+        doc2 = nlp2(test_text)
+        ents2 = doc2.ents
+        assert len(ents2) == 1
+        assert ents2[0].text == "Russ Cochran"
+        assert ents2[0].label_ == "PERSON"
+        assert ents2[0].kb_id_ != "NIL"
+
+    eval = nlp.evaluate(train_examples)
+    assert "nel_macro_f" in eval
+    assert "nel_micro_f" in eval
+    assert "ents_f" in eval
+    assert "nel_f_per_type" in eval
+    assert "ents_per_type" in eval
+    assert "PERSON" in eval["nel_f_per_type"]
+    assert "PERSON" in eval["ents_per_type"]
+
+    assert eval["nel_macro_f"] > 0
+    assert eval["nel_micro_f"] > 0
+    assert eval["ents_f"] > 0
+
+
 def test_kb_serialization():
     # Test that the KB can be used in a pipeline with a different vocab

Textcat tests — register the lazily-initialized tok2vec and exercise it across architectures:

@@ -29,6 +29,8 @@ from spacy.tokens import Doc, DocBin
 from spacy.training import Example
 from spacy.training.initialize import init_nlp

+# Ensure that the architecture gets added to the registry.
+from ..tok2vec import build_lazy_init_tok2vec as _
 from ..util import make_tempdir

 TRAIN_DATA_SINGLE_LABEL = [

@@ -41,6 +43,13 @@ TRAIN_DATA_MULTI_LABEL = [
     ("I'm confused but happy", {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}),
 ]

+lazy_init_model_config = """
+[model]
+@architectures = "test.LazyInitTok2Vec.v1"
+width = 96
+"""
+LAZY_INIT_TOK2VEC_MODEL = Config().from_str(lazy_init_model_config)["model"]
+

 def make_get_examples_single_label(nlp):
     train_examples = []

@@ -551,6 +560,34 @@ def test_error_with_multi_labels():
     nlp.initialize(get_examples=lambda: train_examples)


+# fmt: off
+@pytest.mark.parametrize(
+    "name,textcat_config",
+    [
+        # ENSEMBLE V2
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
+        ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
+        # PARAMETRIC ATTENTION V1
+        ("textcat", {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # REDUCE
+        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+    ],
+)
+# fmt: on
+def test_tok2vec_lazy_init(name, textcat_config):
+    # Check that we can properly initialize and use a textcat model using
+    # a lazily-initialized tok2vec.
+    nlp = English()
+    pipe_config = {"model": textcat_config}
+    textcat = nlp.add_pipe(name, config=pipe_config)
+    textcat.add_label("POSITIVE")
+    textcat.add_label("NEGATIVE")
+    nlp.initialize()
+    nlp.pipe(["This is a test."])
+
+
 @pytest.mark.parametrize(
     "name,get_examples, train_data",
     [

CLI tests — cover the new relative-path rejection in spacy download:

@@ -12,7 +12,7 @@ from thinc.api import Config

 import spacy
 from spacy import about
-from spacy.cli import info
+from spacy.cli import download_module, info
 from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory
 from spacy.cli.apply import apply
 from spacy.cli.debug_data import (

@@ -1066,3 +1066,15 @@ def test_debug_data_trainable_lemmatizer_not_annotated():
 def test_project_api_imports():
     from spacy.cli import project_run
     from spacy.cli.project.run import project_run  # noqa: F401, F811
+
+
+def test_download_rejects_relative_urls(monkeypatch):
+    """Test that we can't tell spacy download to get an arbitrary model by using a
+    relative path in the filename"""
+
+    monkeypatch.setattr(download_module, "run_command", lambda cmd: None)
+
+    # Check that normal download works
+    download_module.download("en_core_web_sm-3.7.1", direct=True)
+    with pytest.raises(SystemExit):
+        download_module.download("../en_core_web_sm-3.7.1", direct=True)

@@ -1,5 +1,6 @@
 import itertools
 import logging
+import warnings
 from unittest import mock
 
 import pytest
@@ -423,7 +424,7 @@ def test_language_pipe_error_handler(n_process):
     nlp.set_error_handler(raise_error)
     with pytest.raises(ValueError):
         list(nlp.pipe(texts, n_process=n_process))
-    # set explicitely to ignoring
+    # set explicitly to ignoring
     nlp.set_error_handler(ignore_error)
     docs = list(nlp.pipe(texts, n_process=n_process))
     assert len(docs) == 0
@@ -834,9 +835,13 @@ def test_pass_doc_to_pipeline(nlp, n_process):
     assert doc.text == texts[0]
     assert len(doc.cats) > 0
     if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
-        docs = nlp.pipe(docs, n_process=n_process)
-        assert [doc.text for doc in docs] == texts
-        assert all(len(doc.cats) for doc in docs)
+        # Catch warnings to ensure that all worker processes exited
+        # succesfully.
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")
+            docs = nlp.pipe(docs, n_process=n_process)
+            assert [doc.text for doc in docs] == texts
+            assert all(len(doc.cats) for doc in docs)
 
 
 def test_invalid_arg_to_pipeline(nlp):
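The updated test wraps the multiprocessing call in a warnings-as-errors block. A minimal sketch of that pattern in isolation (illustrative, not part of the diff):

```python
import warnings

# With the "error" filter, any warning emitted inside the block is raised as
# an exception, so e.g. a worker process that fails to exit cleanly (and
# warns at shutdown) makes the surrounding test fail instead of passing.
try:
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        warnings.warn("worker did not exit cleanly")
except UserWarning as err:
    print(f"caught as error: {err}")
```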
36  spacy/tests/tok2vec.py  Normal file
@@ -0,0 +1,36 @@
+from typing import List
+
+from thinc.api import Model
+from thinc.types import Floats2d
+
+from spacy.tokens import Doc
+from spacy.util import registry
+
+
+@registry.architectures("test.LazyInitTok2Vec.v1")
+def build_lazy_init_tok2vec(*, width: int) -> Model[List[Doc], List[Floats2d]]:
+    """tok2vec model of which the output size is only known after
+    initialization. This implementation does not output meaningful
+    embeddings, it is strictly for testing."""
+    return Model(
+        "lazy_init_tok2vec",
+        lazy_init_tok2vec_forward,
+        init=lazy_init_tok2vec_init,
+        dims={"nO": None},
+        attrs={"width": width},
+    )
+
+
+def lazy_init_tok2vec_init(model: Model, X=None, Y=None):
+    width = model.attrs["width"]
+    model.set_dim("nO", width)
+
+
+def lazy_init_tok2vec_forward(model: Model, X: List[Doc], is_train: bool):
+    width = model.get_dim("nO")
+    Y = [model.ops.alloc2f(len(doc), width) for doc in X]
+
+    def backprop(dY):
+        return []
+
+    return Y, backprop
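For context, a registered architecture like the one above is referenced from a model config by its registry name. A minimal sketch of how the textcat tests can wire it in (the variable name and the width value are illustrative, not taken from the diff):

```python
# Hypothetical config dict pointing at the test architecture registered above;
# width=32 is an arbitrary example value.
LAZY_INIT_TOK2VEC = {
    "@architectures": "test.LazyInitTok2Vec.v1",
    "width": 32,
}

# A textcat model config embedding the lazily-initialized tok2vec, in the same
# shape as the parametrized configs earlier in the test file.
textcat_config = {
    "@architectures": "spacy.TextCatReduce.v1",
    "tok2vec": LAZY_INIT_TOK2VEC,
    "exclusive_classes": True,
    "use_reduce_first": True,
    "use_reduce_last": True,
    "use_reduce_max": True,
    "use_reduce_mean": True,
}
```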
@@ -71,65 +71,72 @@ cdef class Tokenizer:
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
 
-    property token_match:
-        def __get__(self):
+    @property
+    def token_match(self):
         return self._token_match
 
-        def __set__(self, token_match):
+    @token_match.setter
+    def token_match(self, token_match):
         self._token_match = token_match
         self._reload_special_cases()
 
-    property url_match:
-        def __get__(self):
+    @property
+    def url_match(self):
         return self._url_match
 
-        def __set__(self, url_match):
+    @url_match.setter
+    def url_match(self, url_match):
         self._url_match = url_match
         self._reload_special_cases()
 
-    property prefix_search:
-        def __get__(self):
+    @property
+    def prefix_search(self):
         return self._prefix_search
 
-        def __set__(self, prefix_search):
+    @prefix_search.setter
+    def prefix_search(self, prefix_search):
         self._prefix_search = prefix_search
         self._reload_special_cases()
 
-    property suffix_search:
-        def __get__(self):
+    @property
+    def suffix_search(self):
         return self._suffix_search
 
-        def __set__(self, suffix_search):
+    @suffix_search.setter
+    def suffix_search(self, suffix_search):
         self._suffix_search = suffix_search
         self._reload_special_cases()
 
-    property infix_finditer:
-        def __get__(self):
+    @property
+    def infix_finditer(self):
         return self._infix_finditer
 
-        def __set__(self, infix_finditer):
+    @infix_finditer.setter
+    def infix_finditer(self, infix_finditer):
         self._infix_finditer = infix_finditer
         self._reload_special_cases()
 
-    property rules:
-        def __get__(self):
+    @property
+    def rules(self):
         return self._rules
 
-        def __set__(self, rules):
+    @rules.setter
+    def rules(self, rules):
         self._rules = {}
         self._flush_cache()
         self._flush_specials()
         self._cache = PreshMap()
         self._specials = PreshMap()
         self._load_special_cases(rules)
 
-    property faster_heuristics:
-        def __get__(self):
+    @property
+    def faster_heuristics(self):
         return self._faster_heuristics
 
-        def __set__(self, faster_heuristics):
+    @faster_heuristics.setter
+    def faster_heuristics(self, faster_heuristics):
         self._faster_heuristics = faster_heuristics
         self._reload_special_cases()
 
     def __reduce__(self):
         args = (self.vocab,
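The pattern repeated throughout these Cython hunks replaces the legacy `property name:` block syntax (with nested `__get__`/`__set__`) by standard `@property`/`@name.setter` decorators. A minimal pure-Python sketch of the resulting shape (class and attribute names are illustrative only):

```python
class TokenizerLike:
    """Illustrative only: mirrors the decorator-based shape that the diff
    converts the Cython `property x:` blocks into."""

    def __init__(self):
        self._token_match = None

    @property
    def token_match(self):
        # Getter body is unchanged by the conversion.
        return self._token_match

    @token_match.setter
    def token_match(self, token_match):
        self._token_match = token_match
        # The real Tokenizer also reloads its special cases here.
```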
@@ -667,7 +667,8 @@ cdef class Doc:
         else:
             return False
 
-    property vector:
+    @property
+    def vector(self):
         """A real-valued meaning representation. Defaults to an average of the
         token vectors.
 
@@ -676,45 +677,46 @@ cdef class Doc:
 
         DOCS: https://spacy.io/api/doc#vector
         """
-        def __get__(self):
         if "vector" in self.user_hooks:
             return self.user_hooks["vector"](self)
         if self._vector is not None:
             return self._vector
         xp = get_array_module(self.vocab.vectors.data)
         if not len(self):
             self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
             return self._vector
         elif self.vocab.vectors.size > 0:
             self._vector = sum(t.vector for t in self) / len(self)
             return self._vector
         else:
             return xp.zeros((self.vocab.vectors_length,), dtype="float32")
 
-        def __set__(self, value):
+    @vector.setter
+    def vector(self, value):
         self._vector = value
 
-    property vector_norm:
+    @property
+    def vector_norm(self):
         """The L2 norm of the document's vector representation.
 
         RETURNS (float): The L2 norm of the vector representation.
 
         DOCS: https://spacy.io/api/doc#vector_norm
         """
-        def __get__(self):
         if "vector_norm" in self.user_hooks:
             return self.user_hooks["vector_norm"](self)
         cdef float value
         cdef double norm = 0
         if self._vector_norm is None:
             norm = 0.0
             for value in self.vector:
                 norm += value * value
             self._vector_norm = sqrt(norm) if norm != 0 else 0
         return self._vector_norm
 
-        def __set__(self, value):
+    @vector_norm.setter
+    def vector_norm(self, value):
         self._vector_norm = value
 
     @property
     def text(self):
@@ -733,7 +735,8 @@ cdef class Doc:
         """
         return self.text
 
-    property ents:
+    @property
+    def ents(self):
         """The named entities in the document. Returns a list of named entity
         `Span` objects, if the entity recognizer has been applied.
 
@@ -741,55 +744,55 @@ cdef class Doc:
 
         DOCS: https://spacy.io/api/doc#ents
         """
-        def __get__(self):
         cdef int i
         cdef const TokenC* token
         cdef int start = -1
         cdef attr_t label = 0
         cdef attr_t kb_id = 0
         cdef attr_t ent_id = 0
         output = []
         for i in range(self.length):
             token = &self.c[i]
             if token.ent_iob == 1:
                 if start == -1:
                     seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]]
                     raise ValueError(Errors.E093.format(seq=" ".join(seq)))
             elif token.ent_iob == 2 or token.ent_iob == 0 or \
                     (token.ent_iob == 3 and token.ent_type == 0):
                 if start != -1:
                     output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id))
                 start = -1
                 label = 0
                 kb_id = 0
                 ent_id = 0
             elif token.ent_iob == 3:
                 if start != -1:
                     output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id))
                 start = i
                 label = token.ent_type
                 kb_id = token.ent_kb_id
                 ent_id = token.ent_id
         if start != -1:
             output.append(Span(self, start, self.length, label=label, kb_id=kb_id, span_id=ent_id))
         # remove empty-label spans
         output = [o for o in output if o.label_ != ""]
         return tuple(output)
 
-    def __set__(self, ents):
+    @ents.setter
+    def ents(self, ents):
         # TODO:
         # 1. Test basic data-driven ORTH gazetteer
         # 2. Test more nuanced date and currency regex
         cdef attr_t kb_id, ent_id
         cdef int ent_start, ent_end
         ent_spans = []
         for ent_info in ents:
             entity_type_, kb_id, ent_start, ent_end, ent_id = get_entity_info(ent_info)
             if isinstance(entity_type_, str):
                 self.vocab.strings.add(entity_type_)
             span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id, span_id=ent_id)
             ent_spans.append(span)
         self.set_ents(ent_spans, default=SetEntsDefault.outside)
 
     def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside):
         """Set entity annotation.
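From user code, the `ents` setter above is exercised by plain attribute assignment and delegates to `set_ents`. A minimal sketch (text and label chosen for illustration):

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("San Francisco is foggy")

# Assigning Span objects goes through the Doc.ents setter shown above,
# which calls Doc.set_ents(..., default=SetEntsDefault.outside) internally.
doc.ents = [Span(doc, 0, 2, label="GPE")]
print([(ent.text, ent.label_) for ent in doc.ents])
```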
@@ -786,110 +786,130 @@ cdef class Span:
         for word in self.rights:
             yield from word.subtree
 
-    property start:
-        def __get__(self):
+    @property
+    def start(self):
         return self.span_c().start
 
-        def __set__(self, int start):
+    @start.setter
+    def start(self, int start):
         if start < 0 or start > self.doc.length:
             raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
-        cdef SpanC* span_c = self.span_c()
+        cdef SpanC * span_c = self.span_c()
         if start > span_c.end:
-            raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
+            raise ValueError(
+                Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
         span_c.start = start
         span_c.start_char = self.doc.c[start].idx
 
-    property end:
-        def __get__(self):
+    @property
+    def end(self):
         return self.span_c().end
 
-        def __set__(self, int end):
+    @end.setter
+    def end(self, int end):
         if end < 0 or end > self.doc.length:
             raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
-        cdef SpanC* span_c = self.span_c()
+        cdef SpanC * span_c = self.span_c()
         if span_c.start > end:
-            raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
+            raise ValueError(
+                Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
         span_c.end = end
         if end > 0:
-            span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length
+            span_c.end_char = self.doc.c[end - 1].idx + self.doc.c[end - 1].lex.length
         else:
             span_c.end_char = 0
 
-    property start_char:
-        def __get__(self):
+    @property
+    def start_char(self):
         return self.span_c().start_char
 
-        def __set__(self, int start_char):
+    @start_char.setter
+    def start_char(self, int start_char):
         if start_char < 0 or start_char > len(self.doc.text):
-            raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
+            raise IndexError(
+                Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
         cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
         if start < 0:
             raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
-        cdef SpanC* span_c = self.span_c()
+        cdef SpanC * span_c = self.span_c()
         if start_char > span_c.end_char:
-            raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char))
+            raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char",
+                                                 existing_value=span_c.end_char))
         span_c.start_char = start_char
         span_c.start = start
 
-    property end_char:
-        def __get__(self):
+    @property
+    def end_char(self):
         return self.span_c().end_char
 
-        def __set__(self, int end_char):
+    @end_char.setter
+    def end_char(self, int end_char):
         if end_char < 0 or end_char > len(self.doc.text):
-            raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
+            raise IndexError(
+                Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
         cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
         if end < 0:
             raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
-        cdef SpanC* span_c = self.span_c()
+        cdef SpanC * span_c = self.span_c()
         if span_c.start_char > end_char:
-            raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char))
+            raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char",
+                                                 existing_value=span_c.start_char))
         span_c.end_char = end_char
         span_c.end = end
 
-    property label:
-        def __get__(self):
+    @property
+    def label(self):
         return self.span_c().label
 
-        def __set__(self, attr_t label):
-            if label != self.span_c().label :
+    @label.setter
+    def label(self, attr_t label):
+        if label != self.span_c().label:
             old_label = self.span_c().label
             self.span_c().label = label
-            new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
-            old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id)
+            new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char,
+                             end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+            old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char,
+                             end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id)
             Underscore._replace_keys(old, new)
 
-    property kb_id:
-        def __get__(self):
+    @property
+    def kb_id(self):
         return self.span_c().kb_id
 
-        def __set__(self, attr_t kb_id):
-            if kb_id != self.span_c().kb_id :
+    @kb_id.setter
+    def kb_id(self, attr_t kb_id):
+        if kb_id != self.span_c().kb_id:
             old_kb_id = self.span_c().kb_id
             self.span_c().kb_id = kb_id
-            new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
-            old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id)
+            new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char,
+                             end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+            old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char,
+                             end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id)
             Underscore._replace_keys(old, new)
 
-    property id:
-        def __get__(self):
+    @property
+    def id(self):
         return self.span_c().id
 
-        def __set__(self, attr_t id):
-            if id != self.span_c().id :
+    @id.setter
+    def id(self, attr_t id):
+        if id != self.span_c().id:
             old_id = self.span_c().id
             self.span_c().id = id
-            new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
-            old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id)
+            new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char,
+                             end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+            old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char,
+                             end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id)
             Underscore._replace_keys(old, new)
 
-    property ent_id:
+    @property
+    def ent_id(self):
         """Alias for the span's ID."""
-        def __get__(self):
         return self.id
 
-        def __set__(self, attr_t ent_id):
+    @ent_id.setter
+    def ent_id(self, attr_t ent_id):
         self.id = ent_id
 
     @property
     def orth_(self):
@@ -904,29 +924,32 @@ cdef class Span:
         """RETURNS (str): The span's lemma."""
         return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
 
-    property label_:
+    @property
+    def label_(self):
         """The span's label."""
-        def __get__(self):
         return self.doc.vocab.strings[self.label]
 
-        def __set__(self, str label_):
+    @label_.setter
+    def label_(self, str label_):
         self.label = self.doc.vocab.strings.add(label_)
 
-    property kb_id_:
+    @property
+    def kb_id_(self):
         """The span's KB ID."""
-        def __get__(self):
         return self.doc.vocab.strings[self.kb_id]
 
-        def __set__(self, str kb_id_):
+    @kb_id_.setter
+    def kb_id_(self, str kb_id_):
         self.kb_id = self.doc.vocab.strings.add(kb_id_)
 
-    property id_:
+    @property
+    def id_(self):
         """The span's ID."""
-        def __get__(self):
         return self.doc.vocab.strings[self.id]
 
-        def __set__(self, str id_):
+    @id_.setter
+    def id_(self, str id_):
         self.id = self.doc.vocab.strings.add(id_)
 
     property ent_id_:
         """Alias for the span's ID."""
@@ -250,15 +250,16 @@ cdef class Token:
         """
         return not self.c.morph == 0
 
-    property morph:
-        def __get__(self):
+    @property
+    def morph(self):
         return MorphAnalysis.from_id(self.vocab, self.c.morph)
 
-        def __set__(self, MorphAnalysis morph):
+    @morph.setter
+    def morph(self, MorphAnalysis morph):
         # Check that the morph has the same vocab
         if self.vocab != morph.vocab:
             raise ValueError(Errors.E1013)
         self.c.morph = deref(morph.c).key
 
     def set_morph(self, features):
         cdef hash_t key
@@ -370,39 +371,43 @@ cdef class Token:
         """
         return self.c.lex.suffix
 
-    property lemma:
+    @property
+    def lemma(self):
         """RETURNS (uint64): ID of the base form of the word, with no
         inflectional suffixes.
         """
-        def __get__(self):
         return self.c.lemma
 
-        def __set__(self, attr_t lemma):
+    @lemma.setter
+    def lemma(self, attr_t lemma):
         self.c.lemma = lemma
 
-    property pos:
+    @property
+    def pos(self):
         """RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
-        def __get__(self):
         return self.c.pos
 
-        def __set__(self, pos):
+    @pos.setter
+    def pos(self, pos):
         self.c.pos = pos
 
-    property tag:
+    @property
+    def tag(self):
         """RETURNS (uint64): ID of fine-grained part-of-speech tag."""
-        def __get__(self):
         return self.c.tag
 
-        def __set__(self, attr_t tag):
+    @tag.setter
+    def tag(self, attr_t tag):
         self.c.tag = tag
 
-    property dep:
+    @property
+    def dep(self):
         """RETURNS (uint64): ID of syntactic dependency label."""
-        def __get__(self):
         return self.c.dep
 
-        def __set__(self, attr_t label):
+    @dep.setter
+    def dep(self, attr_t label):
         self.c.dep = label
 
     @property
     def has_vector(self):
@@ -483,48 +488,51 @@ cdef class Token:
             return self.doc.user_token_hooks["sent"](self)
         return self.doc[self.i : self.i+1].sent
 
-    property sent_start:
-        def __get__(self):
+    @property
+    def sent_start(self):
         """Deprecated: use Token.is_sent_start instead."""
         # Raising a deprecation warning here causes errors for autocomplete
         # Handle broken backwards compatibility case: doc[0].sent_start
         # was False.
         if self.i == 0:
             return False
         else:
             return self.c.sent_start
 
-        def __set__(self, value):
+    @sent_start.setter
+    def sent_start(self, value):
         self.is_sent_start = value
 
-    property is_sent_start:
+    @property
+    def is_sent_start(self):
         """A boolean value indicating whether the token starts a sentence.
         `None` if unknown. Defaults to `True` for the first token in the `Doc`.
 
         RETURNS (bool / None): Whether the token starts a sentence.
             None if unknown.
         """
-        def __get__(self):
         if self.c.sent_start == 0:
             return None
         elif self.c.sent_start < 0:
             return False
         else:
             return True
 
-        def __set__(self, value):
+    @is_sent_start.setter
+    def is_sent_start(self, value):
         if self.doc.has_annotation("DEP"):
             raise ValueError(Errors.E043)
         if value is None:
             self.c.sent_start = 0
         elif value is True:
             self.c.sent_start = 1
         elif value is False:
             self.c.sent_start = -1
         else:
             raise ValueError(Errors.E044.format(value=value))
 
-    property is_sent_end:
+    @property
+    def is_sent_end(self):
         """A boolean value indicating whether the token ends a sentence.
         `None` if unknown. Defaults to `True` for the last token in the `Doc`.
 
@@ -533,18 +541,18 @@ cdef class Token:
 
         DOCS: https://spacy.io/api/token#is_sent_end
         """
-        def __get__(self):
         if self.i + 1 == len(self.doc):
             return True
         elif self.doc[self.i+1].is_sent_start is None:
             return None
         elif self.doc[self.i+1].is_sent_start is True:
             return True
         else:
             return False
 
-        def __set__(self, value):
+    @is_sent_end.setter
+    def is_sent_end(self, value):
         raise ValueError(Errors.E196)
 
     @property
     def lefts(self):
@@ -671,41 +679,42 @@ cdef class Token:
         """
         return not Token.missing_head(self.c)
 
-    property head:
+    @property
+    def head(self):
         """The syntactic parent, or "governor", of this token.
         If token.has_head() is `False`, this method will return itself.
 
         RETURNS (Token): The token predicted by the parser to be the head of
             the current token.
         """
-        def __get__(self):
         if not self.has_head():
             return self
         else:
             return self.doc[self.i + self.c.head]
 
-        def __set__(self, Token new_head):
+    @head.setter
+    def head(self, Token new_head):
         # This function sets the head of self to new_head and updates the
         # counters for left/right dependents and left/right corner for the
         # new and the old head
         # Check that token is from the same document
         if self.doc != new_head.doc:
             raise ValueError(Errors.E191)
         # Do nothing if old head is new head
         if self.i + self.c.head == new_head.i:
             return
         # Find the widest l/r_edges of the roots of the two tokens involved
         # to limit the number of tokens for set_children_from_heads
         cdef Token self_root, new_head_root
         self_root = ([self] + list(self.ancestors))[-1]
         new_head_ancestors = list(new_head.ancestors)
         new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head
         start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge
         end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge
         # Set new head
         self.c.head = new_head.i - self.i
         # Adjust parse properties and sentence starts
         set_children_from_heads(self.doc.c, start, end + 1)
 
     @property
     def conjuncts(self):
@@ -733,21 +742,23 @@ cdef class Token:
             queue.append(child)
         return tuple([w for w in output if w.i != self.i])
 
-    property ent_type:
+    @property
+    def ent_type(self):
         """RETURNS (uint64): Named entity type."""
-        def __get__(self):
         return self.c.ent_type
 
-        def __set__(self, ent_type):
+    @ent_type.setter
+    def ent_type(self, ent_type):
         self.c.ent_type = ent_type
 
-    property ent_type_:
+    @property
+    def ent_type_(self):
         """RETURNS (str): Named entity type."""
-        def __get__(self):
         return self.vocab.strings[self.c.ent_type]
 
-        def __set__(self, ent_type):
+    @ent_type_.setter
+    def ent_type_(self, ent_type):
         self.c.ent_type = self.vocab.strings.add(ent_type)
 
     @property
     def ent_iob(self):
@@ -773,41 +784,45 @@ cdef class Token:
         """
         return self.iob_strings()[self.c.ent_iob]
 
-    property ent_id:
+    @property
+    def ent_id(self):
         """RETURNS (uint64): ID of the entity the token is an instance of,
         if any.
         """
-        def __get__(self):
         return self.c.ent_id
 
-        def __set__(self, hash_t key):
+    @ent_id.setter
+    def ent_id(self, hash_t key):
         self.c.ent_id = key
 
-    property ent_id_:
+    @property
+    def ent_id_(self):
         """RETURNS (str): ID of the entity the token is an instance of,
         if any.
         """
-        def __get__(self):
         return self.vocab.strings[self.c.ent_id]
 
-        def __set__(self, name):
+    @ent_id_.setter
+    def ent_id_(self, name):
         self.c.ent_id = self.vocab.strings.add(name)
 
-    property ent_kb_id:
+    @property
+    def ent_kb_id(self):
         """RETURNS (uint64): Named entity KB ID."""
-        def __get__(self):
         return self.c.ent_kb_id
 
-        def __set__(self, attr_t ent_kb_id):
+    @ent_kb_id.setter
+    def ent_kb_id(self, attr_t ent_kb_id):
         self.c.ent_kb_id = ent_kb_id
 
-    property ent_kb_id_:
+    @property
+    def ent_kb_id_(self):
         """RETURNS (str): Named entity KB ID."""
-        def __get__(self):
         return self.vocab.strings[self.c.ent_kb_id]
 
-        def __set__(self, ent_kb_id):
+    @ent_kb_id_.setter
+    def ent_kb_id_(self, ent_kb_id):
         self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
 
     @property
     def whitespace_(self):
@@ -829,16 +844,17 @@ cdef class Token:
         """
         return self.vocab.strings[self.c.lex.lower]
 
-    property norm_:
+    @property
+    def norm_(self):
         """RETURNS (str): The token's norm, i.e. a normalised form of the
         token text. Usually set in the language's tokenizer exceptions or
         norm exceptions.
         """
-        def __get__(self):
         return self.vocab.strings[self.norm]
 
-        def __set__(self, str norm_):
+    @norm_.setter
+    def norm_(self, str norm_):
         self.c.norm = self.vocab.strings.add(norm_)
 
     @property
     def shape_(self):
|
@ -868,33 +884,36 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.lex.lang]
|
return self.vocab.strings[self.c.lex.lang]
|
||||||
|
|
||||||
property lemma_:
|
@property
|
||||||
|
def lemma_(self):
|
||||||
"""RETURNS (str): The token lemma, i.e. the base form of the word,
|
"""RETURNS (str): The token lemma, i.e. the base form of the word,
|
||||||
with no inflectional suffixes.
|
with no inflectional suffixes.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return self.vocab.strings[self.c.lemma]
|
||||||
return self.vocab.strings[self.c.lemma]
|
|
||||||
|
|
||||||
def __set__(self, str lemma_):
|
@lemma_.setter
|
||||||
self.c.lemma = self.vocab.strings.add(lemma_)
|
def lemma_(self, str lemma_):
|
||||||
|
self.c.lemma = self.vocab.strings.add(lemma_)
|
||||||
|
|
||||||
property pos_:
|
@property
|
||||||
|
def pos_(self):
|
||||||
"""RETURNS (str): Coarse-grained part-of-speech tag."""
|
"""RETURNS (str): Coarse-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
return parts_of_speech.NAMES[self.c.pos]
|
||||||
return parts_of_speech.NAMES[self.c.pos]
|
|
||||||
|
|
||||||
def __set__(self, pos_name):
|
@pos_.setter
|
||||||
if pos_name not in parts_of_speech.IDS:
|
def pos_(self, pos_name):
|
||||||
raise ValueError(Errors.E1021.format(pp=pos_name))
|
if pos_name not in parts_of_speech.IDS:
|
||||||
self.c.pos = parts_of_speech.IDS[pos_name]
|
raise ValueError(Errors.E1021.format(pp=pos_name))
|
||||||
|
self.c.pos = parts_of_speech.IDS[pos_name]
|
||||||
|
|
||||||
property tag_:
|
@property
|
||||||
|
def tag_(self):
|
||||||
"""RETURNS (str): Fine-grained part-of-speech tag."""
|
"""RETURNS (str): Fine-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
return self.vocab.strings[self.c.tag]
|
||||||
return self.vocab.strings[self.c.tag]
|
|
||||||
|
|
||||||
def __set__(self, tag):
|
@tag_.setter
|
||||||
self.tag = self.vocab.strings.add(tag)
|
def tag_(self, tag):
|
||||||
|
self.tag = self.vocab.strings.add(tag)
|
||||||
|
|
||||||
def has_dep(self):
|
def has_dep(self):
|
||||||
"""Check whether the token has annotated dep information.
|
"""Check whether the token has annotated dep information.
|
||||||
|
@@ -904,13 +923,14 @@ cdef class Token:
         """
         return not Token.missing_dep(self.c)
 
-    property dep_:
+    @property
+    def dep_(self):
         """RETURNS (str): The syntactic dependency label."""
-        def __get__(self):
         return self.vocab.strings[self.c.dep]
 
-        def __set__(self, str label):
+    @dep_.setter
+    def dep_(self, str label):
         self.c.dep = self.vocab.strings.add(label)
 
     @property
     def is_oov(self):
@@ -101,23 +101,25 @@ cdef class Example:
     def __len__(self):
         return len(self.predicted)
 
-    property predicted:
-        def __get__(self):
+    @property
+    def predicted(self):
         return self.x
 
-        def __set__(self, doc):
+    @predicted.setter
+    def predicted(self, doc):
         self.x = doc
         self._cached_alignment = None
         self._cached_words_x = [t.text for t in doc]
 
-    property reference:
-        def __get__(self):
+    @property
+    def reference(self):
         return self.y
 
-        def __set__(self, doc):
+    @reference.setter
+    def reference(self, doc):
         self.y = doc
         self._cached_alignment = None
         self._cached_words_y = [t.text for t in doc]
 
     def copy(self):
         return Example(
@@ -433,9 +435,9 @@ cdef class Example:
             seen_indices.update(indices)
         return output
 
-    property text:
-        def __get__(self):
+    @property
+    def text(self):
         return self.x.text
 
     def __str__(self):
         return str(self.to_dict())
@@ -87,16 +87,17 @@ cdef class Vocab:
         self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
 
-    property vectors:
-        def __get__(self):
+    @property
+    def vectors(self):
         return self._vectors
 
-        def __set__(self, vectors):
+    @vectors.setter
+    def vectors(self, vectors):
         if hasattr(vectors, "strings"):
             for s in vectors.strings:
                 self.strings.add(s)
         self._vectors = vectors
         self._vectors.strings = self.strings
 
     @property
     def lang(self):
@@ -450,17 +451,18 @@ cdef class Vocab:
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         return key in self.vectors
 
-    property lookups:
-        def __get__(self):
+    @property
+    def lookups(self):
         return self._lookups
 
-        def __set__(self, lookups):
+    @lookups.setter
+    def lookups(self, lookups):
         self._lookups = lookups
         if lookups.has_table("lexeme_norm"):
             self.lex_attr_getters[NORM] = util.add_lookups(
                 self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
                 self.lookups.get_table("lexeme_norm"),
             )
 
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.
@@ -45,33 +45,33 @@ For attributes that represent string values, the internal integer ID is accessed
 as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by
 appending `_` as in `token.dep_`.
 
 | Attribute    | Description |
 | ------------ | ----------- |
 | `DEP`        | The token's dependency label. ~~str~~ |
 | `ENT_ID`     | The token's entity ID (`ent_id`). ~~str~~ |
-| `ENT_IOB`    | The IOB part of the token's entity tag. Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
+| `ENT_IOB`    | The IOB part of the token's entity tag. Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
 | `ENT_KB_ID`  | The token's entity knowledge base ID. ~~str~~ |
 | `ENT_TYPE`   | The token's entity label. ~~str~~ |
 | `IS_ALPHA`   | Token text consists of alphabetic characters. ~~bool~~ |
 | `IS_ASCII`   | Token text consists of ASCII characters. ~~bool~~ |
 | `IS_DIGIT`   | Token text consists of digits. ~~bool~~ |
 | `IS_LOWER`   | Token text is in lowercase. ~~bool~~ |
 | `IS_PUNCT`   | Token is punctuation. ~~bool~~ |
 | `IS_SPACE`   | Token is whitespace. ~~bool~~ |
 | `IS_STOP`    | Token is a stop word. ~~bool~~ |
 | `IS_TITLE`   | Token text is in titlecase. ~~bool~~ |
 | `IS_UPPER`   | Token text is in uppercase. ~~bool~~ |
 | `LEMMA`      | The token's lemma. ~~str~~ |
 | `LENGTH`     | The length of the token text. ~~int~~ |
 | `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ |
 | `LIKE_NUM`   | Token text resembles a number. ~~bool~~ |
 | `LIKE_URL`   | Token text resembles a URL. ~~bool~~ |
 | `LOWER`      | The lowercase form of the token text. ~~str~~ |
 | `MORPH`      | The token's morphological analysis. ~~MorphAnalysis~~ |
 | `NORM`       | The normalized form of the token text. ~~str~~ |
 | `ORTH`       | The exact verbatim text of a token. ~~str~~ |
 | `POS`        | The token's universal part of speech (UPOS). ~~str~~ |
 | `SENT_START` | Token is start of sentence. ~~bool~~ |
 | `SHAPE`      | The token's shape. ~~str~~ |
 | `SPACY`      | Token has a trailing space. ~~bool~~ |
 | `TAG`        | The token's fine-grained part of speech. ~~str~~ |
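For orientation, the attributes in the table above are the keys used in token patterns for the rule-based `Matcher`. A minimal sketch (the pattern and text are illustrative, not taken from the docs page):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# IS_DIGIT and LOWER come from the attribute table above.
matcher.add("MONEY", [[{"IS_DIGIT": True}, {"LOWER": "dollars"}]])

doc = nlp("It cost 50 dollars.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)  # "50 dollars"
```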
@@ -566,7 +566,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
 (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
 ✔ Good amount of examples for all labels
-✔ Examples without occurences available for all labels
+✔ Examples without occurrences available for all labels
 ✔ No entities consisting of or starting/ending with whitespace
 
 =========================== Part-of-speech Tagging ===========================
@@ -1322,7 +1322,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
 
 ## find-threshold {id="find-threshold",version="3.5",tag="command"}
 
-Runs prediction trials for a trained model with varying tresholds to maximize
+Runs prediction trials for a trained model with varying thresholds to maximize
 the specified metric. The search space for the threshold is traversed linearly
 from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
 (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
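For orientation, an invocation of this command typically looks something like the following, in the same style as the other CLI examples on the docs page; the model path, data path, component name, and score keys are placeholders, not values from the diff:

```bash
# Illustrative only: sweep the textcat_multilabel threshold against a dev set.
$ python -m spacy find-threshold ./my_model ./dev.spacy textcat_multilabel threshold cats_macro_f --n_trials 20
```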
@@ -61,13 +61,13 @@ architectures and their arguments and hyperparameters.
| `incl_context` | Whether the local context is included in the model. Defaults to `True`. ~~bool~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [`EntityLinker`](/api/architectures#EntityLinker). ~~Model~~ |
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
-| `use_gold_ents` | Whether entities are copied from the gold docs. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
+| `use_gold_ents` | Whether entities are copied from the gold docs. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
| `get_candidates` <Tag variant="new">4.0</Tag> | Function that retrieves plausible candidates per entity mention in a given `Iterator[SpanGroup]` (one `SpanGroup` includes all mentions found in a given `Doc` instance). Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator). ~~Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]~~ |
| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
-| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
+| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |

<Infobox variant="warning">

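For example (a minimal sketch, assuming spaCy is installed; the values shown are illustrative rather than recommendations), individual settings can be overridden when the component is added:

```python
import spacy

nlp = spacy.blank("en")
# Override a few settings; anything not specified keeps its default.
entity_linker = nlp.add_pipe(
    "entity_linker",
    config={"incl_context": True, "entity_vector_length": 64, "overwrite": True},
)
```
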
@@ -114,21 +114,21 @@ custom knowledge base, you should either call
[`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the
[`initialize`](/api/entitylinker#initialize) call.

| Name | Description |
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ |
| `get_candidates` | Function that retrieves plausible candidates per entity mention in a given `SpanGroup`. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator). ~~Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]~~ |
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
-| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
+| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |

## EntityLinker.\_\_call\_\_ {id="call",tag="method"}

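A minimal sketch of supplying a custom knowledge base through `set_kb` (the entity ID, alias and vector below are made up for illustration):

```python
import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe("entity_linker")

def create_kb(vocab):
    # Build a tiny in-memory KB; vector sizes must match entity_vector_length.
    kb = InMemoryLookupKB(vocab, entity_vector_length=64)
    kb.add_entity(entity="Q95", freq=100, entity_vector=[0.0] * 64)
    kb.add_alias(alias="Google", entities=["Q95"], probabilities=[1.0])
    return kb

entity_linker.set_kb(create_kb)
```
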
@@ -69,7 +69,7 @@ how the component should be configured. You can override its settings via the
| Setting | Description |
| ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |

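As a minimal sketch (assuming a blank English pipeline), the ruler can be added with some of these settings overridden and given a pattern:

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe(
    "entity_ruler",
    config={"phrase_matcher_attr": "LOWER", "overwrite_ents": False},
)
ruler.add_patterns([{"label": "ORG", "pattern": "explosion ai"}])
doc = nlp("Explosion AI builds spaCy.")
print([(ent.text, ent.label_) for ent in doc.ents])
```
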
@@ -147,9 +147,10 @@ Whether a feature/value pair is in the analysis.
> assert "Feat1=Val1" in morph
> ```

| Name | Description |
-| ----------- | --------------------------------------------- |
-| **RETURNS** | A feature/value pair in the analysis. ~~str~~ |
+| ------------ | --------------------------------------------------------------------- |
+| `feature` | A feature/value pair. ~~str~~ |
+| **RETURNS** | Whether the feature/value pair is contained in the analysis. ~~bool~~ |

### MorphAnalysis.\_\_iter\_\_ {id="morphanalysis-iter",tag="method"}

@@ -287,7 +287,7 @@ does not permit other NPs to be nested within it – so no NP-level coordination
no prepositional phrases, and no relative clauses.

If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
-has not been implemeted for the given language, a `NotImplementedError` is
+has not been implemented for the given language, a `NotImplementedError` is
raised.

> #### Example

@@ -416,7 +416,7 @@ by this class. Instances of this class are typically assigned to the
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
| `width` | The width of the last hidden layer. ~~int~~ |

-### TransformerData.empty {id="transformerdata-emoty",tag="classmethod"}
+### TransformerData.empty {id="transformerdata-empty",tag="classmethod"}

Create an empty `TransformerData` container.

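A small sketch of inspecting the alignment (this assumes `spacy-transformers` conventions where the component sets `doc._.trf_data`, and a trained transformer pipeline such as `en_core_web_trf` being installed):

```python
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("Alignment maps tokens to wordpieces.")
trf_data = doc._.trf_data
# Number of wordpieces the first token aligns against, and their indices.
print(trf_data.align.lengths[0], trf_data.align[0].dataXd)
print(trf_data.width)
```
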
@@ -832,7 +832,7 @@ retrieve and add to them.

After creation, the component needs to be
[initialized](/usage/training#initialization). This method can define the
-relevant labels in two ways: explicitely by setting the `labels` argument in the
+relevant labels in two ways: explicitly by setting the `labels` argument in the
[`initialize` block](/api/data-formats#config-initialize) of the config, or
implicitly by deducing them from the `get_examples` callback that generates the
full **training data set**, or a representative sample.

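As a sketch of the explicit option (assuming a text classifier component; the label names are illustrative):

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
# Explicitly define the labels instead of deducing them from training examples.
textcat.initialize(lambda: [], nlp=nlp, labels=["POSITIVE", "NEGATIVE"])
print(textcat.labels)
```
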
@@ -1899,7 +1899,7 @@ the two words.
"Shore": ("coast", 0.732257),
"Precautionary": ("caution", 0.490973),
"hopelessness": ("sadness", 0.742366),
-"Continous": ("continuous", 0.732549),
+"Continuous": ("continuous", 0.732549),
"Disemboweled": ("corpse", 0.499432),
"biostatistician": ("scientist", 0.339724),
"somewheres": ("somewheres", 0.402736),

@@ -530,13 +530,17 @@ application's `requirements.txt`. If you're running your own internal PyPi
installation, you can upload the pipeline packages there. pip's
[requirements file format](https://pip.pypa.io/en/latest/reference/requirements-file-format/)
supports both package names to download via a PyPi server, as well as
-[direct URLs](#pipeline-urls).
+[direct URLs](#pipeline-urls). For instance, you can specify the
+`en_core_web_sm` model for spaCy 3.7.x as follows:

```text {title="requirements.txt"}
spacy>=3.0.0,<4.0.0
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
```

+See the [list of models](https://spacy.io/models) for model download links for
+the current spaCy version.

All pipeline packages are versioned and specify their spaCy dependency. This
ensures cross-compatibility and lets you specify exact version requirements for
each pipeline. If you've [trained](/usage/training) your own pipeline, you can

@@ -173,7 +173,7 @@ detected, a corresponding warning is displayed. If you'd like to disable the
dependency check, set `check_requirements: false` in your project's
`project.yml`.

-### 4. Run a workflow {id="run-workfow"}
+### 4. Run a workflow {id="run-workflow"}

> #### project.yml
>

@@ -286,7 +286,7 @@ pipelines.
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
-| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
+| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |

@@ -306,7 +306,9 @@ installed in the same environment – that's it.

### Loading probability tables into existing models

-You can load a probability table from [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an existing spaCy model like `en_core_web_sm`.
+You can load a probability table from
+[spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an
+existing spaCy model like `en_core_web_sm`.

```python
# Requirements: pip install spacy-lookups-data

@@ -317,7 +319,8 @@ lookups = load_lookups("en", ["lexeme_prob"])
nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
```

-When training a model from scratch you can also specify probability tables in the `config.cfg`.
+When training a model from scratch you can also specify probability tables in
+the `config.cfg`.

```ini {title="config.cfg (excerpt)"}
[initialize.lookups]

@@ -346,8 +349,8 @@ them**!
To stick with the theme of
[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
consider the following custom spaCy
-[pipeline component](/usage/processing-pipelines#custom-coponents) that prints a
-snake when it's called:
+[pipeline component](/usage/processing-pipelines#custom-components) that prints
+a snake when it's called:

> #### Package directory structure
>

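A minimal sketch of such a component (the component name and the "snake" output here are placeholders, not the exact code from the post):

```python
import spacy
from spacy.language import Language

@Language.component("snek")
def snek_component(doc):
    # Print a tiny snake every time the component runs, then pass the Doc along.
    print("~ sss ~ 🐍")
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("snek")
nlp("hiss")
```
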
@@ -185,7 +185,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
✔ Good amount of examples for all labels
-✔ Examples without occurences available for all labels
+✔ Examples without occurrences available for all labels
✔ No entities consisting of or starting/ending with whitespace

=========================== Part-of-speech Tagging ===========================

@@ -138,7 +138,7 @@ backwards compatibility, the tuple format remains available under
`TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
in the [transformer API docs](/api/architectures#TransformerModel).

-`spacy-transfomers` v1.1 also adds support for `transformer_config` settings
+`spacy-transformers` v1.1 also adds support for `transformer_config` settings
such as `output_attentions`. Additional output is stored under
`TransformerData.model_output`. More details are in the
[TransformerModel docs](/api/architectures#TransformerModel). The training speed

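For illustration (assuming a transformer pipeline such as `en_core_web_trf` is installed; whether extra keys like `attentions` appear depends on the `transformer_config` settings you enable):

```python
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("Additional transformer output is stored on the doc.")
# model_output holds the full Hugging Face output; optional keys such as
# "attentions" only appear if enabled via transformer_config.
print(list(doc._.trf_data.model_output.keys()))
```
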
@@ -23,7 +23,6 @@
},
"docSearch": {
"appId": "Y1LB128RON",
-"apiKey": "bb601a1daab73e2dc66faf2b79564807",
"indexName": "spacy"
},
"binderUrl": "explosion/spacy-io-binder",

@@ -32,6 +32,9 @@ const nextConfig = withPWA(
ignoreBuildErrors: true,
},
images: { unoptimized: true },
+env: {
+DOCSEARCH_API_KEY: process.env.DOCSEARCH_API_KEY
+}
})
)

@@ -1,4 +1,4 @@
-import React, { useEffect, useState } from 'react'
+import React from 'react'
import PropTypes from 'prop-types'
import { DocSearch } from '@docsearch/react'
import '@docsearch/css'

@@ -6,7 +6,8 @@ import '@docsearch/css'
import siteMetadata from '../../meta/site.json'

export default function Search({ placeholder = 'Search docs' }) {
-const { apiKey, indexName, appId } = siteMetadata.docSearch
+const apiKey = process.env.DOCSEARCH_API_KEY
+const { indexName, appId } = siteMetadata.docSearch
return (
<DocSearch appId={appId} indexName={indexName} apiKey={apiKey} placeholder={placeholder} />
)

@@ -109,6 +109,8 @@
box-shadow: inset 1px 1px 1px rgba(0, 0, 0, 0.25)
background: var(--color-dark)
margin: 1.5rem 0 0 2rem
+position: sticky
+left: 2rem

.header
width: 100%