Merge pull request #13490 from svlandeg/feat/update_v4

Update v4 branch with latest from master
Sofie Van Landeghem 2024-05-14 22:41:17 +02:00 committed by GitHub
commit 818fdb537e
47 changed files with 1073 additions and 731 deletions

@@ -15,7 +15,7 @@ jobs:
         env:
           GITHUB_CONTEXT: ${{ toJson(github) }}
         run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
       - name: Install and run explosion-bot
         run: |

@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        branch: [master, main]
+        branch: [master, v4]
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:

@@ -16,7 +16,7 @@ jobs:
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
-      - uses: dessant/lock-threads@v4
+      - uses: dessant/lock-threads@v5
        with:
          process-only: 'issues'
          issue-inactive-days: '30'

@@ -9,12 +9,12 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        branch: [master, main]
+        branch: [master, v4]
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           ref: ${{ matrix.branch }}
       - name: Get commits from past 24 hours

@@ -18,7 +18,7 @@ jobs:
         run: |
           echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
           python-version: '3.10'

@@ -25,13 +25,12 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
           python-version: "3.9"
-          architecture: x64
       - name: black
         run: |

@@ -71,13 +70,12 @@ jobs:
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python_version }}
-          architecture: x64
       - name: Install dependencies
         run: |

@@ -20,13 +20,12 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
           python-version: "3.9"
-          architecture: x64
       - name: Validate website/meta/universe.json
         run: |

@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

@@ -9,9 +9,8 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.10.0
-smart-open>=5.2.1,<7.0.0
-weasel>=0.1.0,<0.4.0
+typer>=0.3.0,<1.0.0
+weasel>=0.1.0,<0.5.0
 # Third party dependencies
 numpy>=1.15.0; python_version < "3.9"
 numpy>=1.19.0; python_version >= "3.9"

@@ -41,10 +41,9 @@ install_requires =
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
-    weasel>=0.1.0,<0.4.0
+    weasel>=0.1.0,<0.5.0
     # Third-party dependencies
-    typer>=0.3.0,<0.10.0
-    smart-open>=5.2.1,<7.0.0
+    typer>=0.3.0,<1.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0; python_version < "3.9"
     numpy>=1.19.0; python_version >= "3.9"

@@ -1,5 +1,7 @@
 from wasabi import msg
 
+# Needed for testing
+from . import download as download_module  # noqa: F401
 from ._util import app, setup_cli  # noqa: F401
 from .apply import apply  # noqa: F401
 from .assemble import assemble_cli  # noqa: F401

@@ -1,5 +1,6 @@
 import sys
 from typing import Optional, Sequence
+from urllib.parse import urljoin
 
 import requests
 import typer

@@ -64,6 +65,13 @@ def download(
         )
         pip_args = pip_args + ("--no-deps",)
     if direct:
+        # Reject model names with '/', in order to prevent shenanigans.
+        if "/" in model:
+            msg.fail(
+                title="Model download rejected",
+                text=f"Cannot download model '{model}'. Models are expected to be file names, not URLs or fragments",
+                exits=True,
+            )
         components = model.split("-")
         model_name = "".join(components[:-1])
         version = components[-1]

@@ -156,7 +164,16 @@ def get_latest_version(model: str) -> str:
 def download_model(
     filename: str, user_pip_args: Optional[Sequence[str]] = None
 ) -> None:
-    download_url = about.__download_url__ + "/" + filename
+    # Construct the download URL carefully. We need to make sure we don't
+    # allow relative paths or other shenanigans to trick us into download
+    # from outside our own repo.
+    base_url = about.__download_url__
+    # urljoin requires that the path ends with /, or the last path part will be dropped
+    if not base_url.endswith("/"):
+        base_url = about.__download_url__ + "/"
+    download_url = urljoin(base_url, filename)
+    if not download_url.startswith(about.__download_url__):
+        raise ValueError(f"Download from {filename} rejected. Was it a relative path?")
     pip_args = list(user_pip_args) if user_pip_args is not None else []
     cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
     run_command(cmd)
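The rewritten `download_model` above leans on two `urljoin` behaviours that are easy to get wrong; a minimal standalone sketch (the base URL below is hypothetical, not spaCy's actual release URL):

```python
from urllib.parse import urljoin

base = "https://example.com/releases/download"  # hypothetical base URL

# Without a trailing slash, urljoin replaces the last path segment of the base.
print(urljoin(base, "en_core_web_sm-3.7.1"))
# https://example.com/releases/en_core_web_sm-3.7.1

# With the trailing slash (as added in download_model), the filename is appended.
print(urljoin(base + "/", "en_core_web_sm-3.7.1"))
# https://example.com/releases/download/en_core_web_sm-3.7.1

# Relative components can still escape the intended prefix, which is why the
# result is checked with startswith() against the original download URL.
print(urljoin(base + "/", "../en_core_web_sm-3.7.1"))
# https://example.com/releases/en_core_web_sm-3.7.1
```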

@@ -39,7 +39,7 @@ def find_threshold_cli(
     # fmt: on
 ):
     """
-    Runs prediction trials for a trained model with varying tresholds to maximize
+    Runs prediction trials for a trained model with varying thresholds to maximize
     the specified metric. The search space for the threshold is traversed linearly
     from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
     (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`

@@ -81,7 +81,7 @@ def find_threshold(
     silent: bool = True,
 ) -> Tuple[float, float, Dict[float, float]]:
     """
-    Runs prediction trials for models with varying tresholds to maximize the specified metric.
+    Runs prediction trials for models with varying thresholds to maximize the specified metric.
 
     model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
     data_path (Path): Path to file with DocBin with docs to use for threshold search.
     pipe_name (str): Name of pipe to examine thresholds for.

@@ -215,6 +215,7 @@ class Warnings(metaclass=ErrorsWithCodes):
             "key attribute for vectors, configure it through Vectors(attr=) or "
             "'spacy init vectors --attr'")
     W126 = ("These keys are unsupported: {unsupported}")
+    W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
     # v4 warning strings
     W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "

@@ -1844,6 +1844,9 @@ class Language:
             for proc in procs:
                 proc.join()
 
+            if not all(proc.exitcode == 0 for proc in procs):
+                warnings.warn(Warnings.W127)
+
     def _link_components(self) -> None:
         """Register 'listeners' within pipeline components, to allow them to
         effectively share weights.

@@ -2467,6 +2470,7 @@ def _apply_pipes(
             if isinstance(texts_with_ctx, _WorkDoneSentinel):
                 sender.close()
                 receiver.close()
+                return
 
             docs = (
                 ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx

@@ -2492,6 +2496,7 @@ def _apply_pipes(
             # stop processing.
             sender.close()
             receiver.close()
+            return
 
 
 class _Sender:
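Callers that want the new W127 warning to fail loudly can escalate warnings around `Language.pipe`; a minimal sketch (blank English pipeline, purely illustrative) along the lines of the test added later in this PR:

```python
import warnings

import spacy

nlp = spacy.blank("en")  # minimal pipeline just for illustration
texts = ["First text.", "Second text."]

# Escalate warnings to errors so a crashed worker process (W127) raises
# instead of silently yielding fewer docs.
with warnings.catch_warnings():
    warnings.simplefilter("error")
    docs = list(nlp.pipe(texts, n_process=2))

assert [doc.text for doc in docs] == texts
```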


@@ -164,32 +164,34 @@ cdef class Lexeme:
vector = self.vector vector = self.vector
return numpy.sqrt((vector**2).sum()) return numpy.sqrt((vector**2).sum())
property vector: @property
def vector(self):
"""A real-valued meaning representation. """A real-valued meaning representation.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the lexeme's semantics. representing the lexeme's semantics.
""" """
def __get__(self): cdef int length = self.vocab.vectors_length
cdef int length = self.vocab.vectors_length if length == 0:
if length == 0: raise ValueError(Errors.E010)
raise ValueError(Errors.E010) return self.vocab.get_vector(self.c.orth)
return self.vocab.get_vector(self.c.orth)
def __set__(self, vector): @vector.setter
if len(vector) != self.vocab.vectors_length: def vector(self, vector):
raise ValueError(Errors.E073.format(new_length=len(vector), if len(vector) != self.vocab.vectors_length:
length=self.vocab.vectors_length)) raise ValueError(Errors.E073.format(new_length=len(vector),
self.vocab.set_vector(self.c.orth, vector) length=self.vocab.vectors_length))
self.vocab.set_vector(self.c.orth, vector)
property rank: @property
def rank(self):
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used """RETURNS (str): Sequential ID of the lexeme's lexical type, used
to index into tables, e.g. for word vectors.""" to index into tables, e.g. for word vectors."""
def __get__(self): return self.c.id
return self.c.id
def __set__(self, value): @rank.setter
self.c.id = value def rank(self, value):
self.c.id = value
@property @property
def orth_(self): def orth_(self):
@@ -203,306 +205,338 @@ cdef class Lexeme:
"""RETURNS (str): The original verbatim text of the lexeme.""" """RETURNS (str): The original verbatim text of the lexeme."""
return self.orth_ return self.orth_
property lower: @property
def lower(self):
"""RETURNS (uint64): Lowercase form of the lexeme.""" """RETURNS (uint64): Lowercase form of the lexeme."""
def __get__(self): return self.c.lower
return self.c.lower
def __set__(self, attr_t x): @lower.setter
self.c.lower = x def lower(self, attr_t x):
self.c.lower = x
property norm: @property
def norm(self):
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
lexeme text. lexeme text.
""" """
def __get__(self): return self.c.norm
return self.c.norm
def __set__(self, attr_t x): @norm.setter
if "lexeme_norm" not in self.vocab.lookups: def norm(self, attr_t x):
self.vocab.lookups.add_table("lexeme_norm") if "lexeme_norm" not in self.vocab.lookups:
norm_table = self.vocab.lookups.get_table("lexeme_norm") self.vocab.lookups.add_table("lexeme_norm")
norm_table[self.c.orth] = self.vocab.strings[x] norm_table = self.vocab.lookups.get_table("lexeme_norm")
self.c.norm = x norm_table[self.c.orth] = self.vocab.strings[x]
self.c.norm = x
property shape: @property
def shape(self):
"""RETURNS (uint64): Transform of the word's string, to show """RETURNS (uint64): Transform of the word's string, to show
orthographic features. orthographic features.
""" """
def __get__(self): return self.c.shape
return self.c.shape
def __set__(self, attr_t x): @shape.setter
self.c.shape = x def shape(self, attr_t x):
self.c.shape = x
property prefix: @property
def prefix(self):
"""RETURNS (uint64): Length-N substring from the start of the word. """RETURNS (uint64): Length-N substring from the start of the word.
Defaults to `N=1`. Defaults to `N=1`.
""" """
def __get__(self): return self.c.prefix
return self.c.prefix
def __set__(self, attr_t x): @prefix.setter
self.c.prefix = x def prefix(self, attr_t x):
self.c.prefix = x
property suffix: @property
def suffix(self):
"""RETURNS (uint64): Length-N substring from the end of the word. """RETURNS (uint64): Length-N substring from the end of the word.
Defaults to `N=3`. Defaults to `N=3`.
""" """
def __get__(self): return self.c.suffix
return self.c.suffix
def __set__(self, attr_t x): @suffix.setter
self.c.suffix = x def suffix(self, attr_t x):
self.c.suffix = x
property cluster: @property
def cluster(self):
"""RETURNS (int): Brown cluster ID.""" """RETURNS (int): Brown cluster ID."""
def __get__(self): cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) return cluster_table.get(self.c.orth, 0)
return cluster_table.get(self.c.orth, 0)
def __set__(self, int x): @cluster.setter
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) def cluster(self, int x):
cluster_table[self.c.orth] = x cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
cluster_table[self.c.orth] = x
property lang: @property
def lang(self):
"""RETURNS (uint64): Language of the parent vocabulary.""" """RETURNS (uint64): Language of the parent vocabulary."""
def __get__(self): return self.c.lang
return self.c.lang
def __set__(self, attr_t x): @lang.setter
self.c.lang = x def lang(self, attr_t x):
self.c.lang = x
property prob: @property
def prob(self):
"""RETURNS (float): Smoothed log probability estimate of the lexeme's """RETURNS (float): Smoothed log probability estimate of the lexeme's
type.""" type."""
def __get__(self): prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
settings_table = self.vocab.lookups.get_table("lexeme_settings", {}) default_oov_prob = settings_table.get("oov_prob", -20.0)
default_oov_prob = settings_table.get("oov_prob", -20.0) return prob_table.get(self.c.orth, default_oov_prob)
return prob_table.get(self.c.orth, default_oov_prob)
def __set__(self, float x): @prob.setter
prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) def prob(self, float x):
prob_table[self.c.orth] = x prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
prob_table[self.c.orth] = x
property lower_: @property
def lower_(self):
"""RETURNS (str): Lowercase form of the word.""" """RETURNS (str): Lowercase form of the word."""
def __get__(self): return self.vocab.strings[self.c.lower]
return self.vocab.strings[self.c.lower]
def __set__(self, str x): @lower_.setter
self.c.lower = self.vocab.strings.add(x) def lower_(self, str x):
self.c.lower = self.vocab.strings.add(x)
property norm_: @property
def norm_(self):
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the """RETURNS (str): The lexeme's norm, i.e. a normalised form of the
lexeme text. lexeme text.
""" """
def __get__(self): return self.vocab.strings[self.c.norm]
return self.vocab.strings[self.c.norm]
def __set__(self, str x): @norm_.setter
self.norm = self.vocab.strings.add(x) def norm_(self, str x):
self.norm = self.vocab.strings.add(x)
property shape_: @property
def shape_(self):
"""RETURNS (str): Transform of the word's string, to show """RETURNS (str): Transform of the word's string, to show
orthographic features. orthographic features.
""" """
def __get__(self): return self.vocab.strings[self.c.shape]
return self.vocab.strings[self.c.shape]
def __set__(self, str x): @shape_.setter
self.c.shape = self.vocab.strings.add(x) def shape_(self, str x):
self.c.shape = self.vocab.strings.add(x)
property prefix_: @property
def prefix_(self):
"""RETURNS (str): Length-N substring from the start of the word. """RETURNS (str): Length-N substring from the start of the word.
Defaults to `N=1`. Defaults to `N=1`.
""" """
def __get__(self): return self.vocab.strings[self.c.prefix]
return self.vocab.strings[self.c.prefix]
def __set__(self, str x): @prefix_.setter
self.c.prefix = self.vocab.strings.add(x) def prefix_(self, str x):
self.c.prefix = self.vocab.strings.add(x)
property suffix_: @property
def suffix_(self):
"""RETURNS (str): Length-N substring from the end of the word. """RETURNS (str): Length-N substring from the end of the word.
Defaults to `N=3`. Defaults to `N=3`.
""" """
def __get__(self): return self.vocab.strings[self.c.suffix]
return self.vocab.strings[self.c.suffix]
def __set__(self, str x): @suffix_.setter
self.c.suffix = self.vocab.strings.add(x) def suffix_(self, str x):
self.c.suffix = self.vocab.strings.add(x)
property lang_: @property
def lang_(self):
"""RETURNS (str): Language of the parent vocabulary.""" """RETURNS (str): Language of the parent vocabulary."""
def __get__(self): return self.vocab.strings[self.c.lang]
return self.vocab.strings[self.c.lang]
def __set__(self, str x): @lang_.setter
self.c.lang = self.vocab.strings.add(x) def lang_(self, str x):
self.c.lang = self.vocab.strings.add(x)
property flags: @property
def flags(self):
"""RETURNS (uint64): Container of the lexeme's binary flags.""" """RETURNS (uint64): Container of the lexeme's binary flags."""
def __get__(self): return self.c.flags
return self.c.flags
def __set__(self, flags_t x): @flags.setter
self.c.flags = x def flags(self, flags_t x):
self.c.flags = x
@property @property
def is_oov(self): def is_oov(self):
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary.""" """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
return self.orth not in self.vocab.vectors return self.orth not in self.vocab.vectors
property is_stop: @property
def is_stop(self):
"""RETURNS (bool): Whether the lexeme is a stop word.""" """RETURNS (bool): Whether the lexeme is a stop word."""
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
return Lexeme.c_check_flag(self.c, IS_STOP)
def __set__(self, bint x): @is_stop.setter
Lexeme.c_set_flag(self.c, IS_STOP, x) def is_stop(self, bint x):
Lexeme.c_set_flag(self.c, IS_STOP, x)
property is_alpha: @property
def is_alpha(self):
"""RETURNS (bool): Whether the lexeme consists of alphabetic """RETURNS (bool): Whether the lexeme consists of alphabetic
characters. Equivalent to `lexeme.text.isalpha()`. characters. Equivalent to `lexeme.text.isalpha()`.
""" """
def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA)
return Lexeme.c_check_flag(self.c, IS_ALPHA)
def __set__(self, bint x): @is_alpha.setter
Lexeme.c_set_flag(self.c, IS_ALPHA, x) def is_alpha(self, bint x):
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
property is_ascii: @property
def is_ascii(self):
"""RETURNS (bool): Whether the lexeme consists of ASCII characters. """RETURNS (bool): Whether the lexeme consists of ASCII characters.
Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
""" """
def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII)
return Lexeme.c_check_flag(self.c, IS_ASCII)
def __set__(self, bint x): @is_ascii.setter
Lexeme.c_set_flag(self.c, IS_ASCII, x) def is_ascii(self, bint x):
Lexeme.c_set_flag(self.c, IS_ASCII, x)
property is_digit: @property
def is_digit(self):
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
to `lexeme.text.isdigit()`. to `lexeme.text.isdigit()`.
""" """
def __get__(self): return Lexeme.c_check_flag(self.c, IS_DIGIT)
return Lexeme.c_check_flag(self.c, IS_DIGIT)
def __set__(self, bint x): @is_digit.setter
Lexeme.c_set_flag(self.c, IS_DIGIT, x) def is_digit(self, bint x):
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
property is_lower: @property
def is_lower(self):
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
`lexeme.text.islower()`. `lexeme.text.islower()`.
""" """
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LOWER)
return Lexeme.c_check_flag(self.c, IS_LOWER)
def __set__(self, bint x): @is_lower.setter
Lexeme.c_set_flag(self.c, IS_LOWER, x) def is_lower(self, bint x):
Lexeme.c_set_flag(self.c, IS_LOWER, x)
property is_upper: @property
def is_upper(self):
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
`lexeme.text.isupper()`. `lexeme.text.isupper()`.
""" """
def __get__(self): return Lexeme.c_check_flag(self.c, IS_UPPER)
return Lexeme.c_check_flag(self.c, IS_UPPER)
def __set__(self, bint x): @is_upper.setter
Lexeme.c_set_flag(self.c, IS_UPPER, x) def is_upper(self, bint x):
Lexeme.c_set_flag(self.c, IS_UPPER, x)
property is_title: @property
def is_title(self):
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
`lexeme.text.istitle()`. `lexeme.text.istitle()`.
""" """
def __get__(self): return Lexeme.c_check_flag(self.c, IS_TITLE)
return Lexeme.c_check_flag(self.c, IS_TITLE)
def __set__(self, bint x): @is_title.setter
Lexeme.c_set_flag(self.c, IS_TITLE, x) def is_title(self, bint x):
Lexeme.c_set_flag(self.c, IS_TITLE, x)
property is_punct: @property
def is_punct(self):
"""RETURNS (bool): Whether the lexeme is punctuation.""" """RETURNS (bool): Whether the lexeme is punctuation."""
def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT)
return Lexeme.c_check_flag(self.c, IS_PUNCT)
def __set__(self, bint x): @is_punct.setter
Lexeme.c_set_flag(self.c, IS_PUNCT, x) def is_punct(self, bint x):
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
property is_space: @property
def is_space(self):
"""RETURNS (bool): Whether the lexeme consist of whitespace characters. """RETURNS (bool): Whether the lexeme consist of whitespace characters.
Equivalent to `lexeme.text.isspace()`. Equivalent to `lexeme.text.isspace()`.
""" """
def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
return Lexeme.c_check_flag(self.c, IS_SPACE)
def __set__(self, bint x): @is_space.setter
Lexeme.c_set_flag(self.c, IS_SPACE, x) def is_space(self, bint x):
Lexeme.c_set_flag(self.c, IS_SPACE, x)
property is_bracket: @property
def is_bracket(self):
"""RETURNS (bool): Whether the lexeme is a bracket.""" """RETURNS (bool): Whether the lexeme is a bracket."""
def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET)
return Lexeme.c_check_flag(self.c, IS_BRACKET)
def __set__(self, bint x): @is_bracket.setter
Lexeme.c_set_flag(self.c, IS_BRACKET, x) def is_bracket(self, bint x):
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
property is_quote: @property
def is_quote(self):
"""RETURNS (bool): Whether the lexeme is a quotation mark.""" """RETURNS (bool): Whether the lexeme is a quotation mark."""
def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE)
return Lexeme.c_check_flag(self.c, IS_QUOTE)
def __set__(self, bint x): @is_quote.setter
Lexeme.c_set_flag(self.c, IS_QUOTE, x) def is_quote(self, bint x):
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
property is_left_punct: @property
def is_left_punct(self):
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (.""" """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
def __set__(self, bint x): @is_left_punct.setter
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) def is_left_punct(self, bint x):
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
property is_right_punct: @property
def is_right_punct(self):
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. ).""" """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x): @is_right_punct.setter
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) def is_right_punct(self, bint x):
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property is_currency: @property
def is_currency(self):
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €.""" """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
def __get__(self): return Lexeme.c_check_flag(self.c, IS_CURRENCY)
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
def __set__(self, bint x): @is_currency.setter
Lexeme.c_set_flag(self.c, IS_CURRENCY, x) def is_currency(self, bint x):
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
property like_url: @property
def like_url(self):
"""RETURNS (bool): Whether the lexeme resembles a URL.""" """RETURNS (bool): Whether the lexeme resembles a URL."""
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x): @like_url.setter
Lexeme.c_set_flag(self.c, LIKE_URL, x) def like_url(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_URL, x)
property like_num: @property
def like_num(self):
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9", """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
"10", "ten", etc. "10", "ten", etc.
""" """
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM)
return Lexeme.c_check_flag(self.c, LIKE_NUM)
def __set__(self, bint x): @like_num.setter
Lexeme.c_set_flag(self.c, LIKE_NUM, x) def like_num(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
property like_email: @property
def like_email(self):
"""RETURNS (bool): Whether the lexeme resembles an email address.""" """RETURNS (bool): Whether the lexeme resembles an email address."""
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
def __set__(self, bint x): @like_email.setter
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) def like_email(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
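The rest of this file, and the similar diffs to tokenizer.pyx, doc.pyx and span.pyx further down, apply one mechanical refactor: Cython `property name:` blocks with `__get__`/`__set__` become decorator-based properties. A minimal sketch of the target pattern, written as plain Python rather than the actual `cdef class`:

```python
class ExampleLexeme:
    """Stand-in class illustrating the decorator-based property style."""

    def __init__(self):
        self._lower = 0

    @property
    def lower(self):
        # Replaces the old `property lower:` block's __get__.
        return self._lower

    @lower.setter
    def lower(self, value):
        # Replaces the old block's __set__.
        self._lower = value


lex = ExampleLexeme()
lex.lower = 42
assert lex.lower == 42
```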

@@ -241,6 +241,7 @@ def _build_parametric_attention_with_residual_nonlinear(
 
     parametric_attention.set_ref("tok2vec", tok2vec)
     parametric_attention.set_ref("attention_layer", attention_layer)
+    parametric_attention.set_ref("key_transform", key_transform)
     parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
     parametric_attention.set_ref("norm_layer", norm_layer)

@@ -248,10 +249,19 @@ def _build_parametric_attention_with_residual_nonlinear(
 
 def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
+    # When tok2vec is lazily initialized, we need to initialize it before
+    # the rest of the chain to ensure that we can get its width.
+    tok2vec = model.get_ref("tok2vec")
+    tok2vec.initialize(X)
+
     tok2vec_width = get_tok2vec_width(model)
     model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
+    if model.get_ref("key_transform").has_dim("nI") is None:
+        model.get_ref("key_transform").set_dim("nI", tok2vec_width)
+    if model.get_ref("key_transform").has_dim("nO") is None:
+        model.get_ref("key_transform").set_dim("nO", tok2vec_width)
     model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
+    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
     model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
     model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
     init_chain(model, X, Y)


@@ -24,7 +24,6 @@ from .. import util
 from ..errors import Errors, Warnings
 from ..kb import Candidate, KnowledgeBase
 from ..language import Language
-from ..ml import empty_kb
 from ..scorer import Scorer
 from ..tokens import Doc, Span, SpanGroup
 from ..training import Example, validate_examples, validate_get_examples
@@ -114,7 +113,7 @@ def make_entity_linker(
         documents with textual mentions.
     generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
     scorer (Optional[Callable]): The scoring method.
-    use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
+    use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another
         component must provide entity annotations.
     threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
         prediction is discarded. If None, predictions are not filtered by any threshold.
@@ -227,7 +226,6 @@ class EntityLinker(TrainablePipe):
         self.cfg: Dict[str, Any] = {"overwrite": overwrite}
         self.distance = CosineDistance(normalize=False)
         self.kb = generate_empty_kb(self.vocab, entity_vector_length)
-        self.scorer = scorer
         self.use_gold_ents = use_gold_ents
         self.threshold = threshold
         self.save_activations = save_activations
@@ -235,6 +233,37 @@ class EntityLinker(TrainablePipe):
if self.incl_prior and not self.kb.supports_prior_probs: if self.incl_prior and not self.kb.supports_prior_probs:
warnings.warn(Warnings.W401) warnings.warn(Warnings.W401)
def _score_with_ents_set(examples: Iterable[Example], **kwargs):
# Because of how spaCy works, we can't just score immediately, because Language.evaluate
# calls pipe() on the predicted docs, which won't have entities if there is no NER in the pipeline.
if not scorer:
return scorer
if not self.use_gold_ents:
return scorer(examples, **kwargs)
else:
examples = self._ensure_ents(examples)
docs = self.pipe(
(eg.predicted for eg in examples),
)
for eg, doc in zip(examples, docs):
eg.predicted = doc
return scorer(examples, **kwargs)
self.scorer = _score_with_ents_set
def _ensure_ents(self, examples: Iterable[Example]) -> Iterable[Example]:
"""If use_gold_ents is true, set the gold entities to (a copy of) eg.predicted."""
if not self.use_gold_ents:
return examples
new_examples = []
for eg in examples:
ents, _ = eg.get_aligned_ents_and_ner()
new_eg = eg.copy()
new_eg.predicted.ents = ents
new_examples.append(new_eg)
return new_examples
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will """Define the KB of this pipe by providing a function that will
create it using this object's vocab.""" create it using this object's vocab."""
@@ -276,11 +305,9 @@ class EntityLinker(TrainablePipe):
nO = self.kb.entity_vector_length nO = self.kb.entity_vector_length
doc_sample = [] doc_sample = []
vector_sample = [] vector_sample = []
for eg in islice(get_examples(), 10): examples = self._ensure_ents(islice(get_examples(), 10))
for eg in examples:
doc = eg.x doc = eg.x
if self.use_gold_ents:
ents, _ = eg.get_aligned_ents_and_ner()
doc.ents = ents
doc_sample.append(doc) doc_sample.append(doc)
vector_sample.append(self.model.ops.alloc1f(nO)) vector_sample.append(self.model.ops.alloc1f(nO))
assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
@@ -347,31 +374,17 @@ class EntityLinker(TrainablePipe):
losses.setdefault(self.name, 0.0) losses.setdefault(self.name, 0.0)
if not examples: if not examples:
return losses return losses
examples = self._ensure_ents(examples)
validate_examples(examples, "EntityLinker.update") validate_examples(examples, "EntityLinker.update")
set_dropout_rate(self.model, drop)
docs = [eg.predicted for eg in examples]
# save to restore later
old_ents = [doc.ents for doc in docs]
for doc, ex in zip(docs, examples):
if self.use_gold_ents:
ents, _ = ex.get_aligned_ents_and_ner()
doc.ents = ents
else:
# only keep matching ents
doc.ents = ex.get_matching_ents()
# make sure we have something to learn from, if not, short-circuit # make sure we have something to learn from, if not, short-circuit
if not self.batch_has_learnable_example(examples): if not self.batch_has_learnable_example(examples):
return losses return losses
set_dropout_rate(self.model, drop)
docs = [eg.predicted for eg in examples]
sentence_encodings, bp_context = self.model.begin_update(docs) sentence_encodings, bp_context = self.model.begin_update(docs)
# now restore the ents
for doc, old in zip(docs, old_ents):
doc.ents = old
loss, d_scores = self.get_loss( loss, d_scores = self.get_loss(
sentence_encodings=sentence_encodings, examples=examples sentence_encodings=sentence_encodings, examples=examples
) )
@@ -379,11 +392,13 @@ class EntityLinker(TrainablePipe):
if sgd is not None: if sgd is not None:
self.finish_update(sgd) self.finish_update(sgd)
losses[self.name] += loss losses[self.name] += loss
return losses return losses
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
validate_examples(examples, "EntityLinker.get_loss") validate_examples(examples, "EntityLinker.get_loss")
entity_encodings = [] entity_encodings = []
# We assume that get_loss is called with gold ents set in the examples if need be
eidx = 0 # indices in gold entities to keep eidx = 0 # indices in gold entities to keep
keep_ents = [] # indices in sentence_encodings to keep keep_ents = [] # indices in sentence_encodings to keep


@@ -799,7 +799,7 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on
 
-def test_overfitting_IO():
+def test_overfitting_IO_gold_entities():
     # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
     nlp = English()
     vector_length = 3
@@ -826,7 +826,9 @@ def test_overfitting_IO():
         return mykb
 
     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.add_pipe("entity_linker", last=True)
+    entity_linker = nlp.add_pipe(
+        "entity_linker", last=True, config={"use_gold_ents": True}
+    )
     assert isinstance(entity_linker, EntityLinker)
     entity_linker.set_kb(create_kb)
     assert "Q2146908" in entity_linker.vocab.strings
@@ -889,6 +891,107 @@ def test_overfitting_IO():
assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, batch_deps_2)
assert_equal(batch_deps_1, no_batch_deps) assert_equal(batch_deps_1, no_batch_deps)
eval = nlp.evaluate(train_examples)
assert "nel_macro_p" in eval
assert "nel_macro_r" in eval
assert "nel_macro_f" in eval
assert "nel_micro_p" in eval
assert "nel_micro_r" in eval
assert "nel_micro_f" in eval
assert "nel_f_per_type" in eval
assert "PERSON" in eval["nel_f_per_type"]
assert eval["nel_macro_f"] > 0
assert eval["nel_micro_f"] > 0
def test_overfitting_IO_with_ner():
# Simple test to try and overfit the NER and NEL component in combination - ensuring the ML models work correctly
nlp = English()
vector_length = 3
assert "Q2146908" not in nlp.vocab.strings
# Convert the texts to docs to make sure we have doc.ents set for the training examples
train_examples = []
for text, annotation in TRAIN_DATA:
doc = nlp(text)
train_examples.append(Example.from_dict(doc, annotation))
def create_kb(vocab):
# create artificial KB - assign same prior weight to the two russ cochran's
# Q2146908 (Russ Cochran): American golfer
# Q7381115 (Russ Cochran): publisher
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
alias="Russ Cochran",
entities=["Q2146908", "Q7381115"],
probabilities=[0.5, 0.5],
)
return mykb
# Create the NER and EL components and add them to the pipeline
ner = nlp.add_pipe("ner", first=True)
entity_linker = nlp.add_pipe(
"entity_linker", last=True, config={"use_gold_ents": False}
)
entity_linker.set_kb(create_kb)
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for ent in annotations.get("entities"):
ner.add_label(ent[2])
optimizer = nlp.initialize()
# train the NER and NEL pipes
for i in range(50):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["ner"] < 0.001
assert losses["entity_linker"] < 0.001
# adding additional components that are required for the entity_linker
nlp.add_pipe("sentencizer", first=True)
# test the trained model
test_text = "Russ Cochran captured his first major title with his son as caddie."
doc = nlp(test_text)
ents = doc.ents
assert len(ents) == 1
assert ents[0].text == "Russ Cochran"
assert ents[0].label_ == "PERSON"
assert ents[0].kb_id_ != "NIL"
# TODO: below assert is still flaky - EL doesn't properly overfit quite yet
# assert ents[0].kb_id_ == "Q2146908"
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
assert nlp2.pipe_names == nlp.pipe_names
doc2 = nlp2(test_text)
ents2 = doc2.ents
assert len(ents2) == 1
assert ents2[0].text == "Russ Cochran"
assert ents2[0].label_ == "PERSON"
assert ents2[0].kb_id_ != "NIL"
eval = nlp.evaluate(train_examples)
assert "nel_macro_f" in eval
assert "nel_micro_f" in eval
assert "ents_f" in eval
assert "nel_f_per_type" in eval
assert "ents_per_type" in eval
assert "PERSON" in eval["nel_f_per_type"]
assert "PERSON" in eval["ents_per_type"]
assert eval["nel_macro_f"] > 0
assert eval["nel_micro_f"] > 0
assert eval["ents_f"] > 0
def test_kb_serialization(): def test_kb_serialization():
# Test that the KB can be used in a pipeline with a different vocab # Test that the KB can be used in a pipeline with a different vocab


@@ -29,6 +29,8 @@ from spacy.tokens import Doc, DocBin
 from spacy.training import Example
 from spacy.training.initialize import init_nlp
 
+# Ensure that the architecture gets added to the registry.
+from ..tok2vec import build_lazy_init_tok2vec as _
 from ..util import make_tempdir
 
 TRAIN_DATA_SINGLE_LABEL = [
@@ -41,6 +43,13 @@ TRAIN_DATA_MULTI_LABEL = [
("I'm confused but happy", {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}), ("I'm confused but happy", {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}),
] ]
lazy_init_model_config = """
[model]
@architectures = "test.LazyInitTok2Vec.v1"
width = 96
"""
LAZY_INIT_TOK2VEC_MODEL = Config().from_str(lazy_init_model_config)["model"]
def make_get_examples_single_label(nlp): def make_get_examples_single_label(nlp):
train_examples = [] train_examples = []
@@ -551,6 +560,34 @@ def test_error_with_multi_labels():
nlp.initialize(get_examples=lambda: train_examples) nlp.initialize(get_examples=lambda: train_examples)
# fmt: off
@pytest.mark.parametrize(
"name,textcat_config",
[
# ENSEMBLE V2
("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
# PARAMETRIC ATTENTION V1
("textcat", {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": False}),
# REDUCE
("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
],
)
# fmt: on
def test_tok2vec_lazy_init(name, textcat_config):
# Check that we can properly initialize and use a textcat model using
# a lazily-initialized tok2vec.
nlp = English()
pipe_config = {"model": textcat_config}
textcat = nlp.add_pipe(name, config=pipe_config)
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.initialize()
nlp.pipe(["This is a test."])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,get_examples, train_data", "name,get_examples, train_data",
[ [

@@ -12,7 +12,7 @@ from thinc.api import Config
 
 import spacy
 from spacy import about
-from spacy.cli import info
+from spacy.cli import download_module, info
 from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory
 from spacy.cli.apply import apply
 from spacy.cli.debug_data import (

@@ -1066,3 +1066,15 @@ def test_debug_data_trainable_lemmatizer_not_annotated():
 def test_project_api_imports():
     from spacy.cli import project_run
     from spacy.cli.project.run import project_run  # noqa: F401, F811
+
+
+def test_download_rejects_relative_urls(monkeypatch):
+    """Test that we can't tell spacy download to get an arbitrary model by using a
+    relative path in the filename"""
+    monkeypatch.setattr(download_module, "run_command", lambda cmd: None)
+
+    # Check that normal download works
+    download_module.download("en_core_web_sm-3.7.1", direct=True)
+
+    with pytest.raises(SystemExit):
+        download_module.download("../en_core_web_sm-3.7.1", direct=True)


@@ -1,5 +1,6 @@
 import itertools
 import logging
+import warnings
 from unittest import mock
 
 import pytest
@@ -423,7 +424,7 @@ def test_language_pipe_error_handler(n_process):
     nlp.set_error_handler(raise_error)
     with pytest.raises(ValueError):
         list(nlp.pipe(texts, n_process=n_process))
-    # set explicitely to ignoring
+    # set explicitly to ignoring
     nlp.set_error_handler(ignore_error)
     docs = list(nlp.pipe(texts, n_process=n_process))
     assert len(docs) == 0
@@ -834,9 +835,13 @@ def test_pass_doc_to_pipeline(nlp, n_process):
     assert doc.text == texts[0]
     assert len(doc.cats) > 0
     if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
-        docs = nlp.pipe(docs, n_process=n_process)
-        assert [doc.text for doc in docs] == texts
-        assert all(len(doc.cats) for doc in docs)
+        # Catch warnings to ensure that all worker processes exited
+        # succesfully.
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")
+            docs = nlp.pipe(docs, n_process=n_process)
+            assert [doc.text for doc in docs] == texts
+            assert all(len(doc.cats) for doc in docs)
 
 
 def test_invalid_arg_to_pipeline(nlp):

spacy/tests/tok2vec.py (new file, 36 lines)

@@ -0,0 +1,36 @@
from typing import List
from thinc.api import Model
from thinc.types import Floats2d
from spacy.tokens import Doc
from spacy.util import registry
@registry.architectures("test.LazyInitTok2Vec.v1")
def build_lazy_init_tok2vec(*, width: int) -> Model[List[Doc], List[Floats2d]]:
"""tok2vec model of which the output size is only known after
initialization. This implementation does not output meaningful
embeddings, it is strictly for testing."""
return Model(
"lazy_init_tok2vec",
lazy_init_tok2vec_forward,
init=lazy_init_tok2vec_init,
dims={"nO": None},
attrs={"width": width},
)
def lazy_init_tok2vec_init(model: Model, X=None, Y=None):
width = model.attrs["width"]
model.set_dim("nO", width)
def lazy_init_tok2vec_forward(model: Model, X: List[Doc], is_train: bool):
width = model.get_dim("nO")
Y = [model.ops.alloc2f(len(doc), width) for doc in X]
def backprop(dY):
return []
return Y, backprop


@@ -71,65 +71,72 @@ cdef class Tokenizer:
self._special_matcher = PhraseMatcher(self.vocab) self._special_matcher = PhraseMatcher(self.vocab)
self._load_special_cases(rules) self._load_special_cases(rules)
property token_match: @property
def __get__(self): def token_match(self):
return self._token_match return self._token_match
def __set__(self, token_match): @token_match.setter
self._token_match = token_match def token_match(self, token_match):
self._reload_special_cases() self._token_match = token_match
self._reload_special_cases()
property url_match: @property
def __get__(self): def url_match(self):
return self._url_match return self._url_match
def __set__(self, url_match): @url_match.setter
self._url_match = url_match def url_match(self, url_match):
self._reload_special_cases() self._url_match = url_match
self._reload_special_cases()
property prefix_search: @property
def __get__(self): def prefix_search(self):
return self._prefix_search return self._prefix_search
def __set__(self, prefix_search): @prefix_search.setter
self._prefix_search = prefix_search def prefix_search(self, prefix_search):
self._reload_special_cases() self._prefix_search = prefix_search
self._reload_special_cases()
property suffix_search: @property
def __get__(self): def suffix_search(self):
return self._suffix_search return self._suffix_search
def __set__(self, suffix_search): @suffix_search.setter
self._suffix_search = suffix_search def suffix_search(self, suffix_search):
self._reload_special_cases() self._suffix_search = suffix_search
self._reload_special_cases()
property infix_finditer: @property
def __get__(self): def infix_finditer(self):
return self._infix_finditer return self._infix_finditer
def __set__(self, infix_finditer): @infix_finditer.setter
self._infix_finditer = infix_finditer def infix_finditer(self, infix_finditer):
self._reload_special_cases() self._infix_finditer = infix_finditer
self._reload_special_cases()
property rules: @property
def __get__(self): def rules(self):
return self._rules return self._rules
def __set__(self, rules): @rules.setter
self._rules = {} def rules(self, rules):
self._flush_cache() self._rules = {}
self._flush_specials() self._flush_cache()
self._cache = PreshMap() self._flush_specials()
self._specials = PreshMap() self._cache = PreshMap()
self._load_special_cases(rules) self._specials = PreshMap()
self._load_special_cases(rules)
property faster_heuristics: @property
def __get__(self): def faster_heuristics(self):
return self._faster_heuristics return self._faster_heuristics
def __set__(self, faster_heuristics): @faster_heuristics.setter
self._faster_heuristics = faster_heuristics def faster_heuristics(self, faster_heuristics):
self._reload_special_cases() self._faster_heuristics = faster_heuristics
self._reload_special_cases()
def __reduce__(self): def __reduce__(self):
args = (self.vocab, args = (self.vocab,


@@ -667,7 +667,8 @@ cdef class Doc:
else: else:
return False return False
property vector: @property
def vector(self):
"""A real-valued meaning representation. Defaults to an average of the """A real-valued meaning representation. Defaults to an average of the
token vectors. token vectors.
@@ -676,45 +677,46 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#vector DOCS: https://spacy.io/api/doc#vector
""" """
def __get__(self): if "vector" in self.user_hooks:
if "vector" in self.user_hooks: return self.user_hooks["vector"](self)
return self.user_hooks["vector"](self) if self._vector is not None:
if self._vector is not None: return self._vector
return self._vector xp = get_array_module(self.vocab.vectors.data)
xp = get_array_module(self.vocab.vectors.data) if not len(self):
if not len(self): self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") return self._vector
return self._vector elif self.vocab.vectors.size > 0:
elif self.vocab.vectors.size > 0: self._vector = sum(t.vector for t in self) / len(self)
self._vector = sum(t.vector for t in self) / len(self) return self._vector
return self._vector else:
else: return xp.zeros((self.vocab.vectors_length,), dtype="float32")
return xp.zeros((self.vocab.vectors_length,), dtype="float32")
def __set__(self, value): @vector.setter
self._vector = value def vector(self, value):
self._vector = value
property vector_norm: @property
def vector_norm(self):
"""The L2 norm of the document's vector representation. """The L2 norm of the document's vector representation.
RETURNS (float): The L2 norm of the vector representation. RETURNS (float): The L2 norm of the vector representation.
DOCS: https://spacy.io/api/doc#vector_norm DOCS: https://spacy.io/api/doc#vector_norm
""" """
def __get__(self): if "vector_norm" in self.user_hooks:
if "vector_norm" in self.user_hooks: return self.user_hooks["vector_norm"](self)
return self.user_hooks["vector_norm"](self) cdef float value
cdef float value cdef double norm = 0
cdef double norm = 0 if self._vector_norm is None:
if self._vector_norm is None: norm = 0.0
norm = 0.0 for value in self.vector:
for value in self.vector: norm += value * value
norm += value * value self._vector_norm = sqrt(norm) if norm != 0 else 0
self._vector_norm = sqrt(norm) if norm != 0 else 0 return self._vector_norm
return self._vector_norm
def __set__(self, value): @vector_norm.setter
self._vector_norm = value def vector_norm(self, value):
self._vector_norm = value
@property @property
def text(self): def text(self):
@@ -733,7 +735,8 @@ cdef class Doc:
""" """
return self.text return self.text
property ents: @property
def ents(self):
"""The named entities in the document. Returns a list of named entity """The named entities in the document. Returns a list of named entity
`Span` objects, if the entity recognizer has been applied. `Span` objects, if the entity recognizer has been applied.
@@ -741,55 +744,55 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#ents DOCS: https://spacy.io/api/doc#ents
""" """
def __get__(self): cdef int i
cdef int i cdef const TokenC* token
cdef const TokenC* token cdef int start = -1
cdef int start = -1 cdef attr_t label = 0
cdef attr_t label = 0 cdef attr_t kb_id = 0
cdef attr_t kb_id = 0 cdef attr_t ent_id = 0
cdef attr_t ent_id = 0 output = []
output = [] for i in range(self.length):
for i in range(self.length): token = &self.c[i]
token = &self.c[i] if token.ent_iob == 1:
if token.ent_iob == 1: if start == -1:
if start == -1: seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]]
seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]] raise ValueError(Errors.E093.format(seq=" ".join(seq)))
raise ValueError(Errors.E093.format(seq=" ".join(seq))) elif token.ent_iob == 2 or token.ent_iob == 0 or \
elif token.ent_iob == 2 or token.ent_iob == 0 or \ (token.ent_iob == 3 and token.ent_type == 0):
(token.ent_iob == 3 and token.ent_type == 0): if start != -1:
if start != -1: output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id))
output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) start = -1
start = -1 label = 0
label = 0 kb_id = 0
kb_id = 0 ent_id = 0
ent_id = 0 elif token.ent_iob == 3:
elif token.ent_iob == 3: if start != -1:
if start != -1: output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id))
output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) start = i
start = i label = token.ent_type
label = token.ent_type kb_id = token.ent_kb_id
kb_id = token.ent_kb_id ent_id = token.ent_id
ent_id = token.ent_id if start != -1:
if start != -1: output.append(Span(self, start, self.length, label=label, kb_id=kb_id, span_id=ent_id))
output.append(Span(self, start, self.length, label=label, kb_id=kb_id, span_id=ent_id)) # remove empty-label spans
# remove empty-label spans output = [o for o in output if o.label_ != ""]
output = [o for o in output if o.label_ != ""] return tuple(output)
return tuple(output)
def __set__(self, ents): @ents.setter
# TODO: def ents(self, ents):
# 1. Test basic data-driven ORTH gazetteer # TODO:
# 2. Test more nuanced date and currency regex # 1. Test basic data-driven ORTH gazetteer
cdef attr_t kb_id, ent_id # 2. Test more nuanced date and currency regex
cdef int ent_start, ent_end cdef attr_t kb_id, ent_id
ent_spans = [] cdef int ent_start, ent_end
for ent_info in ents: ent_spans = []
entity_type_, kb_id, ent_start, ent_end, ent_id = get_entity_info(ent_info) for ent_info in ents:
if isinstance(entity_type_, str): entity_type_, kb_id, ent_start, ent_end, ent_id = get_entity_info(ent_info)
self.vocab.strings.add(entity_type_) if isinstance(entity_type_, str):
span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id, span_id=ent_id) self.vocab.strings.add(entity_type_)
ent_spans.append(span) span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id, span_id=ent_id)
self.set_ents(ent_spans, default=SetEntsDefault.outside) ent_spans.append(span)
self.set_ents(ent_spans, default=SetEntsDefault.outside)
def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside): def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside):
"""Set entity annotation. """Set entity annotation.

View File

@@ -786,110 +786,130 @@ cdef class Span:
        for word in self.rights:
            yield from word.subtree

-    property start:
-        def __get__(self):
+    @property
+    def start(self):
        return self.span_c().start

-        def __set__(self, int start):
+    @start.setter
+    def start(self, int start):
        if start < 0 or start > self.doc.length:
            raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
        cdef SpanC * span_c = self.span_c()
        if start > span_c.end:
            raise ValueError(
                Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
        span_c.start = start
        span_c.start_char = self.doc.c[start].idx

-    property end:
-        def __get__(self):
+    @property
+    def end(self):
        return self.span_c().end

-        def __set__(self, int end):
+    @end.setter
+    def end(self, int end):
        if end < 0 or end > self.doc.length:
            raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
        cdef SpanC * span_c = self.span_c()
        if span_c.start > end:
            raise ValueError(
                Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
        span_c.end = end
        if end > 0:
            span_c.end_char = self.doc.c[end - 1].idx + self.doc.c[end - 1].lex.length
        else:
            span_c.end_char = 0

-    property start_char:
-        def __get__(self):
+    @property
+    def start_char(self):
        return self.span_c().start_char

-        def __set__(self, int start_char):
+    @start_char.setter
+    def start_char(self, int start_char):
        if start_char < 0 or start_char > len(self.doc.text):
            raise IndexError(
                Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
        cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
        if start < 0:
            raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
        cdef SpanC * span_c = self.span_c()
        if start_char > span_c.end_char:
            raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char",
                                                 existing_value=span_c.end_char))
        span_c.start_char = start_char
        span_c.start = start

-    property end_char:
-        def __get__(self):
+    @property
+    def end_char(self):
        return self.span_c().end_char

-        def __set__(self, int end_char):
+    @end_char.setter
+    def end_char(self, int end_char):
        if end_char < 0 or end_char > len(self.doc.text):
            raise IndexError(
                Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
        cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
        if end < 0:
            raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
        cdef SpanC * span_c = self.span_c()
        if span_c.start_char > end_char:
            raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char",
                                                 existing_value=span_c.start_char))
        span_c.end_char = end_char
        span_c.end = end

-    property label:
-        def __get__(self):
+    @property
+    def label(self):
        return self.span_c().label

-        def __set__(self, attr_t label):
+    @label.setter
+    def label(self, attr_t label):
        if label != self.span_c().label:
            old_label = self.span_c().label
            self.span_c().label = label
            new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char,
                             end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
            old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char,
                             end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id)
            Underscore._replace_keys(old, new)

-    property kb_id:
-        def __get__(self):
+    @property
+    def kb_id(self):
        return self.span_c().kb_id

-        def __set__(self, attr_t kb_id):
+    @kb_id.setter
+    def kb_id(self, attr_t kb_id):
        if kb_id != self.span_c().kb_id:
            old_kb_id = self.span_c().kb_id
            self.span_c().kb_id = kb_id
            new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char,
                             end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
            old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char,
                             end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id)
            Underscore._replace_keys(old, new)

-    property id:
-        def __get__(self):
+    @property
+    def id(self):
        return self.span_c().id

-        def __set__(self, attr_t id):
+    @id.setter
+    def id(self, attr_t id):
        if id != self.span_c().id:
            old_id = self.span_c().id
            self.span_c().id = id
            new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char,
                             end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
            old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char,
                             end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id)
            Underscore._replace_keys(old, new)

-    property ent_id:
+    @property
+    def ent_id(self):
        """Alias for the span's ID."""
-        def __get__(self):
        return self.id

-        def __set__(self, attr_t ent_id):
+    @ent_id.setter
+    def ent_id(self, attr_t ent_id):
        self.id = ent_id

    @property
    def orth_(self):

@@ -904,29 +924,32 @@ cdef class Span:
        """RETURNS (str): The span's lemma."""
        return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()

-    property label_:
+    @property
+    def label_(self):
        """The span's label."""
-        def __get__(self):
        return self.doc.vocab.strings[self.label]

-        def __set__(self, str label_):
+    @label_.setter
+    def label_(self, str label_):
        self.label = self.doc.vocab.strings.add(label_)

-    property kb_id_:
+    @property
+    def kb_id_(self):
        """The span's KB ID."""
-        def __get__(self):
        return self.doc.vocab.strings[self.kb_id]

-        def __set__(self, str kb_id_):
+    @kb_id_.setter
+    def kb_id_(self, str kb_id_):
        self.kb_id = self.doc.vocab.strings.add(kb_id_)

-    property id_:
+    @property
+    def id_(self):
        """The span's ID."""
-        def __get__(self):
        return self.doc.vocab.strings[self.id]

-        def __set__(self, str id_):
+    @id_.setter
+    def id_(self, str id_):
        self.id = self.doc.vocab.strings.add(id_)

    property ent_id_:
        """Alias for the span's ID."""

View File

@@ -250,15 +250,16 @@ cdef class Token:
        """
        return not self.c.morph == 0

-    property morph:
-        def __get__(self):
+    @property
+    def morph(self):
        return MorphAnalysis.from_id(self.vocab, self.c.morph)

-        def __set__(self, MorphAnalysis morph):
+    @morph.setter
+    def morph(self, MorphAnalysis morph):
        # Check that the morph has the same vocab
        if self.vocab != morph.vocab:
            raise ValueError(Errors.E1013)
        self.c.morph = deref(morph.c).key

    def set_morph(self, features):
        cdef hash_t key

@@ -370,39 +371,43 @@ cdef class Token:
        """
        return self.c.lex.suffix

-    property lemma:
+    @property
+    def lemma(self):
        """RETURNS (uint64): ID of the base form of the word, with no
        inflectional suffixes.
        """
-        def __get__(self):
        return self.c.lemma

-        def __set__(self, attr_t lemma):
+    @lemma.setter
+    def lemma(self, attr_t lemma):
        self.c.lemma = lemma

-    property pos:
+    @property
+    def pos(self):
        """RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
-        def __get__(self):
        return self.c.pos

-        def __set__(self, pos):
+    @pos.setter
+    def pos(self, pos):
        self.c.pos = pos

-    property tag:
+    @property
+    def tag(self):
        """RETURNS (uint64): ID of fine-grained part-of-speech tag."""
-        def __get__(self):
        return self.c.tag

-        def __set__(self, attr_t tag):
+    @tag.setter
+    def tag(self, attr_t tag):
        self.c.tag = tag

-    property dep:
+    @property
+    def dep(self):
        """RETURNS (uint64): ID of syntactic dependency label."""
-        def __get__(self):
        return self.c.dep

-        def __set__(self, attr_t label):
+    @dep.setter
+    def dep(self, attr_t label):
        self.c.dep = label

    @property
    def has_vector(self):
@@ -483,48 +488,51 @@ cdef class Token:
            return self.doc.user_token_hooks["sent"](self)
        return self.doc[self.i : self.i+1].sent

-    property sent_start:
-        def __get__(self):
+    @property
+    def sent_start(self):
        """Deprecated: use Token.is_sent_start instead."""
        # Raising a deprecation warning here causes errors for autocomplete
        # Handle broken backwards compatibility case: doc[0].sent_start
        # was False.
        if self.i == 0:
            return False
        else:
            return self.c.sent_start

-        def __set__(self, value):
+    @sent_start.setter
+    def sent_start(self, value):
        self.is_sent_start = value

-    property is_sent_start:
+    @property
+    def is_sent_start(self):
        """A boolean value indicating whether the token starts a sentence.
        `None` if unknown. Defaults to `True` for the first token in the `Doc`.

        RETURNS (bool / None): Whether the token starts a sentence.
            None if unknown.
        """
-        def __get__(self):
        if self.c.sent_start == 0:
            return None
        elif self.c.sent_start < 0:
            return False
        else:
            return True

-        def __set__(self, value):
+    @is_sent_start.setter
+    def is_sent_start(self, value):
        if self.doc.has_annotation("DEP"):
            raise ValueError(Errors.E043)
        if value is None:
            self.c.sent_start = 0
        elif value is True:
            self.c.sent_start = 1
        elif value is False:
            self.c.sent_start = -1
        else:
            raise ValueError(Errors.E044.format(value=value))

-    property is_sent_end:
+    @property
+    def is_sent_end(self):
        """A boolean value indicating whether the token ends a sentence.
        `None` if unknown. Defaults to `True` for the last token in the `Doc`.

@@ -533,18 +541,18 @@ cdef class Token:
        DOCS: https://spacy.io/api/token#is_sent_end
        """
-        def __get__(self):
        if self.i + 1 == len(self.doc):
            return True
        elif self.doc[self.i+1].is_sent_start is None:
            return None
        elif self.doc[self.i+1].is_sent_start is True:
            return True
        else:
            return False

-        def __set__(self, value):
+    @is_sent_end.setter
+    def is_sent_end(self, value):
        raise ValueError(Errors.E196)

    @property
    def lefts(self):
@@ -671,41 +679,42 @@ cdef class Token:
        """
        return not Token.missing_head(self.c)

-    property head:
+    @property
+    def head(self):
        """The syntactic parent, or "governor", of this token.
        If token.has_head() is `False`, this method will return itself.

        RETURNS (Token): The token predicted by the parser to be the head of
            the current token.
        """
-        def __get__(self):
        if not self.has_head():
            return self
        else:
            return self.doc[self.i + self.c.head]

-        def __set__(self, Token new_head):
+    @head.setter
+    def head(self, Token new_head):
        # This function sets the head of self to new_head and updates the
        # counters for left/right dependents and left/right corner for the
        # new and the old head
        # Check that token is from the same document
        if self.doc != new_head.doc:
            raise ValueError(Errors.E191)
        # Do nothing if old head is new head
        if self.i + self.c.head == new_head.i:
            return
        # Find the widest l/r_edges of the roots of the two tokens involved
        # to limit the number of tokens for set_children_from_heads
        cdef Token self_root, new_head_root
        self_root = ([self] + list(self.ancestors))[-1]
        new_head_ancestors = list(new_head.ancestors)
        new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head
        start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge
        end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge
        # Set new head
        self.c.head = new_head.i - self.i
        # Adjust parse properties and sentence starts
        set_children_from_heads(self.doc.c, start, end + 1)

    @property
    def conjuncts(self):
@@ -733,21 +742,23 @@ cdef class Token:
            queue.append(child)
        return tuple([w for w in output if w.i != self.i])

-    property ent_type:
+    @property
+    def ent_type(self):
        """RETURNS (uint64): Named entity type."""
-        def __get__(self):
        return self.c.ent_type

-        def __set__(self, ent_type):
+    @ent_type.setter
+    def ent_type(self, ent_type):
        self.c.ent_type = ent_type

-    property ent_type_:
+    @property
+    def ent_type_(self):
        """RETURNS (str): Named entity type."""
-        def __get__(self):
        return self.vocab.strings[self.c.ent_type]

-        def __set__(self, ent_type):
+    @ent_type_.setter
+    def ent_type_(self, ent_type):
        self.c.ent_type = self.vocab.strings.add(ent_type)

    @property
    def ent_iob(self):

@@ -773,41 +784,45 @@ cdef class Token:
        """
        return self.iob_strings()[self.c.ent_iob]

-    property ent_id:
+    @property
+    def ent_id(self):
        """RETURNS (uint64): ID of the entity the token is an instance of,
            if any.
        """
-        def __get__(self):
        return self.c.ent_id

-        def __set__(self, hash_t key):
+    @ent_id.setter
+    def ent_id(self, hash_t key):
        self.c.ent_id = key

-    property ent_id_:
+    @property
+    def ent_id_(self):
        """RETURNS (str): ID of the entity the token is an instance of,
            if any.
        """
-        def __get__(self):
        return self.vocab.strings[self.c.ent_id]

-        def __set__(self, name):
+    @ent_id_.setter
+    def ent_id_(self, name):
        self.c.ent_id = self.vocab.strings.add(name)

-    property ent_kb_id:
+    @property
+    def ent_kb_id(self):
        """RETURNS (uint64): Named entity KB ID."""
-        def __get__(self):
        return self.c.ent_kb_id

-        def __set__(self, attr_t ent_kb_id):
+    @ent_kb_id.setter
+    def ent_kb_id(self, attr_t ent_kb_id):
        self.c.ent_kb_id = ent_kb_id

-    property ent_kb_id_:
+    @property
+    def ent_kb_id_(self):
        """RETURNS (str): Named entity KB ID."""
-        def __get__(self):
        return self.vocab.strings[self.c.ent_kb_id]

-        def __set__(self, ent_kb_id):
+    @ent_kb_id_.setter
+    def ent_kb_id_(self, ent_kb_id):
        self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)

    @property
    def whitespace_(self):
@@ -829,16 +844,17 @@ cdef class Token:
        """
        return self.vocab.strings[self.c.lex.lower]

-    property norm_:
+    @property
+    def norm_(self):
        """RETURNS (str): The token's norm, i.e. a normalised form of the
            token text. Usually set in the language's tokenizer exceptions or
            norm exceptions.
        """
-        def __get__(self):
        return self.vocab.strings[self.norm]

-        def __set__(self, str norm_):
+    @norm_.setter
+    def norm_(self, str norm_):
        self.c.norm = self.vocab.strings.add(norm_)

    @property
    def shape_(self):

@@ -868,33 +884,36 @@ cdef class Token:
        """
        return self.vocab.strings[self.c.lex.lang]

-    property lemma_:
+    @property
+    def lemma_(self):
        """RETURNS (str): The token lemma, i.e. the base form of the word,
            with no inflectional suffixes.
        """
-        def __get__(self):
        return self.vocab.strings[self.c.lemma]

-        def __set__(self, str lemma_):
+    @lemma_.setter
+    def lemma_(self, str lemma_):
        self.c.lemma = self.vocab.strings.add(lemma_)

-    property pos_:
+    @property
+    def pos_(self):
        """RETURNS (str): Coarse-grained part-of-speech tag."""
-        def __get__(self):
        return parts_of_speech.NAMES[self.c.pos]

-        def __set__(self, pos_name):
+    @pos_.setter
+    def pos_(self, pos_name):
        if pos_name not in parts_of_speech.IDS:
            raise ValueError(Errors.E1021.format(pp=pos_name))
        self.c.pos = parts_of_speech.IDS[pos_name]

-    property tag_:
+    @property
+    def tag_(self):
        """RETURNS (str): Fine-grained part-of-speech tag."""
-        def __get__(self):
        return self.vocab.strings[self.c.tag]

-        def __set__(self, tag):
+    @tag_.setter
+    def tag_(self, tag):
        self.tag = self.vocab.strings.add(tag)

    def has_dep(self):
        """Check whether the token has annotated dep information.

@@ -904,13 +923,14 @@ cdef class Token:
        """
        return not Token.missing_dep(self.c)

-    property dep_:
+    @property
+    def dep_(self):
        """RETURNS (str): The syntactic dependency label."""
-        def __get__(self):
        return self.vocab.strings[self.c.dep]

-        def __set__(self, str label):
+    @dep_.setter
+    def dep_(self, str label):
        self.c.dep = self.vocab.strings.add(label)

    @property
    def is_oov(self):
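A small sketch of the converted Token setters on a blank pipeline (no trained components assumed); sentence starts can only be assigned while no dependency parse is present:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("This is one sentence This is another")

doc[4].is_sent_start = True   # mark the second "This" as a sentence start
doc[0].pos_ = "PRON"          # validated against parts_of_speech.IDS
doc[1].lemma_ = "be"          # interned via vocab.strings.add

print([t.is_sent_start for t in doc])
print(doc[0].pos_, doc[1].lemma_)
```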

View File

@@ -101,23 +101,25 @@ cdef class Example:
    def __len__(self):
        return len(self.predicted)

-    property predicted:
-        def __get__(self):
+    @property
+    def predicted(self):
        return self.x

-        def __set__(self, doc):
+    @predicted.setter
+    def predicted(self, doc):
        self.x = doc
        self._cached_alignment = None
        self._cached_words_x = [t.text for t in doc]

-    property reference:
-        def __get__(self):
+    @property
+    def reference(self):
        return self.y

-        def __set__(self, doc):
+    @reference.setter
+    def reference(self, doc):
        self.y = doc
        self._cached_alignment = None
        self._cached_words_y = [t.text for t in doc]

    def copy(self):
        return Example(

@@ -433,9 +435,9 @@ cdef class Example:
        seen_indices.update(indices)
        return output

-    property text:
-        def __get__(self):
+    @property
+    def text(self):
        return self.x.text

    def __str__(self):
        return str(self.to_dict())
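Sketch of the Example properties above; assigning a new Doc to `predicted` or `reference` clears the cached alignment between the two tokenizations:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
predicted = nlp("I like London .")
reference = nlp("I like London .")
example = Example(predicted, reference)

print(example.text)                         # text of the predicted (`x`) Doc
example.predicted = nlp("I like London .")  # setter resets the cached alignment
```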

View File

@@ -87,16 +87,17 @@ cdef class Vocab:
        self.writing_system = writing_system
        self.get_noun_chunks = get_noun_chunks

-    property vectors:
-        def __get__(self):
+    @property
+    def vectors(self):
        return self._vectors

-        def __set__(self, vectors):
+    @vectors.setter
+    def vectors(self, vectors):
        if hasattr(vectors, "strings"):
            for s in vectors.strings:
                self.strings.add(s)
        self._vectors = vectors
        self._vectors.strings = self.strings

    @property
    def lang(self):

@@ -450,17 +451,18 @@ cdef class Vocab:
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        return key in self.vectors

-    property lookups:
-        def __get__(self):
+    @property
+    def lookups(self):
        return self._lookups

-        def __set__(self, lookups):
+    @lookups.setter
+    def lookups(self, lookups):
        self._lookups = lookups
        if lookups.has_table("lexeme_norm"):
            self.lex_attr_getters[NORM] = util.add_lookups(
                self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
                self.lookups.get_table("lexeme_norm"),
            )

    def to_disk(self, path, *, exclude=tuple()):
        """Save the current state to a directory.

View File

@@ -45,33 +45,33 @@ For attributes that represent string values, the internal integer ID is accessed
as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by
appending `_` as in `token.dep_`.

| Attribute    | Description |
| ------------ | ----------- |
| `DEP`        | The token's dependency label. ~~str~~ |
| `ENT_ID`     | The token's entity ID (`ent_id`). ~~str~~ |
| `ENT_IOB`    | The IOB part of the token's entity tag. Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
| `ENT_KB_ID`  | The token's entity knowledge base ID. ~~str~~ |
| `ENT_TYPE`   | The token's entity label. ~~str~~ |
| `IS_ALPHA`   | Token text consists of alphabetic characters. ~~bool~~ |
| `IS_ASCII`   | Token text consists of ASCII characters. ~~bool~~ |
| `IS_DIGIT`   | Token text consists of digits. ~~bool~~ |
| `IS_LOWER`   | Token text is in lowercase. ~~bool~~ |
| `IS_PUNCT`   | Token is punctuation. ~~bool~~ |
| `IS_SPACE`   | Token is whitespace. ~~bool~~ |
| `IS_STOP`    | Token is a stop word. ~~bool~~ |
| `IS_TITLE`   | Token text is in titlecase. ~~bool~~ |
| `IS_UPPER`   | Token text is in uppercase. ~~bool~~ |
| `LEMMA`      | The token's lemma. ~~str~~ |
| `LENGTH`     | The length of the token text. ~~int~~ |
| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ |
| `LIKE_NUM`   | Token text resembles a number. ~~bool~~ |
| `LIKE_URL`   | Token text resembles a URL. ~~bool~~ |
| `LOWER`      | The lowercase form of the token text. ~~str~~ |
| `MORPH`      | The token's morphological analysis. ~~MorphAnalysis~~ |
| `NORM`       | The normalized form of the token text. ~~str~~ |
| `ORTH`       | The exact verbatim text of a token. ~~str~~ |
| `POS`        | The token's universal part of speech (UPOS). ~~str~~ |
| `SENT_START` | Token is start of sentence. ~~bool~~ |
| `SHAPE`      | The token's shape. ~~str~~ |
| `SPACY`      | Token has a trailing space. ~~bool~~ |
| `TAG`        | The token's fine-grained part of speech. ~~str~~ |
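For illustration, the integer/string pairing and the use of these attribute IDs with `Doc.to_array`; assumes the small English pipeline is installed:

```python
import spacy
from spacy.attrs import ENT_IOB, ENT_TYPE, LOWER

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup")

token = doc[0]
print(token.dep, token.dep_)          # integer ID vs. string label
print(token.ent_iob, token.ent_iob_)  # custom integer scheme vs. "B"/"I"/"O"

arr = doc.to_array([LOWER, ENT_IOB, ENT_TYPE])
print(arr.shape)                      # (number of tokens, 3)
```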

View File

@@ -566,7 +566,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
✔ Good amount of examples for all labels
-✔ Examples without occurences available for all labels
+✔ Examples without occurrences available for all labels
✔ No entities consisting of or starting/ending with whitespace

=========================== Part-of-speech Tagging ===========================

@@ -1322,7 +1322,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]

## find-threshold {id="find-threshold",version="3.5",tag="command"}

-Runs prediction trials for a trained model with varying tresholds to maximize
+Runs prediction trials for a trained model with varying thresholds to maximize
the specified metric. The search space for the threshold is traversed linearly
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
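For instance, a possible invocation (all paths and keys below are placeholders; the threshold and scores keys depend on the component being tuned, here a `spancat` pipeline scored on `spans_sc_f`):

```bash
python -m spacy find-threshold ./my_trained_pipeline ./dev.spacy spancat threshold spans_sc_f
```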

View File

@ -61,13 +61,13 @@ architectures and their arguments and hyperparameters.
| `incl_context` | Whether the local context is included in the model. Defaults to `True`. ~~bool~~ | | `incl_context` | Whether the local context is included in the model. Defaults to `True`. ~~bool~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [`EntityLinker`](/api/architectures#EntityLinker). ~~Model~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [`EntityLinker`](/api/architectures#EntityLinker). ~~Model~~ |
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `use_gold_ents` | Whether entities are copied from the gold docs. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | | `use_gold_ents` | Whether entities are copied from the gold docs. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
| `get_candidates` <Tag variant="new">4.0</Tag> | Function that retrieves plausible candidates per entity mention in a given `Iterator[SpanGroup]` (one `SpanGroup` includes all mentions found in a given `Doc` instance). Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator). ~~Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]~~ | | `get_candidates` <Tag variant="new">4.0</Tag> | Function that retrieves plausible candidates per entity mention in a given `Iterator[SpanGroup]` (one `SpanGroup` includes all mentions found in a given `Doc` instance). Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator). ~~Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]~~ |
| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ | | `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ | | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | | `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
<Infobox variant="warning"> <Infobox variant="warning">
@@ -114,21 +114,21 @@ custom knowledge base, you should either call
[`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the
[`initialize`](/api/entitylinker#initialize) call.

| Name | Description |
| ---- | ----------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ |
| `get_candidates` | Function that retrieves plausible candidates per entity mention in a given `SpanGroup`. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator). ~~Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]~~ |
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
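As a rough sketch (not a verbatim docs example), a few of these settings can be overridden when the component is added; the values below are illustrative, and a knowledge base still has to be provided via `set_kb` or a `kb_loader` before the component can be used:

```python
import spacy

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe(
    "entity_linker",
    config={"incl_prior": True, "n_sents": 1, "threshold": None},
)
```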
## EntityLinker.\_\_call\_\_ {id="call",tag="method"}

View File

@@ -69,7 +69,7 @@ how the component should be configured. You can override its settings via the
| Setting | Description |
| ------- | ----------- |
| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |

View File

@@ -147,9 +147,10 @@ Whether a feature/value pair is in the analysis.
> assert "Feat1=Val1" in morph
> ```

| Name | Description |
| ---- | ----------- |
-| **RETURNS** | A feature/value pair in the analysis. ~~str~~ |
+| `feature`   | A feature/value pair. ~~str~~ |
+| **RETURNS** | Whether the feature/value pair is contained in the analysis. ~~bool~~ |

### MorphAnalysis.\_\_iter\_\_ {id="morphanalysis-iter",tag="method"}

View File

@@ -287,7 +287,7 @@ does not permit other NPs to be nested within it so no NP-level coordination
no prepositional phrases, and no relative clauses.

If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
-has not been implemeted for the given language, a `NotImplementedError` is
+has not been implemented for the given language, a `NotImplementedError` is
raised.

> #### Example

View File

@@ -416,7 +416,7 @@ by this class. Instances of this class are typically assigned to the
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
| `width` | The width of the last hidden layer. ~~int~~ |

-### TransformerData.empty {id="transformerdata-emoty",tag="classmethod"}
+### TransformerData.empty {id="transformerdata-empty",tag="classmethod"}

Create an empty `TransformerData` container.

View File

@@ -832,7 +832,7 @@ retrieve and add to them.

After creation, the component needs to be
[initialized](/usage/training#initialization). This method can define the
-relevant labels in two ways: explicitely by setting the `labels` argument in the
+relevant labels in two ways: explicitly by setting the `labels` argument in the
[`initialize` block](/api/data-formats#config-initialize) of the config, or
implicitly by deducing them from the `get_examples` callback that generates the
full **training data set**, or a representative sample.
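A minimal sketch of the explicit route, assuming a text classifier and using `add_label` as the simplest stand-in; in the config-driven workflow the labels would instead go in the `[initialize]` block or be deduced from the `get_examples` callback:

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
print(textcat.labels)  # ('POSITIVE', 'NEGATIVE')
```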

View File

@@ -1899,7 +1899,7 @@ the two words.
    "Shore": ("coast", 0.732257),
    "Precautionary": ("caution", 0.490973),
    "hopelessness": ("sadness", 0.742366),
-    "Continous": ("continuous", 0.732549),
+    "Continuous": ("continuous", 0.732549),
    "Disemboweled": ("corpse", 0.499432),
    "biostatistician": ("scientist", 0.339724),
    "somewheres": ("somewheres", 0.402736),

View File

@@ -530,13 +530,17 @@ application's `requirements.txt`. If you're running your own internal PyPi
installation, you can upload the pipeline packages there. pip's
[requirements file format](https://pip.pypa.io/en/latest/reference/requirements-file-format/)
supports both package names to download via a PyPi server, as well as
-[direct URLs](#pipeline-urls).
+[direct URLs](#pipeline-urls). For instance, you can specify the
+`en_core_web_sm` model for spaCy 3.7.x as follows:

```text {title="requirements.txt"}
spacy>=3.0.0,<4.0.0
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
```

+See the [list of models](https://spacy.io/models) for model download links for
+the current spaCy version.

All pipeline packages are versioned and specify their spaCy dependency. This
ensures cross-compatibility and lets you specify exact version requirements for
each pipeline. If you've [trained](/usage/training) your own pipeline, you can

View File

@@ -173,7 +173,7 @@ detected, a corresponding warning is displayed. If you'd like to disable the
dependency check, set `check_requirements: false` in your project's
`project.yml`.

-### 4. Run a workflow {id="run-workfow"}
+### 4. Run a workflow {id="run-workflow"}

> #### project.yml
>

@@ -286,7 +286,7 @@ pipelines.
| ------------- | ----------- |
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |

View File

@@ -306,7 +306,9 @@ installed in the same environment that's it.

### Loading probability tables into existing models

You can load a probability table from
[spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an
existing spaCy model like `en_core_web_sm`.

```python
# Requirements: pip install spacy-lookups-data

@@ -317,7 +319,8 @@ lookups = load_lookups("en", ["lexeme_prob"])
nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
```

When training a model from scratch you can also specify probability tables in
the `config.cfg`.

```ini {title="config.cfg (excerpt)"}
[initialize.lookups]

@@ -346,8 +349,8 @@ them**!

To stick with the theme of
[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
consider the following custom spaCy
[pipeline component](/usage/processing-pipelines#custom-components) that prints
a snake when it's called:

> #### Package directory structure
>

View File

@@ -185,7 +185,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
✔ Good amount of examples for all labels
-✔ Examples without occurences available for all labels
+✔ Examples without occurrences available for all labels
✔ No entities consisting of or starting/ending with whitespace

=========================== Part-of-speech Tagging ===========================

View File

@@ -138,7 +138,7 @@ backwards compatibility, the tuple format remains available under
`TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
in the [transformer API docs](/api/architectures#TransformerModel).

-`spacy-transfomers` v1.1 also adds support for `transformer_config` settings
+`spacy-transformers` v1.1 also adds support for `transformer_config` settings
such as `output_attentions`. Additional output is stored under
`TransformerData.model_output`. More details are in the
[TransformerModel docs](/api/architectures#TransformerModel). The training speed

View File

@@ -23,7 +23,6 @@
    },
    "docSearch": {
        "appId": "Y1LB128RON",
-       "apiKey": "bb601a1daab73e2dc66faf2b79564807",
        "indexName": "spacy"
    },
    "binderUrl": "explosion/spacy-io-binder",

View File

@@ -32,6 +32,9 @@ const nextConfig = withPWA(
            ignoreBuildErrors: true,
        },
        images: { unoptimized: true },
+       env: {
+           DOCSEARCH_API_KEY: process.env.DOCSEARCH_API_KEY
+       }
    })
)

View File

@@ -1,4 +1,4 @@
-import React, { useEffect, useState } from 'react'
+import React from 'react'
import PropTypes from 'prop-types'
import { DocSearch } from '@docsearch/react'
import '@docsearch/css'

@@ -6,7 +6,8 @@ import '@docsearch/css'
import siteMetadata from '../../meta/site.json'

export default function Search({ placeholder = 'Search docs' }) {
-    const { apiKey, indexName, appId } = siteMetadata.docSearch
+    const apiKey = process.env.DOCSEARCH_API_KEY
+    const { indexName, appId } = siteMetadata.docSearch
    return (
        <DocSearch appId={appId} indexName={indexName} apiKey={apiKey} placeholder={placeholder} />
    )

View File

@@ -109,6 +109,8 @@
        box-shadow: inset 1px 1px 1px rgba(0, 0, 0, 0.25)
        background: var(--color-dark)
        margin: 1.5rem 0 0 2rem
+       position: sticky
+       left: 2rem

    .header
        width: 100%