mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-03 04:33:08 +03:00
Merge branch 'upstream_master' into sync_v4
This commit is contained in:
commit
0e3b6a87d6
6
.github/workflows/tests.yml
vendored
6
.github/workflows/tests.yml
vendored
|
@ -45,6 +45,12 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
python -m pip install flake8==5.0.4
|
python -m pip install flake8==5.0.4
|
||||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||||
|
- name: cython-lint
|
||||||
|
run: |
|
||||||
|
python -m pip install cython-lint -c requirements.txt
|
||||||
|
# E501: line too log, W291: trailing whitespace, E266: too many leading '#' for block comment
|
||||||
|
cython-lint spacy --ignore E501,W291,E266
|
||||||
|
|
||||||
tests:
|
tests:
|
||||||
name: Test
|
name: Test
|
||||||
needs: Validate
|
needs: Validate
|
||||||
|
|
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
||||||
SHELL := /bin/bash
|
SHELL := /bin/bash
|
||||||
|
|
||||||
ifndef SPACY_EXTRAS
|
ifndef SPACY_EXTRAS
|
||||||
override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
|
override SPACY_EXTRAS = spacy-lookups-data==1.0.3
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifndef PYVER
|
ifndef PYVER
|
||||||
|
|
|
@ -36,4 +36,5 @@ types-setuptools>=57.0.0
|
||||||
types-requests
|
types-requests
|
||||||
types-setuptools>=57.0.0
|
types-setuptools>=57.0.0
|
||||||
black==22.3.0
|
black==22.3.0
|
||||||
|
cython-lint>=0.15.0; python_version >= "3.7"
|
||||||
isort>=5.0,<6.0
|
isort>=5.0,<6.0
|
||||||
|
|
|
@ -47,4 +47,5 @@ cdef enum attr_id_t:
|
||||||
MORPH = symbols.MORPH
|
MORPH = symbols.MORPH
|
||||||
ENT_ID = symbols.ENT_ID
|
ENT_ID = symbols.ENT_ID
|
||||||
|
|
||||||
IDX = symbols.IDX
|
IDX
|
||||||
|
SENT_END
|
||||||
|
|
|
@ -32,6 +32,7 @@ def init_vectors_cli(
|
||||||
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
|
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
||||||
|
attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Convert word vectors for use with spaCy. Will export an nlp object that
|
"""Convert word vectors for use with spaCy. Will export an nlp object that
|
||||||
|
@ -53,6 +54,7 @@ def init_vectors_cli(
|
||||||
truncate=truncate,
|
truncate=truncate,
|
||||||
prune=prune,
|
prune=prune,
|
||||||
mode=mode,
|
mode=mode,
|
||||||
|
attr=attr,
|
||||||
)
|
)
|
||||||
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
||||||
nlp.to_disk(output_dir)
|
nlp.to_disk(output_dir)
|
||||||
|
|
|
@ -128,7 +128,7 @@ grad_factor = 1.0
|
||||||
{% if "span_finder" in components -%}
|
{% if "span_finder" in components -%}
|
||||||
[components.span_finder]
|
[components.span_finder]
|
||||||
factory = "span_finder"
|
factory = "span_finder"
|
||||||
max_length = null
|
max_length = 25
|
||||||
min_length = null
|
min_length = null
|
||||||
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
|
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
|
||||||
spans_key = "sc"
|
spans_key = "sc"
|
||||||
|
@ -415,7 +415,7 @@ width = ${components.tok2vec.model.encode.width}
|
||||||
{% if "span_finder" in components %}
|
{% if "span_finder" in components %}
|
||||||
[components.span_finder]
|
[components.span_finder]
|
||||||
factory = "span_finder"
|
factory = "span_finder"
|
||||||
max_length = null
|
max_length = 25
|
||||||
min_length = null
|
min_length = null
|
||||||
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
|
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
|
||||||
spans_key = "sc"
|
spans_key = "sc"
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import itertools
|
|
||||||
import uuid
|
import uuid
|
||||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
@ -218,7 +217,7 @@ class SpanRenderer:
|
||||||
+ (self.offset_step * (len(entities) - 1))
|
+ (self.offset_step * (len(entities) - 1))
|
||||||
)
|
)
|
||||||
markup += self.span_template.format(
|
markup += self.span_template.format(
|
||||||
text=token["text"],
|
text=escape_html(token["text"]),
|
||||||
span_slices=slices,
|
span_slices=slices,
|
||||||
span_starts=starts,
|
span_starts=starts,
|
||||||
total_height=total_height,
|
total_height=total_height,
|
||||||
|
|
|
@ -208,6 +208,9 @@ class Warnings(metaclass=ErrorsWithCodes):
|
||||||
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
|
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
|
||||||
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
||||||
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
||||||
|
W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
|
||||||
|
"key attribute for vectors, configure it through Vectors(attr=) or "
|
||||||
|
"'spacy init vectors --attr'")
|
||||||
|
|
||||||
# v4 warning strings
|
# v4 warning strings
|
||||||
W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
|
W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
|
||||||
|
|
|
@ -12,8 +12,9 @@ from .candidate import Candidate
|
||||||
|
|
||||||
|
|
||||||
cdef class KnowledgeBase:
|
cdef class KnowledgeBase:
|
||||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
"""A `KnowledgeBase` instance stores unique identifiers for entities and
|
||||||
to support entity linking of named entities to real-world concepts.
|
their textual aliases, to support entity linking of named entities to
|
||||||
|
real-world concepts.
|
||||||
This is an abstract class and requires its operations to be implemented.
|
This is an abstract class and requires its operations to be implemented.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/kb
|
DOCS: https://spacy.io/api/kb
|
||||||
|
@ -31,7 +32,9 @@ cdef class KnowledgeBase:
|
||||||
self.entity_vector_length = entity_vector_length
|
self.entity_vector_length = entity_vector_length
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
|
||||||
def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
|
def get_candidates_batch(
|
||||||
|
self, mentions: SpanGroup
|
||||||
|
) -> Iterable[Iterable[Candidate]]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
|
Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
|
||||||
entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
|
entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
|
||||||
|
@ -52,7 +55,9 @@ cdef class KnowledgeBase:
|
||||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
|
Errors.E1045.format(
|
||||||
|
parent="KnowledgeBase", method="get_candidates", name=self.__name__
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
|
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
|
||||||
|
@ -70,7 +75,9 @@ cdef class KnowledgeBase:
|
||||||
RETURNS (Iterable[float]): Vector for specified entity.
|
RETURNS (Iterable[float]): Vector for specified entity.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
|
Errors.E1045.format(
|
||||||
|
parent="KnowledgeBase", method="get_vector", name=self.__name__
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def to_bytes(self, **kwargs) -> bytes:
|
def to_bytes(self, **kwargs) -> bytes:
|
||||||
|
@ -78,7 +85,9 @@ cdef class KnowledgeBase:
|
||||||
RETURNS (bytes): Current state as binary string.
|
RETURNS (bytes): Current state as binary string.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
|
Errors.E1045.format(
|
||||||
|
parent="KnowledgeBase", method="to_bytes", name=self.__name__
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
|
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
|
||||||
|
@ -87,27 +96,37 @@ cdef class KnowledgeBase:
|
||||||
exclude (Tuple[str]): Properties to exclude when restoring KB.
|
exclude (Tuple[str]): Properties to exclude when restoring KB.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
|
Errors.E1045.format(
|
||||||
|
parent="KnowledgeBase", method="from_bytes", name=self.__name__
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
|
def to_disk(
|
||||||
|
self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Write KnowledgeBase content to disk.
|
Write KnowledgeBase content to disk.
|
||||||
path (Union[str, Path]): Target file path.
|
path (Union[str, Path]): Target file path.
|
||||||
exclude (Iterable[str]): List of components to exclude.
|
exclude (Iterable[str]): List of components to exclude.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
|
Errors.E1045.format(
|
||||||
|
parent="KnowledgeBase", method="to_disk", name=self.__name__
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
|
def from_disk(
|
||||||
|
self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Load KnowledgeBase content from disk.
|
Load KnowledgeBase content from disk.
|
||||||
path (Union[str, Path]): Target file path.
|
path (Union[str, Path]): Target file path.
|
||||||
exclude (Iterable[str]): List of components to exclude.
|
exclude (Iterable[str]): List of components to exclude.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
|
Errors.E1045.format(
|
||||||
|
parent="KnowledgeBase", method="from_disk", name=self.__name__
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -55,23 +55,28 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
# optional data, we can let users configure a DB as the backend for this.
|
# optional data, we can let users configure a DB as the backend for this.
|
||||||
cdef object _features_table
|
cdef object _features_table
|
||||||
|
|
||||||
|
|
||||||
cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
|
cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
|
||||||
"""Add an entity vector to the vectors table."""
|
"""Add an entity vector to the vectors table."""
|
||||||
cdef int64_t new_index = self._vectors_table.size()
|
cdef int64_t new_index = self._vectors_table.size()
|
||||||
self._vectors_table.push_back(entity_vector)
|
self._vectors_table.push_back(entity_vector)
|
||||||
return new_index
|
return new_index
|
||||||
|
|
||||||
|
cdef inline int64_t c_add_entity(
|
||||||
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
|
self,
|
||||||
int32_t vector_index, int feats_row) nogil:
|
hash_t entity_hash,
|
||||||
|
float freq,
|
||||||
|
int32_t vector_index,
|
||||||
|
int feats_row
|
||||||
|
) nogil:
|
||||||
"""Add an entry to the vector of entries.
|
"""Add an entry to the vector of entries.
|
||||||
After calling this method, make sure to update also the _entry_index using the return value"""
|
After calling this method, make sure to update also the _entry_index
|
||||||
|
using the return value"""
|
||||||
# This is what we'll map the entity hash key to. It's where the entry will sit
|
# This is what we'll map the entity hash key to. It's where the entry will sit
|
||||||
# in the vector of entries, so we can get it later.
|
# in the vector of entries, so we can get it later.
|
||||||
cdef int64_t new_index = self._entries.size()
|
cdef int64_t new_index = self._entries.size()
|
||||||
|
|
||||||
# Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
|
# Avoid struct initializer to enable nogil, cf.
|
||||||
|
# https://github.com/cython/cython/issues/1642
|
||||||
cdef KBEntryC entry
|
cdef KBEntryC entry
|
||||||
entry.entity_hash = entity_hash
|
entry.entity_hash = entity_hash
|
||||||
entry.vector_index = vector_index
|
entry.vector_index = vector_index
|
||||||
|
@ -81,11 +86,17 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
self._entries.push_back(entry)
|
self._entries.push_back(entry)
|
||||||
return new_index
|
return new_index
|
||||||
|
|
||||||
cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
|
cdef inline int64_t c_add_aliases(
|
||||||
"""Connect a mention to a list of potential entities with their prior probabilities .
|
self,
|
||||||
After calling this method, make sure to update also the _alias_index using the return value"""
|
hash_t alias_hash,
|
||||||
# This is what we'll map the alias hash key to. It's where the alias will be defined
|
vector[int64_t] entry_indices,
|
||||||
# in the vector of aliases.
|
vector[float] probs
|
||||||
|
) nogil:
|
||||||
|
"""Connect a mention to a list of potential entities with their prior
|
||||||
|
probabilities. After calling this method, make sure to update also the
|
||||||
|
_alias_index using the return value"""
|
||||||
|
# This is what we'll map the alias hash key to. It's where the alias will be
|
||||||
|
# defined in the vector of aliases.
|
||||||
cdef int64_t new_index = self._aliases_table.size()
|
cdef int64_t new_index = self._aliases_table.size()
|
||||||
|
|
||||||
# Avoid struct initializer to enable nogil
|
# Avoid struct initializer to enable nogil
|
||||||
|
@ -98,8 +109,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
|
|
||||||
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
|
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
|
||||||
"""
|
"""
|
||||||
Initializing the vectors and making sure the first element of each vector is a dummy,
|
Initializing the vectors and making sure the first element of each vector is a
|
||||||
because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
|
dummy, because the PreshMap maps pointing to indices in these vectors can not
|
||||||
|
contain 0 as value.
|
||||||
cf. https://github.com/explosion/preshed/issues/17
|
cf. https://github.com/explosion/preshed/issues/17
|
||||||
"""
|
"""
|
||||||
cdef int32_t dummy_value = 0
|
cdef int32_t dummy_value = 0
|
||||||
|
@ -130,12 +142,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
cdef class Writer:
|
cdef class Writer:
|
||||||
cdef FILE* _fp
|
cdef FILE* _fp
|
||||||
|
|
||||||
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
|
cdef int write_header(
|
||||||
|
self, int64_t nr_entries, int64_t entity_vector_length
|
||||||
|
) except -1
|
||||||
cdef int write_vector_element(self, float element) except -1
|
cdef int write_vector_element(self, float element) except -1
|
||||||
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
|
cdef int write_entry(
|
||||||
|
self, hash_t entry_hash, float entry_freq, int32_t vector_index
|
||||||
|
) except -1
|
||||||
|
|
||||||
cdef int write_alias_length(self, int64_t alias_length) except -1
|
cdef int write_alias_length(self, int64_t alias_length) except -1
|
||||||
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
|
cdef int write_alias_header(
|
||||||
|
self, hash_t alias_hash, int64_t candidate_length
|
||||||
|
) except -1
|
||||||
cdef int write_alias(self, int64_t entry_index, float prob) except -1
|
cdef int write_alias(self, int64_t entry_index, float prob) except -1
|
||||||
|
|
||||||
cdef int _write(self, void* value, size_t size) except -1
|
cdef int _write(self, void* value, size_t size) except -1
|
||||||
|
@ -143,12 +161,18 @@ cdef class Writer:
|
||||||
cdef class Reader:
|
cdef class Reader:
|
||||||
cdef FILE* _fp
|
cdef FILE* _fp
|
||||||
|
|
||||||
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
|
cdef int read_header(
|
||||||
|
self, int64_t* nr_entries, int64_t* entity_vector_length
|
||||||
|
) except -1
|
||||||
cdef int read_vector_element(self, float* element) except -1
|
cdef int read_vector_element(self, float* element) except -1
|
||||||
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
|
cdef int read_entry(
|
||||||
|
self, hash_t* entity_hash, float* freq, int32_t* vector_index
|
||||||
|
) except -1
|
||||||
|
|
||||||
cdef int read_alias_length(self, int64_t* alias_length) except -1
|
cdef int read_alias_length(self, int64_t* alias_length) except -1
|
||||||
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
|
cdef int read_alias_header(
|
||||||
|
self, hash_t* alias_hash, int64_t* candidate_length
|
||||||
|
) except -1
|
||||||
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
|
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
|
||||||
|
|
||||||
cdef int _read(self, void* value, size_t size) except -1
|
cdef int _read(self, void* value, size_t size) except -1
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
from typing import Any, Callable, Dict, Iterable, Union
|
from typing import Any, Callable, Dict, Iterable
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
|
@ -27,8 +27,9 @@ from .candidate import InMemoryCandidate
|
||||||
|
|
||||||
|
|
||||||
cdef class InMemoryLookupKB(KnowledgeBase):
|
cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
|
"""An `InMemoryLookupKB` instance stores unique identifiers for entities
|
||||||
to support entity linking of named entities to real-world concepts.
|
and their textual aliases, to support entity linking of named entities to
|
||||||
|
real-world concepts.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/inmemorylookupkb
|
DOCS: https://spacy.io/api/inmemorylookupkb
|
||||||
"""
|
"""
|
||||||
|
@ -71,7 +72,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
|
|
||||||
def add_entity(self, str entity, float freq, vector[float] entity_vector):
|
def add_entity(self, str entity, float freq, vector[float] entity_vector):
|
||||||
"""
|
"""
|
||||||
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
|
Add an entity to the KB, optionally specifying its log probability
|
||||||
|
based on corpus frequency.
|
||||||
Return the hash of the entity ID/name at the end.
|
Return the hash of the entity ID/name at the end.
|
||||||
"""
|
"""
|
||||||
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
||||||
|
@ -83,14 +85,20 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
|
|
||||||
# Raise an error if the provided entity vector is not of the correct length
|
# Raise an error if the provided entity vector is not of the correct length
|
||||||
if len(entity_vector) != self.entity_vector_length:
|
if len(entity_vector) != self.entity_vector_length:
|
||||||
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
|
raise ValueError(
|
||||||
|
Errors.E141.format(
|
||||||
|
found=len(entity_vector), required=self.entity_vector_length
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
vector_index = self.c_add_vector(entity_vector=entity_vector)
|
vector_index = self.c_add_vector(entity_vector=entity_vector)
|
||||||
|
|
||||||
new_index = self.c_add_entity(entity_hash=entity_hash,
|
new_index = self.c_add_entity(
|
||||||
|
entity_hash=entity_hash,
|
||||||
freq=freq,
|
freq=freq,
|
||||||
vector_index=vector_index,
|
vector_index=vector_index,
|
||||||
feats_row=-1) # Features table currently not implemented
|
feats_row=-1
|
||||||
|
) # Features table currently not implemented
|
||||||
self._entry_index[entity_hash] = new_index
|
self._entry_index[entity_hash] = new_index
|
||||||
|
|
||||||
return entity_hash
|
return entity_hash
|
||||||
|
@ -115,7 +123,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
else:
|
else:
|
||||||
entity_vector = vector_list[i]
|
entity_vector = vector_list[i]
|
||||||
if len(entity_vector) != self.entity_vector_length:
|
if len(entity_vector) != self.entity_vector_length:
|
||||||
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
|
raise ValueError(
|
||||||
|
Errors.E141.format(
|
||||||
|
found=len(entity_vector),
|
||||||
|
required=self.entity_vector_length
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
entry.entity_hash = entity_hash
|
entry.entity_hash = entity_hash
|
||||||
entry.freq = freq_list[i]
|
entry.freq = freq_list[i]
|
||||||
|
@ -149,11 +162,15 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
previous_alias_nr = self.get_size_aliases()
|
previous_alias_nr = self.get_size_aliases()
|
||||||
# Throw an error if the length of entities and probabilities are not the same
|
# Throw an error if the length of entities and probabilities are not the same
|
||||||
if not len(entities) == len(probabilities):
|
if not len(entities) == len(probabilities):
|
||||||
raise ValueError(Errors.E132.format(alias=alias,
|
raise ValueError(
|
||||||
|
Errors.E132.format(
|
||||||
|
alias=alias,
|
||||||
entities_length=len(entities),
|
entities_length=len(entities),
|
||||||
probabilities_length=len(probabilities)))
|
probabilities_length=len(probabilities))
|
||||||
|
)
|
||||||
|
|
||||||
# Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
|
# Throw an error if the probabilities sum up to more than 1 (allow for
|
||||||
|
# some rounding errors)
|
||||||
prob_sum = sum(probabilities)
|
prob_sum = sum(probabilities)
|
||||||
if prob_sum > 1.00001:
|
if prob_sum > 1.00001:
|
||||||
raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
|
raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
|
||||||
|
@ -170,40 +187,47 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
|
|
||||||
for entity, prob in zip(entities, probabilities):
|
for entity, prob in zip(entities, probabilities):
|
||||||
entity_hash = self.vocab.strings[entity]
|
entity_hash = self.vocab.strings[entity]
|
||||||
if not entity_hash in self._entry_index:
|
if entity_hash not in self._entry_index:
|
||||||
raise ValueError(Errors.E134.format(entity=entity))
|
raise ValueError(Errors.E134.format(entity=entity))
|
||||||
|
|
||||||
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
||||||
entry_indices.push_back(int(entry_index))
|
entry_indices.push_back(int(entry_index))
|
||||||
probs.push_back(float(prob))
|
probs.push_back(float(prob))
|
||||||
|
|
||||||
new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
|
new_index = self.c_add_aliases(
|
||||||
|
alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
|
||||||
|
)
|
||||||
self._alias_index[alias_hash] = new_index
|
self._alias_index[alias_hash] = new_index
|
||||||
|
|
||||||
if previous_alias_nr + 1 != self.get_size_aliases():
|
if previous_alias_nr + 1 != self.get_size_aliases():
|
||||||
raise RuntimeError(Errors.E891.format(alias=alias))
|
raise RuntimeError(Errors.E891.format(alias=alias))
|
||||||
return alias_hash
|
return alias_hash
|
||||||
|
|
||||||
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
|
def append_alias(
|
||||||
|
self, str alias, str entity, float prior_prob, ignore_warnings=False
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
For an alias already existing in the KB, extend its potential entities with one more.
|
For an alias already existing in the KB, extend its potential entities
|
||||||
|
with one more.
|
||||||
Throw a warning if either the alias or the entity is unknown,
|
Throw a warning if either the alias or the entity is unknown,
|
||||||
or when the combination is already previously recorded.
|
or when the combination is already previously recorded.
|
||||||
Throw an error if this entity+prior prob would exceed the sum of 1.
|
Throw an error if this entity+prior prob would exceed the sum of 1.
|
||||||
For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
|
For efficiency, it's best to use the method `add_alias` as much as
|
||||||
|
possible instead of this one.
|
||||||
"""
|
"""
|
||||||
# Check if the alias exists in the KB
|
# Check if the alias exists in the KB
|
||||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||||
if not alias_hash in self._alias_index:
|
if alias_hash not in self._alias_index:
|
||||||
raise ValueError(Errors.E176.format(alias=alias))
|
raise ValueError(Errors.E176.format(alias=alias))
|
||||||
|
|
||||||
# Check if the entity exists in the KB
|
# Check if the entity exists in the KB
|
||||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||||
if not entity_hash in self._entry_index:
|
if entity_hash not in self._entry_index:
|
||||||
raise ValueError(Errors.E134.format(entity=entity))
|
raise ValueError(Errors.E134.format(entity=entity))
|
||||||
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
||||||
|
|
||||||
# Throw an error if the prior probabilities (including the new one) sum up to more than 1
|
# Throw an error if the prior probabilities (including the new one)
|
||||||
|
# sum up to more than 1
|
||||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||||
alias_entry = self._aliases_table[alias_index]
|
alias_entry = self._aliases_table[alias_index]
|
||||||
current_sum = sum([p for p in alias_entry.probs])
|
current_sum = sum([p for p in alias_entry.probs])
|
||||||
|
@ -236,12 +260,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
|
|
||||||
def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
|
def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
Return candidate entities for an alias. Each candidate defines the
|
||||||
and the prior probability of that alias resolving to that entity.
|
entity, the original alias, and the prior probability of that alias
|
||||||
|
resolving to that entity.
|
||||||
If the alias is not known in the KB, and empty list is returned.
|
If the alias is not known in the KB, and empty list is returned.
|
||||||
"""
|
"""
|
||||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||||
if not alias_hash in self._alias_index:
|
if alias_hash not in self._alias_index:
|
||||||
return []
|
return []
|
||||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||||
alias_entry = self._aliases_table[alias_index]
|
alias_entry = self._aliases_table[alias_index]
|
||||||
|
@ -270,8 +295,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
return self._vectors_table[self._entries[entry_index].vector_index]
|
return self._vectors_table[self._entries[entry_index].vector_index]
|
||||||
|
|
||||||
def get_prior_prob(self, str entity, str alias):
|
def get_prior_prob(self, str entity, str alias):
|
||||||
""" Return the prior probability of a given alias being linked to a given entity,
|
""" Return the prior probability of a given alias being linked to a
|
||||||
or return 0.0 when this combination is not known in the knowledge base"""
|
given entity, or return 0.0 when this combination is not known in the
|
||||||
|
knowledge base."""
|
||||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||||
|
|
||||||
|
@ -282,7 +308,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
entry_index = self._entry_index[entity_hash]
|
entry_index = self._entry_index[entity_hash]
|
||||||
|
|
||||||
alias_entry = self._aliases_table[alias_index]
|
alias_entry = self._aliases_table[alias_index]
|
||||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
|
for (entry_index, prior_prob) in zip(
|
||||||
|
alias_entry.entry_indices, alias_entry.probs
|
||||||
|
):
|
||||||
if self._entries[entry_index].entity_hash == entity_hash:
|
if self._entries[entry_index].entity_hash == entity_hash:
|
||||||
return prior_prob
|
return prior_prob
|
||||||
|
|
||||||
|
@ -295,13 +323,19 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
"""Serialize the current state to a binary string.
|
"""Serialize the current state to a binary string.
|
||||||
"""
|
"""
|
||||||
def serialize_header():
|
def serialize_header():
|
||||||
header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
|
header = (
|
||||||
|
self.get_size_entities(),
|
||||||
|
self.get_size_aliases(),
|
||||||
|
self.entity_vector_length
|
||||||
|
)
|
||||||
return srsly.json_dumps(header)
|
return srsly.json_dumps(header)
|
||||||
|
|
||||||
def serialize_entries():
|
def serialize_entries():
|
||||||
i = 1
|
i = 1
|
||||||
tuples = []
|
tuples = []
|
||||||
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
|
for entry_hash, entry_index in sorted(
|
||||||
|
self._entry_index.items(), key=lambda x: x[1]
|
||||||
|
):
|
||||||
entry = self._entries[entry_index]
|
entry = self._entries[entry_index]
|
||||||
assert entry.entity_hash == entry_hash
|
assert entry.entity_hash == entry_hash
|
||||||
assert entry_index == i
|
assert entry_index == i
|
||||||
|
@ -314,7 +348,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
headers = []
|
headers = []
|
||||||
indices_lists = []
|
indices_lists = []
|
||||||
probs_lists = []
|
probs_lists = []
|
||||||
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
|
for alias_hash, alias_index in sorted(
|
||||||
|
self._alias_index.items(), key=lambda x: x[1]
|
||||||
|
):
|
||||||
alias = self._aliases_table[alias_index]
|
alias = self._aliases_table[alias_index]
|
||||||
assert alias_index == i
|
assert alias_index == i
|
||||||
candidate_length = len(alias.entry_indices)
|
candidate_length = len(alias.entry_indices)
|
||||||
|
@ -372,7 +408,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
indices = srsly.json_loads(all_data[1])
|
indices = srsly.json_loads(all_data[1])
|
||||||
probs = srsly.json_loads(all_data[2])
|
probs = srsly.json_loads(all_data[2])
|
||||||
for header, indices, probs in zip(headers, indices, probs):
|
for header, indices, probs in zip(headers, indices, probs):
|
||||||
alias_hash, candidate_length = header
|
alias_hash, _candidate_length = header
|
||||||
alias.entry_indices = indices
|
alias.entry_indices = indices
|
||||||
alias.probs = probs
|
alias.probs = probs
|
||||||
self._aliases_table[i] = alias
|
self._aliases_table[i] = alias
|
||||||
|
@ -421,10 +457,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
writer.write_vector_element(element)
|
writer.write_vector_element(element)
|
||||||
i = i+1
|
i = i+1
|
||||||
|
|
||||||
# dumping the entry records in the order in which they are in the _entries vector.
|
# dumping the entry records in the order in which they are in the
|
||||||
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
|
# _entries vector.
|
||||||
|
# index 0 is a dummy object not stored in the _entry_index and can
|
||||||
|
# be ignored.
|
||||||
i = 1
|
i = 1
|
||||||
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
|
for entry_hash, entry_index in sorted(
|
||||||
|
self._entry_index.items(), key=lambda x: x[1]
|
||||||
|
):
|
||||||
entry = self._entries[entry_index]
|
entry = self._entries[entry_index]
|
||||||
assert entry.entity_hash == entry_hash
|
assert entry.entity_hash == entry_hash
|
||||||
assert entry_index == i
|
assert entry_index == i
|
||||||
|
@ -436,7 +476,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
# dumping the aliases in the order in which they are in the _alias_index vector.
|
# dumping the aliases in the order in which they are in the _alias_index vector.
|
||||||
# index 0 is a dummy object not stored in the _aliases_table and can be ignored.
|
# index 0 is a dummy object not stored in the _aliases_table and can be ignored.
|
||||||
i = 1
|
i = 1
|
||||||
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
|
for alias_hash, alias_index in sorted(
|
||||||
|
self._alias_index.items(), key=lambda x: x[1]
|
||||||
|
):
|
||||||
alias = self._aliases_table[alias_index]
|
alias = self._aliases_table[alias_index]
|
||||||
assert alias_index == i
|
assert alias_index == i
|
||||||
|
|
||||||
|
@ -542,7 +584,8 @@ cdef class Writer:
|
||||||
def __init__(self, path):
|
def __init__(self, path):
|
||||||
assert isinstance(path, Path)
|
assert isinstance(path, Path)
|
||||||
content = bytes(path)
|
content = bytes(path)
|
||||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
cdef bytes bytes_loc = content.encode('utf8') \
|
||||||
|
if type(content) == str else content
|
||||||
self._fp = fopen(<char*>bytes_loc, 'wb')
|
self._fp = fopen(<char*>bytes_loc, 'wb')
|
||||||
if not self._fp:
|
if not self._fp:
|
||||||
raise IOError(Errors.E146.format(path=path))
|
raise IOError(Errors.E146.format(path=path))
|
||||||
|
@ -552,14 +595,18 @@ cdef class Writer:
|
||||||
cdef size_t status = fclose(self._fp)
|
cdef size_t status = fclose(self._fp)
|
||||||
assert status == 0
|
assert status == 0
|
||||||
|
|
||||||
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
|
cdef int write_header(
|
||||||
|
self, int64_t nr_entries, int64_t entity_vector_length
|
||||||
|
) except -1:
|
||||||
self._write(&nr_entries, sizeof(nr_entries))
|
self._write(&nr_entries, sizeof(nr_entries))
|
||||||
self._write(&entity_vector_length, sizeof(entity_vector_length))
|
self._write(&entity_vector_length, sizeof(entity_vector_length))
|
||||||
|
|
||||||
cdef int write_vector_element(self, float element) except -1:
|
cdef int write_vector_element(self, float element) except -1:
|
||||||
self._write(&element, sizeof(element))
|
self._write(&element, sizeof(element))
|
||||||
|
|
||||||
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
|
cdef int write_entry(
|
||||||
|
self, hash_t entry_hash, float entry_freq, int32_t vector_index
|
||||||
|
) except -1:
|
||||||
self._write(&entry_hash, sizeof(entry_hash))
|
self._write(&entry_hash, sizeof(entry_hash))
|
||||||
self._write(&entry_freq, sizeof(entry_freq))
|
self._write(&entry_freq, sizeof(entry_freq))
|
||||||
self._write(&vector_index, sizeof(vector_index))
|
self._write(&vector_index, sizeof(vector_index))
|
||||||
|
@ -568,7 +615,9 @@ cdef class Writer:
|
||||||
cdef int write_alias_length(self, int64_t alias_length) except -1:
|
cdef int write_alias_length(self, int64_t alias_length) except -1:
|
||||||
self._write(&alias_length, sizeof(alias_length))
|
self._write(&alias_length, sizeof(alias_length))
|
||||||
|
|
||||||
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
|
cdef int write_alias_header(
|
||||||
|
self, hash_t alias_hash, int64_t candidate_length
|
||||||
|
) except -1:
|
||||||
self._write(&alias_hash, sizeof(alias_hash))
|
self._write(&alias_hash, sizeof(alias_hash))
|
||||||
self._write(&candidate_length, sizeof(candidate_length))
|
self._write(&candidate_length, sizeof(candidate_length))
|
||||||
|
|
||||||
|
@ -584,16 +633,19 @@ cdef class Writer:
|
||||||
cdef class Reader:
|
cdef class Reader:
|
||||||
def __init__(self, path):
|
def __init__(self, path):
|
||||||
content = bytes(path)
|
content = bytes(path)
|
||||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
cdef bytes bytes_loc = content.encode('utf8') \
|
||||||
|
if type(content) == str else content
|
||||||
self._fp = fopen(<char*>bytes_loc, 'rb')
|
self._fp = fopen(<char*>bytes_loc, 'rb')
|
||||||
if not self._fp:
|
if not self._fp:
|
||||||
PyErr_SetFromErrno(IOError)
|
PyErr_SetFromErrno(IOError)
|
||||||
status = fseek(self._fp, 0, 0) # this can be 0 if there is no header
|
fseek(self._fp, 0, 0) # this can be 0 if there is no header
|
||||||
|
|
||||||
def __dealloc__(self):
|
def __dealloc__(self):
|
||||||
fclose(self._fp)
|
fclose(self._fp)
|
||||||
|
|
||||||
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
|
cdef int read_header(
|
||||||
|
self, int64_t* nr_entries, int64_t* entity_vector_length
|
||||||
|
) except -1:
|
||||||
status = self._read(nr_entries, sizeof(int64_t))
|
status = self._read(nr_entries, sizeof(int64_t))
|
||||||
if status < 1:
|
if status < 1:
|
||||||
if feof(self._fp):
|
if feof(self._fp):
|
||||||
|
@ -613,7 +665,9 @@ cdef class Reader:
|
||||||
return 0 # end of file
|
return 0 # end of file
|
||||||
raise IOError(Errors.E145.format(param="vector element"))
|
raise IOError(Errors.E145.format(param="vector element"))
|
||||||
|
|
||||||
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
|
cdef int read_entry(
|
||||||
|
self, hash_t* entity_hash, float* freq, int32_t* vector_index
|
||||||
|
) except -1:
|
||||||
status = self._read(entity_hash, sizeof(hash_t))
|
status = self._read(entity_hash, sizeof(hash_t))
|
||||||
if status < 1:
|
if status < 1:
|
||||||
if feof(self._fp):
|
if feof(self._fp):
|
||||||
|
@ -644,7 +698,9 @@ cdef class Reader:
|
||||||
return 0 # end of file
|
return 0 # end of file
|
||||||
raise IOError(Errors.E145.format(param="alias length"))
|
raise IOError(Errors.E145.format(param="alias length"))
|
||||||
|
|
||||||
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
|
cdef int read_alias_header(
|
||||||
|
self, hash_t* alias_hash, int64_t* candidate_length
|
||||||
|
) except -1:
|
||||||
status = self._read(alias_hash, sizeof(hash_t))
|
status = self._read(alias_hash, sizeof(hash_t))
|
||||||
if status < 1:
|
if status < 1:
|
||||||
if feof(self._fp):
|
if feof(self._fp):
|
||||||
|
|
|
@ -740,6 +740,11 @@ class Language:
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
pipe = source.get_pipe(source_name)
|
pipe = source.get_pipe(source_name)
|
||||||
|
# There is no actual solution here. Either the component has the right
|
||||||
|
# name for the source pipeline or the component has the right name for
|
||||||
|
# the current pipeline. This prioritizes the current pipeline.
|
||||||
|
if hasattr(pipe, "name"):
|
||||||
|
pipe.name = name
|
||||||
# Make sure the source config is interpolated so we don't end up with
|
# Make sure the source config is interpolated so we don't end up with
|
||||||
# orphaned variables in our final config
|
# orphaned variables in our final config
|
||||||
source_config = source.config.interpolate()
|
source_config = source.config.interpolate()
|
||||||
|
@ -817,6 +822,7 @@ class Language:
|
||||||
pipe_index = self._get_pipe_index(before, after, first, last)
|
pipe_index = self._get_pipe_index(before, after, first, last)
|
||||||
self._pipe_meta[name] = self.get_factory_meta(factory_name)
|
self._pipe_meta[name] = self.get_factory_meta(factory_name)
|
||||||
self._components.insert(pipe_index, (name, pipe_component))
|
self._components.insert(pipe_index, (name, pipe_component))
|
||||||
|
self._link_components()
|
||||||
return pipe_component
|
return pipe_component
|
||||||
|
|
||||||
def _get_pipe_index(
|
def _get_pipe_index(
|
||||||
|
@ -956,6 +962,7 @@ class Language:
|
||||||
if old_name in self._config["initialize"]["components"]:
|
if old_name in self._config["initialize"]["components"]:
|
||||||
init_cfg = self._config["initialize"]["components"].pop(old_name)
|
init_cfg = self._config["initialize"]["components"].pop(old_name)
|
||||||
self._config["initialize"]["components"][new_name] = init_cfg
|
self._config["initialize"]["components"][new_name] = init_cfg
|
||||||
|
self._link_components()
|
||||||
|
|
||||||
def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
|
def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
|
||||||
"""Remove a component from the pipeline.
|
"""Remove a component from the pipeline.
|
||||||
|
@ -979,6 +986,7 @@ class Language:
|
||||||
# Make sure the name is also removed from the set of disabled components
|
# Make sure the name is also removed from the set of disabled components
|
||||||
if name in self.disabled:
|
if name in self.disabled:
|
||||||
self._disabled.remove(name)
|
self._disabled.remove(name)
|
||||||
|
self._link_components()
|
||||||
return removed
|
return removed
|
||||||
|
|
||||||
def disable_pipe(self, name: str) -> None:
|
def disable_pipe(self, name: str) -> None:
|
||||||
|
@ -1823,8 +1831,16 @@ class Language:
|
||||||
# The problem is we need to do it during deserialization...And the
|
# The problem is we need to do it during deserialization...And the
|
||||||
# components don't receive the pipeline then. So this does have to be
|
# components don't receive the pipeline then. So this does have to be
|
||||||
# here :(
|
# here :(
|
||||||
|
# First, fix up all the internal component names in case they have
|
||||||
|
# gotten out of sync due to sourcing components from different
|
||||||
|
# pipelines, since find_listeners uses proc2.name for the listener
|
||||||
|
# map.
|
||||||
|
for name, proc in self.pipeline:
|
||||||
|
if hasattr(proc, "name"):
|
||||||
|
proc.name = name
|
||||||
for i, (name1, proc1) in enumerate(self.pipeline):
|
for i, (name1, proc1) in enumerate(self.pipeline):
|
||||||
if isinstance(proc1, ty.ListenedToComponent):
|
if isinstance(proc1, ty.ListenedToComponent):
|
||||||
|
proc1.listener_map = {}
|
||||||
for name2, proc2 in self.pipeline[i + 1 :]:
|
for name2, proc2 in self.pipeline[i + 1 :]:
|
||||||
proc1.find_listeners(proc2)
|
proc1.find_listeners(proc2)
|
||||||
|
|
||||||
|
@ -1934,7 +1950,6 @@ class Language:
|
||||||
# Later we replace the component config with the raw config again.
|
# Later we replace the component config with the raw config again.
|
||||||
interpolated = filled.interpolate() if not filled.is_interpolated else filled
|
interpolated = filled.interpolate() if not filled.is_interpolated else filled
|
||||||
pipeline = interpolated.get("components", {})
|
pipeline = interpolated.get("components", {})
|
||||||
sourced = util.get_sourced_components(interpolated)
|
|
||||||
# If components are loaded from a source (existing models), we cache
|
# If components are loaded from a source (existing models), we cache
|
||||||
# them here so they're only loaded once
|
# them here so they're only loaded once
|
||||||
source_nlps = {}
|
source_nlps = {}
|
||||||
|
@ -1962,6 +1977,7 @@ class Language:
|
||||||
raw_config=raw_config,
|
raw_config=raw_config,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
assert "source" in pipe_cfg
|
||||||
# We need the sourced components to reference the same
|
# We need the sourced components to reference the same
|
||||||
# vocab without modifying the current vocab state **AND**
|
# vocab without modifying the current vocab state **AND**
|
||||||
# we still want to load the source model vectors to perform
|
# we still want to load the source model vectors to perform
|
||||||
|
@ -1981,6 +1997,10 @@ class Language:
|
||||||
source_name = pipe_cfg.get("component", pipe_name)
|
source_name = pipe_cfg.get("component", pipe_name)
|
||||||
listeners_replaced = False
|
listeners_replaced = False
|
||||||
if "replace_listeners" in pipe_cfg:
|
if "replace_listeners" in pipe_cfg:
|
||||||
|
# Make sure that the listened-to component has the
|
||||||
|
# state of the source pipeline listener map so that the
|
||||||
|
# replace_listeners method below works as intended.
|
||||||
|
source_nlps[model]._link_components()
|
||||||
for name, proc in source_nlps[model].pipeline:
|
for name, proc in source_nlps[model].pipeline:
|
||||||
if source_name in getattr(proc, "listening_components", []):
|
if source_name in getattr(proc, "listening_components", []):
|
||||||
source_nlps[model].replace_listeners(
|
source_nlps[model].replace_listeners(
|
||||||
|
@ -1992,6 +2012,8 @@ class Language:
|
||||||
nlp.add_pipe(
|
nlp.add_pipe(
|
||||||
source_name, source=source_nlps[model], name=pipe_name
|
source_name, source=source_nlps[model], name=pipe_name
|
||||||
)
|
)
|
||||||
|
# At this point after nlp.add_pipe, the listener map
|
||||||
|
# corresponds to the new pipeline.
|
||||||
if model not in source_nlp_vectors_hashes:
|
if model not in source_nlp_vectors_hashes:
|
||||||
source_nlp_vectors_hashes[model] = hash(
|
source_nlp_vectors_hashes[model] = hash(
|
||||||
source_nlps[model].vocab.vectors.to_bytes(
|
source_nlps[model].vocab.vectors.to_bytes(
|
||||||
|
@ -2046,27 +2068,6 @@ class Language:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
Errors.E942.format(name="pipeline_creation", value=type(nlp))
|
Errors.E942.format(name="pipeline_creation", value=type(nlp))
|
||||||
)
|
)
|
||||||
# Detect components with listeners that are not frozen consistently
|
|
||||||
for name, proc in nlp.pipeline:
|
|
||||||
if isinstance(proc, ty.ListenedToComponent):
|
|
||||||
# Remove listeners not in the pipeline
|
|
||||||
listener_names = proc.listening_components
|
|
||||||
unused_listener_names = [
|
|
||||||
ll for ll in listener_names if ll not in nlp.pipe_names
|
|
||||||
]
|
|
||||||
for listener_name in unused_listener_names:
|
|
||||||
for listener in proc.listener_map.get(listener_name, []):
|
|
||||||
proc.remove_listener(listener, listener_name)
|
|
||||||
|
|
||||||
for listener_name in proc.listening_components:
|
|
||||||
# e.g. tok2vec/transformer
|
|
||||||
# If it's a component sourced from another pipeline, we check if
|
|
||||||
# the tok2vec listeners should be replaced with standalone tok2vec
|
|
||||||
# models (e.g. so component can be frozen without its performance
|
|
||||||
# degrading when other components/tok2vec are updated)
|
|
||||||
paths = sourced.get(listener_name, {}).get("replace_listeners", [])
|
|
||||||
if paths:
|
|
||||||
nlp.replace_listeners(name, listener_name, paths)
|
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
def replace_listeners(
|
def replace_listeners(
|
||||||
|
@ -2081,7 +2082,7 @@ class Language:
|
||||||
useful when training a pipeline with components sourced from an existing
|
useful when training a pipeline with components sourced from an existing
|
||||||
pipeline: if multiple components (e.g. tagger, parser, NER) listen to
|
pipeline: if multiple components (e.g. tagger, parser, NER) listen to
|
||||||
the same tok2vec component, but some of them are frozen and not updated,
|
the same tok2vec component, but some of them are frozen and not updated,
|
||||||
their performance may degrade significally as the tok2vec component is
|
their performance may degrade significantly as the tok2vec component is
|
||||||
updated with new data. To prevent this, listeners can be replaced with
|
updated with new data. To prevent this, listeners can be replaced with
|
||||||
a standalone tok2vec layer that is owned by the component and doesn't
|
a standalone tok2vec layer that is owned by the component and doesn't
|
||||||
change if the component isn't updated.
|
change if the component isn't updated.
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# cython: embedsignature=True
|
# cython: embedsignature=True
|
||||||
# Compiler crashes on memory view coercion without this. Should report bug.
|
# Compiler crashes on memory view coercion without this. Should report bug.
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
from cython.view cimport array as cvarray
|
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
|
|
||||||
np.import_array()
|
np.import_array()
|
||||||
|
@ -137,9 +136,11 @@ cdef class Lexeme:
|
||||||
if hasattr(other, "orth"):
|
if hasattr(other, "orth"):
|
||||||
if self.c.orth == other.orth:
|
if self.c.orth == other.orth:
|
||||||
return 1.0
|
return 1.0
|
||||||
elif hasattr(other, "__len__") and len(other) == 1 \
|
elif (
|
||||||
and hasattr(other[0], "orth"):
|
hasattr(other, "__len__") and len(other) == 1
|
||||||
if self.c.orth == other[0].orth:
|
and hasattr(other[0], "orth")
|
||||||
|
and self.c.orth == other[0].orth
|
||||||
|
):
|
||||||
return 1.0
|
return 1.0
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
warnings.warn(Warnings.W008.format(obj="Lexeme"))
|
warnings.warn(Warnings.W008.format(obj="Lexeme"))
|
||||||
|
|
|
@ -108,7 +108,7 @@ cdef class DependencyMatcher:
|
||||||
key (str): The match ID.
|
key (str): The match ID.
|
||||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||||
"""
|
"""
|
||||||
return self.has_key(key)
|
return self.has_key(key) # no-cython-lint: W601
|
||||||
|
|
||||||
def _validate_input(self, pattern, key):
|
def _validate_input(self, pattern, key):
|
||||||
idx = 0
|
idx = 0
|
||||||
|
@ -264,7 +264,7 @@ cdef class DependencyMatcher:
|
||||||
|
|
||||||
def remove(self, key):
|
def remove(self, key):
|
||||||
key = self._normalize_key(key)
|
key = self._normalize_key(key)
|
||||||
if not key in self._patterns:
|
if key not in self._patterns:
|
||||||
raise ValueError(Errors.E175.format(key=key))
|
raise ValueError(Errors.E175.format(key=key))
|
||||||
self._patterns.pop(key)
|
self._patterns.pop(key)
|
||||||
self._raw_patterns.pop(key)
|
self._raw_patterns.pop(key)
|
||||||
|
@ -382,7 +382,7 @@ cdef class DependencyMatcher:
|
||||||
return []
|
return []
|
||||||
return [doc[node].head]
|
return [doc[node].head]
|
||||||
|
|
||||||
def _gov(self,doc,node):
|
def _gov(self, doc, node):
|
||||||
return list(doc[node].children)
|
return list(doc[node].children)
|
||||||
|
|
||||||
def _dep_chain(self, doc, node):
|
def _dep_chain(self, doc, node):
|
||||||
|
|
|
@ -12,25 +12,13 @@ import warnings
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
from ..attrs cimport (
|
from ..attrs cimport DEP, ENT_IOB, ID, LEMMA, MORPH, NULL_ATTR, POS, TAG
|
||||||
DEP,
|
|
||||||
ENT_IOB,
|
|
||||||
ID,
|
|
||||||
LEMMA,
|
|
||||||
MORPH,
|
|
||||||
NULL_ATTR,
|
|
||||||
ORTH,
|
|
||||||
POS,
|
|
||||||
TAG,
|
|
||||||
attr_id_t,
|
|
||||||
)
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
|
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
|
||||||
from ..tokens.morphanalysis cimport MorphAnalysis
|
from ..tokens.morphanalysis cimport MorphAnalysis
|
||||||
from ..tokens.span cimport Span
|
from ..tokens.span cimport Span
|
||||||
from ..tokens.token cimport Token
|
from ..tokens.token cimport Token
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
from ..vocab cimport Vocab
|
|
||||||
|
|
||||||
from ..errors import Errors, MatchPatternError, Warnings
|
from ..errors import Errors, MatchPatternError, Warnings
|
||||||
from ..schemas import validate_token_pattern
|
from ..schemas import validate_token_pattern
|
||||||
|
@ -42,7 +30,6 @@ from ..attrs import IDS
|
||||||
from ..errors import Errors, MatchPatternError, Warnings
|
from ..errors import Errors, MatchPatternError, Warnings
|
||||||
from ..schemas import validate_token_pattern
|
from ..schemas import validate_token_pattern
|
||||||
from ..strings import get_string_id
|
from ..strings import get_string_id
|
||||||
from ..util import registry
|
|
||||||
from .levenshtein import levenshtein_compare
|
from .levenshtein import levenshtein_compare
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
@ -93,9 +80,9 @@ cdef class Matcher:
|
||||||
key (str): The match ID.
|
key (str): The match ID.
|
||||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||||
"""
|
"""
|
||||||
return self.has_key(key)
|
return self.has_key(key) # no-cython-lint: W601
|
||||||
|
|
||||||
def add(self, key, patterns, *, on_match=None, greedy: str=None):
|
def add(self, key, patterns, *, on_match=None, greedy: str = None):
|
||||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
||||||
key, an on_match callback, and one or more patterns.
|
key, an on_match callback, and one or more patterns.
|
||||||
|
|
||||||
|
@ -149,8 +136,13 @@ cdef class Matcher:
|
||||||
key = self._normalize_key(key)
|
key = self._normalize_key(key)
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
try:
|
try:
|
||||||
specs = _preprocess_pattern(pattern, self.vocab,
|
specs = _preprocess_pattern(
|
||||||
self._extensions, self._extra_predicates, self._fuzzy_compare)
|
pattern,
|
||||||
|
self.vocab,
|
||||||
|
self._extensions,
|
||||||
|
self._extra_predicates,
|
||||||
|
self._fuzzy_compare
|
||||||
|
)
|
||||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||||
for spec in specs:
|
for spec in specs:
|
||||||
for attr, _ in spec[1]:
|
for attr, _ in spec[1]:
|
||||||
|
@ -174,7 +166,7 @@ cdef class Matcher:
|
||||||
key (str): The ID of the match rule.
|
key (str): The ID of the match rule.
|
||||||
"""
|
"""
|
||||||
norm_key = self._normalize_key(key)
|
norm_key = self._normalize_key(key)
|
||||||
if not norm_key in self._patterns:
|
if norm_key not in self._patterns:
|
||||||
raise ValueError(Errors.E175.format(key=key))
|
raise ValueError(Errors.E175.format(key=key))
|
||||||
self._patterns.pop(norm_key)
|
self._patterns.pop(norm_key)
|
||||||
self._callbacks.pop(norm_key)
|
self._callbacks.pop(norm_key)
|
||||||
|
@ -274,8 +266,15 @@ cdef class Matcher:
|
||||||
if self.patterns.empty():
|
if self.patterns.empty():
|
||||||
matches = []
|
matches = []
|
||||||
else:
|
else:
|
||||||
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
|
matches = find_matches(
|
||||||
extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
|
&self.patterns[0],
|
||||||
|
self.patterns.size(),
|
||||||
|
doclike,
|
||||||
|
length,
|
||||||
|
extensions=self._extensions,
|
||||||
|
predicates=self._extra_predicates,
|
||||||
|
with_alignments=with_alignments
|
||||||
|
)
|
||||||
final_matches = []
|
final_matches = []
|
||||||
pairs_by_id = {}
|
pairs_by_id = {}
|
||||||
# For each key, either add all matches, or only the filtered,
|
# For each key, either add all matches, or only the filtered,
|
||||||
|
@ -373,7 +372,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
cdef vector[MatchC] matches
|
cdef vector[MatchC] matches
|
||||||
cdef vector[vector[MatchAlignmentC]] align_states
|
cdef vector[vector[MatchAlignmentC]] align_states
|
||||||
cdef vector[vector[MatchAlignmentC]] align_matches
|
cdef vector[vector[MatchAlignmentC]] align_matches
|
||||||
cdef PatternStateC state
|
|
||||||
cdef int i, j, nr_extra_attr
|
cdef int i, j, nr_extra_attr
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
output = []
|
output = []
|
||||||
|
@ -395,14 +393,22 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
value = token.vocab.strings[value]
|
value = token.vocab.strings[value]
|
||||||
extra_attr_values[i * nr_extra_attr + index] = value
|
extra_attr_values[i * nr_extra_attr + index] = value
|
||||||
# Main loop
|
# Main loop
|
||||||
cdef int nr_predicate = len(predicates)
|
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
for j in range(n):
|
for j in range(n):
|
||||||
states.push_back(PatternStateC(patterns[j], i, 0))
|
states.push_back(PatternStateC(patterns[j], i, 0))
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_states.resize(states.size())
|
align_states.resize(states.size())
|
||||||
transition_states(states, matches, align_states, align_matches, predicate_cache,
|
transition_states(
|
||||||
doclike[i], extra_attr_values, predicates, with_alignments)
|
states,
|
||||||
|
matches,
|
||||||
|
align_states,
|
||||||
|
align_matches,
|
||||||
|
predicate_cache,
|
||||||
|
doclike[i],
|
||||||
|
extra_attr_values,
|
||||||
|
predicates,
|
||||||
|
with_alignments
|
||||||
|
)
|
||||||
extra_attr_values += nr_extra_attr
|
extra_attr_values += nr_extra_attr
|
||||||
predicate_cache += len(predicates)
|
predicate_cache += len(predicates)
|
||||||
# Handle matches that end in 0-width patterns
|
# Handle matches that end in 0-width patterns
|
||||||
|
@ -428,18 +434,28 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
cdef void transition_states(
|
||||||
vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
|
vector[PatternStateC]& states,
|
||||||
|
vector[MatchC]& matches,
|
||||||
|
vector[vector[MatchAlignmentC]]& align_states,
|
||||||
|
vector[vector[MatchAlignmentC]]& align_matches,
|
||||||
int8_t* cached_py_predicates,
|
int8_t* cached_py_predicates,
|
||||||
Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *:
|
Token token,
|
||||||
|
const attr_t* extra_attrs,
|
||||||
|
py_predicates,
|
||||||
|
bint with_alignments
|
||||||
|
) except *:
|
||||||
cdef int q = 0
|
cdef int q = 0
|
||||||
cdef vector[PatternStateC] new_states
|
cdef vector[PatternStateC] new_states
|
||||||
cdef vector[vector[MatchAlignmentC]] align_new_states
|
cdef vector[vector[MatchAlignmentC]] align_new_states
|
||||||
cdef int nr_predicate = len(py_predicates)
|
|
||||||
for i in range(states.size()):
|
for i in range(states.size()):
|
||||||
if states[i].pattern.nr_py >= 1:
|
if states[i].pattern.nr_py >= 1:
|
||||||
update_predicate_cache(cached_py_predicates,
|
update_predicate_cache(
|
||||||
states[i].pattern, token, py_predicates)
|
cached_py_predicates,
|
||||||
|
states[i].pattern,
|
||||||
|
token,
|
||||||
|
py_predicates
|
||||||
|
)
|
||||||
action = get_action(states[i], token.c, extra_attrs,
|
action = get_action(states[i], token.c, extra_attrs,
|
||||||
cached_py_predicates)
|
cached_py_predicates)
|
||||||
if action == REJECT:
|
if action == REJECT:
|
||||||
|
@ -475,8 +491,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
||||||
align_new_states.push_back(align_states[q])
|
align_new_states.push_back(align_states[q])
|
||||||
states[q].pattern += 1
|
states[q].pattern += 1
|
||||||
if states[q].pattern.nr_py != 0:
|
if states[q].pattern.nr_py != 0:
|
||||||
update_predicate_cache(cached_py_predicates,
|
update_predicate_cache(
|
||||||
states[q].pattern, token, py_predicates)
|
cached_py_predicates,
|
||||||
|
states[q].pattern,
|
||||||
|
token,
|
||||||
|
py_predicates
|
||||||
|
)
|
||||||
action = get_action(states[q], token.c, extra_attrs,
|
action = get_action(states[q], token.c, extra_attrs,
|
||||||
cached_py_predicates)
|
cached_py_predicates)
|
||||||
# Update alignment before the transition of current state
|
# Update alignment before the transition of current state
|
||||||
|
@ -492,8 +512,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
||||||
ent_id = get_ent_id(state.pattern)
|
ent_id = get_ent_id(state.pattern)
|
||||||
if action == MATCH:
|
if action == MATCH:
|
||||||
matches.push_back(
|
matches.push_back(
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
MatchC(
|
||||||
length=state.length+1))
|
pattern_id=ent_id,
|
||||||
|
start=state.start,
|
||||||
|
length=state.length+1
|
||||||
|
)
|
||||||
|
)
|
||||||
# `align_matches` always corresponds to `matches` 1:1
|
# `align_matches` always corresponds to `matches` 1:1
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_matches.push_back(align_states[q])
|
align_matches.push_back(align_states[q])
|
||||||
|
@ -501,23 +525,35 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
||||||
# push match without last token if length > 0
|
# push match without last token if length > 0
|
||||||
if state.length > 0:
|
if state.length > 0:
|
||||||
matches.push_back(
|
matches.push_back(
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
MatchC(
|
||||||
length=state.length))
|
pattern_id=ent_id,
|
||||||
|
start=state.start,
|
||||||
|
length=state.length
|
||||||
|
)
|
||||||
|
)
|
||||||
# MATCH_DOUBLE emits matches twice,
|
# MATCH_DOUBLE emits matches twice,
|
||||||
# add one more to align_matches in order to keep 1:1 relationship
|
# add one more to align_matches in order to keep 1:1 relationship
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_matches.push_back(align_states[q])
|
align_matches.push_back(align_states[q])
|
||||||
# push match with last token
|
# push match with last token
|
||||||
matches.push_back(
|
matches.push_back(
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
MatchC(
|
||||||
length=state.length+1))
|
pattern_id=ent_id,
|
||||||
|
start=state.start,
|
||||||
|
length=state.length + 1
|
||||||
|
)
|
||||||
|
)
|
||||||
# `align_matches` always corresponds to `matches` 1:1
|
# `align_matches` always corresponds to `matches` 1:1
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_matches.push_back(align_states[q])
|
align_matches.push_back(align_states[q])
|
||||||
elif action == MATCH_REJECT:
|
elif action == MATCH_REJECT:
|
||||||
matches.push_back(
|
matches.push_back(
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
MatchC(
|
||||||
length=state.length))
|
pattern_id=ent_id,
|
||||||
|
start=state.start,
|
||||||
|
length=state.length
|
||||||
|
)
|
||||||
|
)
|
||||||
# `align_matches` always corresponds to `matches` 1:1
|
# `align_matches` always corresponds to `matches` 1:1
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_matches.push_back(align_states[q])
|
align_matches.push_back(align_states[q])
|
||||||
|
@ -540,8 +576,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
||||||
align_states.push_back(align_new_states[i])
|
align_states.push_back(align_new_states[i])
|
||||||
|
|
||||||
|
|
||||||
cdef int update_predicate_cache(int8_t* cache,
|
cdef int update_predicate_cache(
|
||||||
const TokenPatternC* pattern, Token token, predicates) except -1:
|
int8_t* cache,
|
||||||
|
const TokenPatternC* pattern,
|
||||||
|
Token token,
|
||||||
|
predicates
|
||||||
|
) except -1:
|
||||||
# If the state references any extra predicates, check whether they match.
|
# If the state references any extra predicates, check whether they match.
|
||||||
# These are cached, so that we don't call these potentially expensive
|
# These are cached, so that we don't call these potentially expensive
|
||||||
# Python functions more than we need to.
|
# Python functions more than we need to.
|
||||||
|
@ -587,10 +627,12 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
|
||||||
else:
|
else:
|
||||||
state.pattern += 1
|
state.pattern += 1
|
||||||
|
|
||||||
|
cdef action_t get_action(
|
||||||
cdef action_t get_action(PatternStateC state,
|
PatternStateC state,
|
||||||
const TokenC* token, const attr_t* extra_attrs,
|
const TokenC * token,
|
||||||
const int8_t* predicate_matches) nogil:
|
const attr_t * extra_attrs,
|
||||||
|
const int8_t * predicate_matches
|
||||||
|
) nogil:
|
||||||
"""We need to consider:
|
"""We need to consider:
|
||||||
a) Does the token match the specification? [Yes, No]
|
a) Does the token match the specification? [Yes, No]
|
||||||
b) What's the quantifier? [1, 0+, ?]
|
b) What's the quantifier? [1, 0+, ?]
|
||||||
|
@ -700,9 +742,12 @@ cdef action_t get_action(PatternStateC state,
|
||||||
return RETRY
|
return RETRY
|
||||||
|
|
||||||
|
|
||||||
cdef int8_t get_is_match(PatternStateC state,
|
cdef int8_t get_is_match(
|
||||||
const TokenC* token, const attr_t* extra_attrs,
|
PatternStateC state,
|
||||||
const int8_t* predicate_matches) nogil:
|
const TokenC* token,
|
||||||
|
const attr_t* extra_attrs,
|
||||||
|
const int8_t* predicate_matches
|
||||||
|
) nogil:
|
||||||
for i in range(state.pattern.nr_py):
|
for i in range(state.pattern.nr_py):
|
||||||
if predicate_matches[state.pattern.py_predicates[i]] == -1:
|
if predicate_matches[state.pattern.py_predicates[i]] == -1:
|
||||||
return 0
|
return 0
|
||||||
|
@ -1108,8 +1153,9 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
|
def _get_extension_extra_predicates(
|
||||||
seen_predicates):
|
spec, extra_predicates, predicate_types, seen_predicates
|
||||||
|
):
|
||||||
output = []
|
output = []
|
||||||
for attr, value in spec.items():
|
for attr, value in spec.items():
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
|
@ -1138,7 +1184,7 @@ def _get_operators(spec):
|
||||||
return (ONE,)
|
return (ONE,)
|
||||||
elif spec["OP"] in lookup:
|
elif spec["OP"] in lookup:
|
||||||
return lookup[spec["OP"]]
|
return lookup[spec["OP"]]
|
||||||
#Min_max {n,m}
|
# Min_max {n,m}
|
||||||
elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
|
elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
|
||||||
# {n} --> {n,n} exactly n ONE,(n)
|
# {n} --> {n,n} exactly n ONE,(n)
|
||||||
# {n,m}--> {n,m} min of n, max of m ONE,(n),ZERO_ONE,(m)
|
# {n,m}--> {n,m} min of n, max of m ONE,(n),ZERO_ONE,(m)
|
||||||
|
@ -1149,8 +1195,8 @@ def _get_operators(spec):
|
||||||
min_max = min_max if "," in min_max else f"{min_max},{min_max}"
|
min_max = min_max if "," in min_max else f"{min_max},{min_max}"
|
||||||
n, m = min_max.split(",")
|
n, m = min_max.split(",")
|
||||||
|
|
||||||
#1. Either n or m is a blank string and the other is numeric -->isdigit
|
# 1. Either n or m is a blank string and the other is numeric -->isdigit
|
||||||
#2. Both are numeric and n <= m
|
# 2. Both are numeric and n <= m
|
||||||
if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
|
if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
|
||||||
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
|
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
|
||||||
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
|
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
|
||||||
|
|
|
@ -2,16 +2,14 @@
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from libc.stdint cimport uintptr_t
|
|
||||||
from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
|
from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG
|
from ..attrs cimport DEP, LEMMA, MORPH, POS, TAG
|
||||||
|
|
||||||
from ..attrs import IDS
|
from ..attrs import IDS
|
||||||
|
|
||||||
from ..structs cimport TokenC
|
|
||||||
from ..tokens.span cimport Span
|
from ..tokens.span cimport Span
|
||||||
from ..tokens.token cimport Token
|
from ..tokens.token cimport Token
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import warnings
|
||||||
from typing import Callable, List, Optional, Sequence, Tuple, cast
|
from typing import Callable, List, Optional, Sequence, Tuple, cast
|
||||||
|
|
||||||
from thinc.api import Model, Ops, registry
|
from thinc.api import Model, Ops, registry
|
||||||
|
@ -5,7 +6,8 @@ from thinc.initializers import glorot_uniform_init
|
||||||
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
|
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
|
||||||
from thinc.util import partial
|
from thinc.util import partial
|
||||||
|
|
||||||
from ..errors import Errors
|
from ..attrs import ORTH
|
||||||
|
from ..errors import Errors, Warnings
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..vectors import Mode
|
from ..vectors import Mode
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
|
@ -24,6 +26,8 @@ def StaticVectors(
|
||||||
linear projection to control the dimensionality. If a dropout rate is
|
linear projection to control the dimensionality. If a dropout rate is
|
||||||
specified, the dropout is applied per dimension over the whole batch.
|
specified, the dropout is applied per dimension over the whole batch.
|
||||||
"""
|
"""
|
||||||
|
if key_attr != "ORTH":
|
||||||
|
warnings.warn(Warnings.W125, DeprecationWarning)
|
||||||
return Model(
|
return Model(
|
||||||
"static_vectors",
|
"static_vectors",
|
||||||
forward,
|
forward,
|
||||||
|
@ -40,9 +44,9 @@ def forward(
|
||||||
token_count = sum(len(doc) for doc in docs)
|
token_count = sum(len(doc) for doc in docs)
|
||||||
if not token_count:
|
if not token_count:
|
||||||
return _handle_empty(model.ops, model.get_dim("nO"))
|
return _handle_empty(model.ops, model.get_dim("nO"))
|
||||||
key_attr: int = model.attrs["key_attr"]
|
|
||||||
keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
|
|
||||||
vocab: Vocab = docs[0].vocab
|
vocab: Vocab = docs[0].vocab
|
||||||
|
key_attr: int = getattr(vocab.vectors, "attr", ORTH)
|
||||||
|
keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
|
||||||
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
|
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
|
||||||
if vocab.vectors.mode == Mode.default:
|
if vocab.vectors.mode == Mode.default:
|
||||||
V = model.ops.asarray(vocab.vectors.data)
|
V = model.ops.asarray(vocab.vectors.data)
|
||||||
|
|
|
@ -246,6 +246,7 @@ cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph,
|
||||||
n_results += 1
|
n_results += 1
|
||||||
return n_results
|
return n_results
|
||||||
|
|
||||||
|
|
||||||
def unpickle_morphology(strings, tags):
|
def unpickle_morphology(strings, tags):
|
||||||
cdef Morphology morphology = Morphology(strings)
|
cdef Morphology morphology = Morphology(strings)
|
||||||
for tag in tags:
|
for tag in tags:
|
||||||
|
|
|
@ -46,11 +46,18 @@ cdef struct EditTreeC:
|
||||||
bint is_match_node
|
bint is_match_node
|
||||||
NodeC inner
|
NodeC inner
|
||||||
|
|
||||||
cdef inline EditTreeC edittree_new_match(len_t prefix_len, len_t suffix_len,
|
cdef inline EditTreeC edittree_new_match(
|
||||||
uint32_t prefix_tree, uint32_t suffix_tree):
|
len_t prefix_len,
|
||||||
cdef MatchNodeC match_node = MatchNodeC(prefix_len=prefix_len,
|
len_t suffix_len,
|
||||||
suffix_len=suffix_len, prefix_tree=prefix_tree,
|
uint32_t prefix_tree,
|
||||||
suffix_tree=suffix_tree)
|
uint32_t suffix_tree
|
||||||
|
):
|
||||||
|
cdef MatchNodeC match_node = MatchNodeC(
|
||||||
|
prefix_len=prefix_len,
|
||||||
|
suffix_len=suffix_len,
|
||||||
|
prefix_tree=prefix_tree,
|
||||||
|
suffix_tree=suffix_tree
|
||||||
|
)
|
||||||
cdef NodeC inner = NodeC(match_node=match_node)
|
cdef NodeC inner = NodeC(match_node=match_node)
|
||||||
return EditTreeC(is_match_node=True, inner=inner)
|
return EditTreeC(is_match_node=True, inner=inner)
|
||||||
|
|
||||||
|
|
|
@ -5,8 +5,6 @@ from libc.string cimport memset
|
||||||
from libcpp.pair cimport pair
|
from libcpp.pair cimport pair
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from ...typedefs cimport hash_t
|
from ...typedefs cimport hash_t
|
||||||
|
|
||||||
from ... import util
|
from ... import util
|
||||||
|
@ -25,17 +23,16 @@ cdef LCS find_lcs(str source, str target):
|
||||||
target (str): The second string.
|
target (str): The second string.
|
||||||
RETURNS (LCS): The spans of the longest common subsequences.
|
RETURNS (LCS): The spans of the longest common subsequences.
|
||||||
"""
|
"""
|
||||||
cdef Py_ssize_t source_len = len(source)
|
|
||||||
cdef Py_ssize_t target_len = len(target)
|
cdef Py_ssize_t target_len = len(target)
|
||||||
cdef size_t longest_align = 0;
|
cdef size_t longest_align = 0
|
||||||
cdef int source_idx, target_idx
|
cdef int source_idx, target_idx
|
||||||
cdef LCS lcs
|
cdef LCS lcs
|
||||||
cdef Py_UCS4 source_cp, target_cp
|
cdef Py_UCS4 source_cp, target_cp
|
||||||
|
|
||||||
memset(&lcs, 0, sizeof(lcs))
|
memset(&lcs, 0, sizeof(lcs))
|
||||||
|
|
||||||
cdef vector[size_t] prev_aligns = vector[size_t](target_len);
|
cdef vector[size_t] prev_aligns = vector[size_t](target_len)
|
||||||
cdef vector[size_t] cur_aligns = vector[size_t](target_len);
|
cdef vector[size_t] cur_aligns = vector[size_t](target_len)
|
||||||
|
|
||||||
for (source_idx, source_cp) in enumerate(source):
|
for (source_idx, source_cp) in enumerate(source):
|
||||||
for (target_idx, target_cp) in enumerate(target):
|
for (target_idx, target_cp) in enumerate(target):
|
||||||
|
@ -89,7 +86,7 @@ cdef class EditTrees:
|
||||||
cdef LCS lcs = find_lcs(form, lemma)
|
cdef LCS lcs = find_lcs(form, lemma)
|
||||||
|
|
||||||
cdef EditTreeC tree
|
cdef EditTreeC tree
|
||||||
cdef uint32_t tree_id, prefix_tree, suffix_tree
|
cdef uint32_t prefix_tree, suffix_tree
|
||||||
if lcs_is_empty(lcs):
|
if lcs_is_empty(lcs):
|
||||||
tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
|
tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
|
||||||
else:
|
else:
|
||||||
|
@ -289,6 +286,7 @@ def _tree2dict(tree):
|
||||||
tree = tree["inner"]["subst_node"]
|
tree = tree["inner"]["subst_node"]
|
||||||
return(dict(tree))
|
return(dict(tree))
|
||||||
|
|
||||||
|
|
||||||
def _dict2tree(tree):
|
def _dict2tree(tree):
|
||||||
errors = validate_edit_tree(tree)
|
errors = validate_edit_tree(tree)
|
||||||
if errors:
|
if errors:
|
||||||
|
|
|
@ -1,12 +1,14 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
cimport numpy as np
|
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from cpython.ref cimport Py_XDECREF, PyObject
|
from thinc.extra.search cimport Beam
|
||||||
|
|
||||||
from ...typedefs cimport class_t, hash_t
|
from thinc.extra.search import MaxViolation
|
||||||
|
|
||||||
|
from thinc.extra.search cimport MaxViolation
|
||||||
|
|
||||||
|
from ...typedefs cimport class_t
|
||||||
from .transition_system cimport Transition, TransitionSystem
|
from .transition_system cimport Transition, TransitionSystem
|
||||||
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
@ -146,7 +148,6 @@ def update_beam(TransitionSystem moves, states, golds, model, int width, beam_de
|
||||||
cdef MaxViolation violn
|
cdef MaxViolation violn
|
||||||
pbeam = BeamBatch(moves, states, golds, width=width, density=beam_density)
|
pbeam = BeamBatch(moves, states, golds, width=width, density=beam_density)
|
||||||
gbeam = BeamBatch(moves, states, golds, width=width, density=0.0)
|
gbeam = BeamBatch(moves, states, golds, width=width, density=0.0)
|
||||||
cdef StateClass state
|
|
||||||
beam_maps = []
|
beam_maps = []
|
||||||
backprops = []
|
backprops = []
|
||||||
violns = [MaxViolation() for _ in range(len(states))]
|
violns = [MaxViolation() for _ in range(len(states))]
|
||||||
|
|
|
@ -280,7 +280,6 @@ cdef cppclass StateC:
|
||||||
|
|
||||||
return n
|
return n
|
||||||
|
|
||||||
|
|
||||||
int n_L(int head) nogil const:
|
int n_L(int head) nogil const:
|
||||||
return n_arcs(this._left_arcs, head)
|
return n_arcs(this._left_arcs, head)
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ from ...strings cimport hash_string
|
||||||
from ...structs cimport TokenC
|
from ...structs cimport TokenC
|
||||||
from ...tokens.doc cimport Doc, set_children_from_heads
|
from ...tokens.doc cimport Doc, set_children_from_heads
|
||||||
from ...tokens.token cimport MISSING_DEP
|
from ...tokens.token cimport MISSING_DEP
|
||||||
from ...typedefs cimport attr_t, hash_t
|
from ...typedefs cimport attr_t
|
||||||
|
|
||||||
from ...training import split_bilu_label
|
from ...training import split_bilu_label
|
||||||
|
|
||||||
|
@ -68,8 +68,9 @@ cdef struct GoldParseStateC:
|
||||||
weight_t pop_cost
|
weight_t pop_cost
|
||||||
|
|
||||||
|
|
||||||
cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
|
cdef GoldParseStateC create_gold_state(
|
||||||
heads, labels, sent_starts) except *:
|
Pool mem, const StateC* state, heads, labels, sent_starts
|
||||||
|
) except *:
|
||||||
cdef GoldParseStateC gs
|
cdef GoldParseStateC gs
|
||||||
gs.length = len(heads)
|
gs.length = len(heads)
|
||||||
gs.stride = 1
|
gs.stride = 1
|
||||||
|
@ -82,7 +83,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
|
||||||
gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0]))
|
gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0]))
|
||||||
|
|
||||||
for i, is_sent_start in enumerate(sent_starts):
|
for i, is_sent_start in enumerate(sent_starts):
|
||||||
if is_sent_start == True:
|
if is_sent_start is True:
|
||||||
gs.state_bits[i] = set_state_flag(
|
gs.state_bits[i] = set_state_flag(
|
||||||
gs.state_bits[i],
|
gs.state_bits[i],
|
||||||
IS_SENT_START,
|
IS_SENT_START,
|
||||||
|
@ -210,6 +211,7 @@ cdef class ArcEagerGold:
|
||||||
def update(self, StateClass stcls):
|
def update(self, StateClass stcls):
|
||||||
update_gold_state(&self.c, stcls.c)
|
update_gold_state(&self.c, stcls.c)
|
||||||
|
|
||||||
|
|
||||||
def _get_aligned_sent_starts(example):
|
def _get_aligned_sent_starts(example):
|
||||||
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
||||||
If the reference has not sentence starts, return a list of None values.
|
If the reference has not sentence starts, return a list of None values.
|
||||||
|
@ -524,7 +526,6 @@ cdef class Break:
|
||||||
"""
|
"""
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int i
|
|
||||||
if st.buffer_length() < 2:
|
if st.buffer_length() < 2:
|
||||||
return False
|
return False
|
||||||
elif st.B(1) != st.B(0) + 1:
|
elif st.B(1) != st.B(0) + 1:
|
||||||
|
@ -556,8 +557,8 @@ cdef class Break:
|
||||||
cost -= 1
|
cost -= 1
|
||||||
if gold.heads[si] == b0:
|
if gold.heads[si] == b0:
|
||||||
cost -= 1
|
cost -= 1
|
||||||
if not is_sent_start(gold, state.B(1)) \
|
if not is_sent_start(gold, state.B(1)) and\
|
||||||
and not is_sent_start_unknown(gold, state.B(1)):
|
not is_sent_start_unknown(gold, state.B(1)):
|
||||||
cost += 1
|
cost += 1
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
|
@ -805,7 +806,6 @@ cdef class ArcEager(TransitionSystem):
|
||||||
raise TypeError(Errors.E909.format(name="ArcEagerGold"))
|
raise TypeError(Errors.E909.format(name="ArcEagerGold"))
|
||||||
cdef ArcEagerGold gold_ = gold
|
cdef ArcEagerGold gold_ = gold
|
||||||
gold_state = gold_.c
|
gold_state = gold_.c
|
||||||
n_gold = 0
|
|
||||||
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
||||||
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
|
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
|
||||||
else:
|
else:
|
||||||
|
@ -878,7 +878,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
print("Gold")
|
print("Gold")
|
||||||
for token in example.y:
|
for token in example.y:
|
||||||
print(token.i, token.text, token.dep_, token.head.text)
|
print(token.i, token.text, token.dep_, token.head.text)
|
||||||
aligned_heads, aligned_labels = example.get_aligned_parse()
|
aligned_heads, _aligned_labels = example.get_aligned_parse()
|
||||||
print("Aligned heads")
|
print("Aligned heads")
|
||||||
for i, head in enumerate(aligned_heads):
|
for i, head in enumerate(aligned_heads):
|
||||||
print(example.x[i], example.x[head] if head is not None else "__")
|
print(example.x[i], example.x[head] if head is not None else "__")
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
import os
|
|
||||||
import random
|
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
from libcpp.memory cimport shared_ptr
|
from libcpp.memory cimport shared_ptr
|
||||||
|
@ -14,7 +11,7 @@ from ...tokens.span import Span
|
||||||
|
|
||||||
from ...attrs cimport IS_SPACE
|
from ...attrs cimport IS_SPACE
|
||||||
from ...lexeme cimport Lexeme
|
from ...lexeme cimport Lexeme
|
||||||
from ...structs cimport SpanC, TokenC
|
from ...structs cimport SpanC
|
||||||
from ...tokens.span cimport Span
|
from ...tokens.span cimport Span
|
||||||
from ...typedefs cimport attr_t, weight_t
|
from ...typedefs cimport attr_t, weight_t
|
||||||
|
|
||||||
|
@ -142,7 +139,6 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
for entity_type in kwargs.get('entity_types', []):
|
for entity_type in kwargs.get('entity_types', []):
|
||||||
for action in (BEGIN, IN, LAST, UNIT):
|
for action in (BEGIN, IN, LAST, UNIT):
|
||||||
actions[action][entity_type] = 1
|
actions[action][entity_type] = 1
|
||||||
moves = ('M', 'B', 'I', 'L', 'U')
|
|
||||||
for example in kwargs.get('examples', []):
|
for example in kwargs.get('examples', []):
|
||||||
for token in example.y:
|
for token in example.y:
|
||||||
ent_type = token.ent_type_
|
ent_type = token.ent_type_
|
||||||
|
@ -324,7 +320,6 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
raise TypeError(Errors.E909.format(name="BiluoGold"))
|
raise TypeError(Errors.E909.format(name="BiluoGold"))
|
||||||
cdef BiluoGold gold_ = gold
|
cdef BiluoGold gold_ = gold
|
||||||
gold_state = gold_.c
|
gold_state = gold_.c
|
||||||
n_gold = 0
|
|
||||||
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
||||||
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
|
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
|
||||||
else:
|
else:
|
||||||
|
@ -487,10 +482,8 @@ cdef class In:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
||||||
gold = <GoldNERStateC*>_gold
|
gold = <GoldNERStateC*>_gold
|
||||||
move = IN
|
|
||||||
cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
|
cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
|
||||||
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
||||||
|
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
|
@ -550,12 +543,10 @@ cdef class Last:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
||||||
gold = <GoldNERStateC*>_gold
|
gold = <GoldNERStateC*>_gold
|
||||||
move = LAST
|
|
||||||
b0 = s.B(0)
|
b0 = s.B(0)
|
||||||
ent_start = s.E(0)
|
ent_start = s.E(0)
|
||||||
|
|
||||||
cdef int g_act = gold.ner[b0].move
|
cdef int g_act = gold.ner[b0].move
|
||||||
cdef attr_t g_tag = gold.ner[b0].label
|
|
||||||
|
|
||||||
cdef int cost = 0
|
cdef int cost = 0
|
||||||
|
|
||||||
|
@ -655,7 +646,6 @@ cdef class Unit:
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Out:
|
cdef class Out:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
|
@ -678,7 +668,6 @@ cdef class Out:
|
||||||
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
||||||
gold = <GoldNERStateC*>_gold
|
gold = <GoldNERStateC*>_gold
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
|
||||||
cdef weight_t cost = 0
|
cdef weight_t cost = 0
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -125,14 +125,17 @@ def decompose(label):
|
||||||
def is_decorated(label):
|
def is_decorated(label):
|
||||||
return DELIMITER in label
|
return DELIMITER in label
|
||||||
|
|
||||||
|
|
||||||
def count_decorated_labels(gold_data):
|
def count_decorated_labels(gold_data):
|
||||||
freqs = {}
|
freqs = {}
|
||||||
for example in gold_data:
|
for example in gold_data:
|
||||||
proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
|
proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
|
||||||
example.get_aligned("DEP"))
|
example.get_aligned("DEP"))
|
||||||
# set the label to ROOT for each root dependent
|
# set the label to ROOT for each root dependent
|
||||||
deco_deps = ['ROOT' if head == i else deco_deps[i]
|
deco_deps = [
|
||||||
for i, head in enumerate(proj_heads)]
|
'ROOT' if head == i else deco_deps[i]
|
||||||
|
for i, head in enumerate(proj_heads)
|
||||||
|
]
|
||||||
# count label frequencies
|
# count label frequencies
|
||||||
for label in deco_deps:
|
for label in deco_deps:
|
||||||
if is_decorated(label):
|
if is_decorated(label):
|
||||||
|
@ -160,9 +163,9 @@ def projectivize(heads, labels):
|
||||||
|
|
||||||
|
|
||||||
cdef vector[int] _heads_to_c(heads):
|
cdef vector[int] _heads_to_c(heads):
|
||||||
cdef vector[int] c_heads;
|
cdef vector[int] c_heads
|
||||||
for head in heads:
|
for head in heads:
|
||||||
if head == None:
|
if head is None:
|
||||||
c_heads.push_back(-1)
|
c_heads.push_back(-1)
|
||||||
else:
|
else:
|
||||||
assert head < len(heads)
|
assert head < len(heads)
|
||||||
|
@ -199,6 +202,7 @@ def _decorate(heads, proj_heads, labels):
|
||||||
deco_labels.append(labels[tokenid])
|
deco_labels.append(labels[tokenid])
|
||||||
return deco_labels
|
return deco_labels
|
||||||
|
|
||||||
|
|
||||||
def get_smallest_nonproj_arc_slow(heads):
|
def get_smallest_nonproj_arc_slow(heads):
|
||||||
cdef vector[int] c_heads = _heads_to_c(heads)
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||||
return _get_smallest_nonproj_arc(c_heads)
|
return _get_smallest_nonproj_arc(c_heads)
|
||||||
|
|
|
@ -1,6 +1,4 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
import numpy
|
|
||||||
|
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
from ...tokens.doc cimport Doc
|
from ...tokens.doc cimport Doc
|
||||||
|
@ -42,11 +40,11 @@ cdef class StateClass:
|
||||||
cdef vector[ArcC] arcs
|
cdef vector[ArcC] arcs
|
||||||
self.c.get_arcs(&arcs)
|
self.c.get_arcs(&arcs)
|
||||||
return list(arcs)
|
return list(arcs)
|
||||||
#py_arcs = []
|
# py_arcs = []
|
||||||
#for arc in arcs:
|
# for arc in arcs:
|
||||||
# if arc.head != -1 and arc.child != -1:
|
# if arc.head != -1 and arc.child != -1:
|
||||||
# py_arcs.append((arc.head, arc.child, arc.label))
|
# py_arcs.append((arc.head, arc.child, arc.label))
|
||||||
#return arcs
|
# return arcs
|
||||||
|
|
||||||
def add_arc(self, int head, int child, int label):
|
def add_arc(self, int head, int child, int label):
|
||||||
self.c.add_arc(head, child, label)
|
self.c.add_arc(head, child, label)
|
||||||
|
@ -138,7 +136,7 @@ cdef class StateClass:
|
||||||
|
|
||||||
def at_break(self):
|
def at_break(self):
|
||||||
return False
|
return False
|
||||||
#return self.c.at_break()
|
# return self.c.at_break()
|
||||||
|
|
||||||
def has_head(self, int i):
|
def has_head(self, int i):
|
||||||
return self.c.has_head(i)
|
return self.c.has_head(i)
|
||||||
|
|
|
@ -20,11 +20,15 @@ cdef struct Transition:
|
||||||
int (*do)(StateC* state, attr_t label) nogil
|
int (*do)(StateC* state, attr_t label) nogil
|
||||||
|
|
||||||
|
|
||||||
ctypedef weight_t (*get_cost_func_t)(const StateC* state, const void* gold,
|
ctypedef weight_t (*get_cost_func_t)(
|
||||||
attr_tlabel) nogil
|
const StateC* state, const void* gold, attr_tlabel
|
||||||
ctypedef weight_t (*move_cost_func_t)(const StateC* state, const void* gold) nogil
|
) nogil
|
||||||
ctypedef weight_t (*label_cost_func_t)(const StateC* state, const void*
|
ctypedef weight_t (*move_cost_func_t)(
|
||||||
gold, attr_t label) nogil
|
const StateC* state, const void* gold
|
||||||
|
) nogil
|
||||||
|
ctypedef weight_t (*label_cost_func_t)(
|
||||||
|
const StateC* state, const void* gold, attr_t label
|
||||||
|
) nogil
|
||||||
|
|
||||||
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
|
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
|
||||||
|
|
||||||
|
|
|
@ -10,9 +10,7 @@ from collections import Counter
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
from ...structs cimport TokenC
|
from ...structs cimport TokenC
|
||||||
from ...tokens.doc cimport Doc
|
|
||||||
from ...typedefs cimport attr_t, weight_t
|
from ...typedefs cimport attr_t, weight_t
|
||||||
from . cimport _beam_utils
|
|
||||||
from ._parser_utils cimport arg_max_if_valid
|
from ._parser_utils cimport arg_max_if_valid
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
|
|
||||||
|
@ -270,7 +268,6 @@ cdef class TransitionSystem:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple()):
|
def to_bytes(self, exclude=tuple()):
|
||||||
transitions = []
|
|
||||||
serializers = {
|
serializers = {
|
||||||
'moves': lambda: srsly.json_dumps(self.labels),
|
'moves': lambda: srsly.json_dumps(self.labels),
|
||||||
'strings': lambda: self.strings.to_bytes(),
|
'strings': lambda: self.strings.to_bytes(),
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Callable, Iterable, Optional
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Config, Model
|
from thinc.api import Config, Model
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Union
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import Config, Model
|
from thinc.api import Config, Model
|
||||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||||
from thinc.types import Floats2d, Ints1d
|
|
||||||
|
|
||||||
from ..morphology cimport Morphology
|
from ..morphology cimport Morphology
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
@ -16,11 +15,9 @@ from ..errors import Errors
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..parts_of_speech import IDS as POS_IDS
|
from ..parts_of_speech import IDS as POS_IDS
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..symbols import POS
|
|
||||||
from ..training import validate_examples, validate_get_examples
|
from ..training import validate_examples, validate_get_examples
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from .pipe import deserialize_config
|
from .tagger import Tagger
|
||||||
from .tagger import ActivationsT, Tagger
|
|
||||||
|
|
||||||
# See #9050
|
# See #9050
|
||||||
BACKWARD_OVERWRITE = True
|
BACKWARD_OVERWRITE = True
|
||||||
|
@ -86,8 +83,11 @@ def morphologizer_score(examples, **kwargs):
|
||||||
results = {}
|
results = {}
|
||||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||||
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
|
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
|
||||||
results.update(Scorer.score_token_attr_per_feat(examples,
|
results.update(
|
||||||
"morph", getter=morph_key_getter, **kwargs))
|
Scorer.score_token_attr_per_feat(
|
||||||
|
examples, "morph", getter=morph_key_getter, **kwargs
|
||||||
|
)
|
||||||
|
)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
@ -249,7 +249,6 @@ class Morphologizer(Tagger):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef Vocab vocab = self.vocab
|
|
||||||
cdef bint overwrite = self.cfg["overwrite"]
|
cdef bint overwrite = self.cfg["overwrite"]
|
||||||
cdef bint extend = self.cfg["extend"]
|
cdef bint extend = self.cfg["extend"]
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Callable, Iterable, Optional
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Config, Model
|
from thinc.api import Config, Model
|
||||||
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..scorer import PRFScore, get_ner_prf
|
from ..scorer import get_ner_prf
|
||||||
from ..training import remove_bilu_prefix, validate_examples
|
from ..training import remove_bilu_prefix
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from ._parser_internals.ner import BiluoPushDown
|
from ._parser_internals.ner import BiluoPushDown
|
||||||
from ._parser_internals.transition_system import TransitionSystem
|
from ._parser_internals.transition_system import TransitionSystem
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
import warnings
|
import warnings
|
||||||
from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union
|
from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
|
@ -33,7 +33,7 @@ cdef class Pipe:
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name))
|
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name))
|
||||||
|
|
||||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
|
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||||
"""Apply the pipe to a stream of documents. This usually happens under
|
"""Apply the pipe to a stream of documents. This usually happens under
|
||||||
the hood when the nlp object is called on a text and all components are
|
the hood when the nlp object is called on a text and all components are
|
||||||
applied to the Doc.
|
applied to the Doc.
|
||||||
|
@ -52,7 +52,7 @@ cdef class Pipe:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_handler(self.name, self, [doc], e)
|
error_handler(self.name, self, [doc], e)
|
||||||
|
|
||||||
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
|
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language = None):
|
||||||
"""Initialize the pipe. For non-trainable components, this method
|
"""Initialize the pipe. For non-trainable components, this method
|
||||||
is optional. For trainable components, which should inherit
|
is optional. For trainable components, which should inherit
|
||||||
from the subclass TrainablePipe, the provided data examples
|
from the subclass TrainablePipe, the provided data examples
|
||||||
|
|
|
@ -7,11 +7,11 @@ from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..scorer import Scorer
|
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
from .senter import senter_score
|
from .senter import senter_score
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"sentencizer",
|
"sentencizer",
|
||||||
assigns=["token.is_sent_start", "doc.sents"],
|
assigns=["token.is_sent_start", "doc.sents"],
|
||||||
|
@ -34,7 +34,8 @@ class Sentencizer(Pipe):
|
||||||
DOCS: https://spacy.io/api/sentencizer
|
DOCS: https://spacy.io/api/sentencizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
|
default_punct_chars = [
|
||||||
|
'!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
|
||||||
'।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
|
'।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
|
||||||
'᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
|
'᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
|
||||||
'‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
|
'‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
|
||||||
|
@ -44,7 +45,8 @@ class Sentencizer(Pipe):
|
||||||
'𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
|
'𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
|
||||||
'𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
|
'𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
|
||||||
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
|
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
|
||||||
'。', '。']
|
'。', '。'
|
||||||
|
]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -127,7 +129,6 @@ class Sentencizer(Pipe):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef int idx = 0
|
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
|
@ -168,7 +169,6 @@ class Sentencizer(Pipe):
|
||||||
path = path.with_suffix(".json")
|
path = path.with_suffix(".json")
|
||||||
srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
|
srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
|
||||||
|
|
||||||
|
|
||||||
def from_disk(self, path, *, exclude=tuple()):
|
def from_disk(self, path, *, exclude=tuple()):
|
||||||
"""Load the sentencizer from disk.
|
"""Load the sentencizer from disk.
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,9 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
from typing import Callable, Dict, Iterable, List, Optional, Union
|
from typing import Callable, Iterable, Optional
|
||||||
|
|
||||||
import srsly
|
|
||||||
from thinc.api import Config, Model
|
from thinc.api import Config, Model
|
||||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||||
from thinc.types import Floats2d, Ints1d
|
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
|
|
|
@ -48,14 +48,14 @@ DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model
|
||||||
"threshold": 0.5,
|
"threshold": 0.5,
|
||||||
"model": DEFAULT_SPAN_FINDER_MODEL,
|
"model": DEFAULT_SPAN_FINDER_MODEL,
|
||||||
"spans_key": DEFAULT_SPANS_KEY,
|
"spans_key": DEFAULT_SPANS_KEY,
|
||||||
"max_length": None,
|
"max_length": 25,
|
||||||
"min_length": None,
|
"min_length": None,
|
||||||
"scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
|
"scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
|
||||||
},
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
f"span_finder_{DEFAULT_SPANS_KEY}_f": 1.0,
|
f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
|
||||||
f"span_finder_{DEFAULT_SPANS_KEY}_p": 0.0,
|
f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
|
||||||
f"span_finder_{DEFAULT_SPANS_KEY}_r": 0.0,
|
f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
def make_span_finder(
|
def make_span_finder(
|
||||||
|
@ -104,7 +104,7 @@ def make_span_finder_scorer():
|
||||||
|
|
||||||
def span_finder_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
def span_finder_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||||
kwargs = dict(kwargs)
|
kwargs = dict(kwargs)
|
||||||
attr_prefix = "span_finder_"
|
attr_prefix = "spans_"
|
||||||
key = kwargs["spans_key"]
|
key = kwargs["spans_key"]
|
||||||
kwargs.setdefault("attr", f"{attr_prefix}{key}")
|
kwargs.setdefault("attr", f"{attr_prefix}{key}")
|
||||||
kwargs.setdefault(
|
kwargs.setdefault(
|
||||||
|
|
|
@ -1,27 +1,20 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
import warnings
|
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
|
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
|
||||||
from thinc.api import Config, Model, set_dropout_rate
|
from thinc.api import Config, Model, set_dropout_rate
|
||||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||||
from thinc.types import Floats2d, Ints1d
|
from thinc.types import Floats2d, Ints1d
|
||||||
|
|
||||||
from ..morphology cimport Morphology
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..vocab cimport Vocab
|
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..attrs import ID, POS
|
from ..errors import Errors
|
||||||
from ..errors import Errors, Warnings
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..parts_of_speech import X
|
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..training import validate_examples, validate_get_examples
|
from ..training import validate_examples, validate_get_examples
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from .pipe import deserialize_config
|
|
||||||
from .trainable_pipe import TrainablePipe
|
from .trainable_pipe import TrainablePipe
|
||||||
|
|
||||||
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
||||||
|
@ -188,7 +181,6 @@ class Tagger(TrainablePipe):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef Vocab vocab = self.vocab
|
|
||||||
cdef bint overwrite = self.cfg["overwrite"]
|
cdef bint overwrite = self.cfg["overwrite"]
|
||||||
labels = self.labels
|
labels = self.labels
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
|
|
|
@ -103,7 +103,7 @@ cdef class TrainablePipe(Pipe):
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
|
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||||
"""Apply the pipe to a stream of documents. This usually happens under
|
"""Apply the pipe to a stream of documents. This usually happens under
|
||||||
the hood when the nlp object is called on a text and all components are
|
the hood when the nlp object is called on a text and all components are
|
||||||
applied to the Doc.
|
applied to the Doc.
|
||||||
|
@ -150,9 +150,9 @@ cdef class TrainablePipe(Pipe):
|
||||||
def update(self,
|
def update(self,
|
||||||
examples: Iterable["Example"],
|
examples: Iterable["Example"],
|
||||||
*,
|
*,
|
||||||
drop: float=0.0,
|
drop: float = 0.0,
|
||||||
sgd: Optimizer=None,
|
sgd: Optimizer = None,
|
||||||
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
|
losses: Optional[Dict[str, float]] = None) -> Dict[str, float]:
|
||||||
"""Learn from a batch of documents and gold-standard information,
|
"""Learn from a batch of documents and gold-standard information,
|
||||||
updating the pipe's model. Delegates to predict and get_loss.
|
updating the pipe's model. Delegates to predict and get_loss.
|
||||||
|
|
||||||
|
@ -186,8 +186,8 @@ cdef class TrainablePipe(Pipe):
|
||||||
def rehearse(self,
|
def rehearse(self,
|
||||||
examples: Iterable[Example],
|
examples: Iterable[Example],
|
||||||
*,
|
*,
|
||||||
sgd: Optimizer=None,
|
sgd: Optimizer = None,
|
||||||
losses: Dict[str, float]=None,
|
losses: Dict[str, float] = None,
|
||||||
**config) -> Dict[str, float]:
|
**config) -> Dict[str, float]:
|
||||||
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
|
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
|
||||||
teach the current model to make predictions similar to an initial model,
|
teach the current model to make predictions similar to an initial model,
|
||||||
|
@ -238,7 +238,7 @@ cdef class TrainablePipe(Pipe):
|
||||||
"""
|
"""
|
||||||
return util.create_default_optimizer()
|
return util.create_default_optimizer()
|
||||||
|
|
||||||
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
|
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language = None):
|
||||||
"""Initialize the pipe for training, using data examples if available.
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
This method needs to be implemented by each TrainablePipe component,
|
This method needs to be implemented by each TrainablePipe component,
|
||||||
ensuring the internal model (if available) is initialized properly
|
ensuring the internal model (if available) is initialized properly
|
||||||
|
|
|
@ -8,58 +8,35 @@ from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
|
|
||||||
from libc.stdlib cimport calloc, free
|
|
||||||
from libc.string cimport memcpy, memset
|
|
||||||
from libcpp.vector cimport vector
|
|
||||||
|
|
||||||
import contextlib
|
import contextlib
|
||||||
import random
|
import random
|
||||||
import warnings
|
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import numpy.random
|
import numpy.random
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import (
|
|
||||||
CupyOps,
|
from thinc.api import CupyOps, NumpyOps, set_dropout_rate
|
||||||
NumpyOps,
|
|
||||||
Optimizer,
|
|
||||||
chain,
|
|
||||||
get_array_module,
|
|
||||||
get_ops,
|
|
||||||
set_dropout_rate,
|
|
||||||
softmax_activation,
|
|
||||||
use_ops,
|
|
||||||
)
|
|
||||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
|
||||||
from thinc.types import Floats2d, Ints1d
|
from thinc.types import Floats2d, Ints1d
|
||||||
|
|
||||||
from ..ml.tb_framework import TransitionModelInputs
|
from ..ml.tb_framework import TransitionModelInputs
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ._parser_internals cimport _beam_utils
|
from ._parser_internals cimport _beam_utils
|
||||||
from ._parser_internals.search cimport Beam
|
|
||||||
from ._parser_internals.stateclass cimport StateC, StateClass
|
from ._parser_internals.stateclass cimport StateC, StateClass
|
||||||
from .trainable_pipe cimport TrainablePipe
|
from .trainable_pipe cimport TrainablePipe
|
||||||
|
|
||||||
from ._parser_internals import _beam_utils
|
|
||||||
|
|
||||||
from ..typedefs cimport weight_t
|
from ..typedefs cimport weight_t
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from ._parser_internals.transition_system cimport Transition, TransitionSystem
|
from ._parser_internals.transition_system cimport Transition, TransitionSystem
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors
|
||||||
from ..training import (
|
from ..training import (
|
||||||
validate_distillation_examples,
|
validate_distillation_examples,
|
||||||
validate_examples,
|
validate_examples,
|
||||||
validate_get_examples,
|
validate_get_examples,
|
||||||
)
|
)
|
||||||
|
from ._parser_internals import _beam_utils
|
||||||
|
|
||||||
# TODO: Remove when we switch to Cython 3.
|
|
||||||
cdef extern from "<algorithm>" namespace "std" nogil:
|
|
||||||
bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
|
|
||||||
|
|
||||||
|
|
||||||
NUMPY_OPS = NumpyOps()
|
NUMPY_OPS = NumpyOps()
|
||||||
|
|
||||||
|
@ -384,7 +361,6 @@ class Parser(TrainablePipe):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_handler(self.name, self, batch_in_order, e)
|
error_handler(self.name, self, batch_in_order, e)
|
||||||
|
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
|
@ -414,7 +390,6 @@ class Parser(TrainablePipe):
|
||||||
|
|
||||||
def set_annotations(self, docs, states_or_beams):
|
def set_annotations(self, docs, states_or_beams):
|
||||||
cdef StateClass state
|
cdef StateClass state
|
||||||
cdef Beam beam
|
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
states = _beam_utils.collect_states(states_or_beams, docs)
|
states = _beam_utils.collect_states(states_or_beams, docs)
|
||||||
for i, (state, doc) in enumerate(zip(states, docs)):
|
for i, (state, doc) in enumerate(zip(states, docs)):
|
||||||
|
@ -423,7 +398,6 @@ class Parser(TrainablePipe):
|
||||||
hook(doc)
|
hook(doc)
|
||||||
|
|
||||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||||
cdef StateClass state
|
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
losses.setdefault(self.name, 0.)
|
losses.setdefault(self.name, 0.)
|
||||||
|
|
|
@ -4,7 +4,6 @@ from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Uni
|
||||||
cimport cython
|
cimport cython
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
from libcpp.set cimport set
|
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
|
|
|
@ -52,7 +52,8 @@ TEST_PATTERNS = [
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]]
|
"pattern",
|
||||||
|
[[{"XX": "y"}], [{"LENGTH": "2"}], [{"TEXT": {"IN": 5}}], [{"text": {"in": 6}}]],
|
||||||
)
|
)
|
||||||
def test_matcher_pattern_validation(en_vocab, pattern):
|
def test_matcher_pattern_validation(en_vocab, pattern):
|
||||||
matcher = Matcher(en_vocab, validate=True)
|
matcher = Matcher(en_vocab, validate=True)
|
||||||
|
|
|
@ -11,6 +11,7 @@ def test_build_dependencies():
|
||||||
"flake8",
|
"flake8",
|
||||||
"hypothesis",
|
"hypothesis",
|
||||||
"pre-commit",
|
"pre-commit",
|
||||||
|
"cython-lint",
|
||||||
"black",
|
"black",
|
||||||
"isort",
|
"isort",
|
||||||
"mypy",
|
"mypy",
|
||||||
|
|
|
@ -230,10 +230,10 @@ def test_overfitting_IO():
|
||||||
|
|
||||||
# Test scoring
|
# Test scoring
|
||||||
scores = nlp.evaluate(train_examples)
|
scores = nlp.evaluate(train_examples)
|
||||||
assert f"span_finder_{SPANS_KEY}_f" in scores
|
assert f"spans_{SPANS_KEY}_f" in scores
|
||||||
# It's not perfect 1.0 F1 because it's designed to overgenerate for now.
|
# It's not perfect 1.0 F1 because it's designed to overgenerate for now.
|
||||||
assert scores[f"span_finder_{SPANS_KEY}_p"] == 0.75
|
assert scores[f"spans_{SPANS_KEY}_p"] == 0.75
|
||||||
assert scores[f"span_finder_{SPANS_KEY}_r"] == 1.0
|
assert scores[f"spans_{SPANS_KEY}_r"] == 1.0
|
||||||
|
|
||||||
# also test that the spancat works for just a single entity in a sentence
|
# also test that the spancat works for just a single entity in a sentence
|
||||||
doc = nlp("London")
|
doc = nlp("London")
|
||||||
|
|
|
@ -192,8 +192,7 @@ def test_tok2vec_listener(with_vectors):
|
||||||
for tag in t[1]["tags"]:
|
for tag in t[1]["tags"]:
|
||||||
tagger.add_label(tag)
|
tagger.add_label(tag)
|
||||||
|
|
||||||
# Check that the Tok2Vec component finds it listeners
|
# Check that the Tok2Vec component finds its listeners
|
||||||
assert tok2vec.listeners == []
|
|
||||||
optimizer = nlp.initialize(lambda: train_examples)
|
optimizer = nlp.initialize(lambda: train_examples)
|
||||||
assert tok2vec.listeners == [tagger_tok2vec]
|
assert tok2vec.listeners == [tagger_tok2vec]
|
||||||
|
|
||||||
|
@ -221,7 +220,6 @@ def test_tok2vec_listener_callback():
|
||||||
assert nlp.pipe_names == ["tok2vec", "tagger"]
|
assert nlp.pipe_names == ["tok2vec", "tagger"]
|
||||||
tagger = nlp.get_pipe("tagger")
|
tagger = nlp.get_pipe("tagger")
|
||||||
tok2vec = nlp.get_pipe("tok2vec")
|
tok2vec = nlp.get_pipe("tok2vec")
|
||||||
nlp._link_components()
|
|
||||||
docs = [nlp.make_doc("A random sentence")]
|
docs = [nlp.make_doc("A random sentence")]
|
||||||
tok2vec.model.initialize(X=docs)
|
tok2vec.model.initialize(X=docs)
|
||||||
gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
|
gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
|
||||||
|
@ -430,29 +428,46 @@ def test_replace_listeners_from_config():
|
||||||
nlp.to_disk(dir_path)
|
nlp.to_disk(dir_path)
|
||||||
base_model = str(dir_path)
|
base_model = str(dir_path)
|
||||||
new_config = {
|
new_config = {
|
||||||
"nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]},
|
"nlp": {
|
||||||
|
"lang": "en",
|
||||||
|
"pipeline": ["tok2vec", "tagger2", "ner3", "tagger4"],
|
||||||
|
},
|
||||||
"components": {
|
"components": {
|
||||||
"tok2vec": {"source": base_model},
|
"tok2vec": {"source": base_model},
|
||||||
"tagger": {
|
"tagger2": {
|
||||||
"source": base_model,
|
"source": base_model,
|
||||||
|
"component": "tagger",
|
||||||
"replace_listeners": ["model.tok2vec"],
|
"replace_listeners": ["model.tok2vec"],
|
||||||
},
|
},
|
||||||
"ner": {"source": base_model},
|
"ner3": {
|
||||||
|
"source": base_model,
|
||||||
|
"component": "ner",
|
||||||
|
},
|
||||||
|
"tagger4": {
|
||||||
|
"source": base_model,
|
||||||
|
"component": "tagger",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
new_nlp = util.load_model_from_config(new_config, auto_fill=True)
|
new_nlp = util.load_model_from_config(new_config, auto_fill=True)
|
||||||
new_nlp.initialize(lambda: examples)
|
new_nlp.initialize(lambda: examples)
|
||||||
tok2vec = new_nlp.get_pipe("tok2vec")
|
tok2vec = new_nlp.get_pipe("tok2vec")
|
||||||
tagger = new_nlp.get_pipe("tagger")
|
tagger = new_nlp.get_pipe("tagger2")
|
||||||
ner = new_nlp.get_pipe("ner")
|
ner = new_nlp.get_pipe("ner3")
|
||||||
assert tok2vec.listening_components == ["ner"]
|
assert "ner" not in new_nlp.pipe_names
|
||||||
|
assert "tagger" not in new_nlp.pipe_names
|
||||||
|
assert tok2vec.listening_components == ["ner3", "tagger4"]
|
||||||
assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk())
|
assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk())
|
||||||
assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk())
|
assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk())
|
||||||
t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"]
|
t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"]
|
||||||
assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
|
assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
|
||||||
assert new_nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg
|
assert new_nlp.config["components"]["tagger2"]["model"]["tok2vec"] == t2v_cfg
|
||||||
assert (
|
assert (
|
||||||
new_nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"]
|
new_nlp.config["components"]["ner3"]["model"]["tok2vec"]["@architectures"]
|
||||||
|
== "spacy.Tok2VecListener.v1"
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
new_nlp.config["components"]["tagger4"]["model"]["tok2vec"]["@architectures"]
|
||||||
== "spacy.Tok2VecListener.v1"
|
== "spacy.Tok2VecListener.v1"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -627,3 +642,57 @@ def test_tok2vec_distillation_teacher_annotations():
|
||||||
|
|
||||||
student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
|
student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
|
||||||
student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})
|
student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})
|
||||||
|
|
||||||
|
|
||||||
|
def test_tok2vec_listener_source_link_name():
|
||||||
|
"""The component's internal name and the tok2vec listener map correspond
|
||||||
|
to the most recently modified pipeline.
|
||||||
|
"""
|
||||||
|
orig_config = Config().from_str(cfg_string_multi)
|
||||||
|
nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
||||||
|
assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
|
||||||
|
|
||||||
|
nlp2 = English()
|
||||||
|
nlp2.add_pipe("tok2vec", source=nlp1)
|
||||||
|
nlp2.add_pipe("tagger", name="tagger2", source=nlp1)
|
||||||
|
|
||||||
|
# there is no way to have the component have the right name for both
|
||||||
|
# pipelines, right now the most recently modified pipeline is prioritized
|
||||||
|
assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2"
|
||||||
|
|
||||||
|
# there is no way to have the tok2vec have the right listener map for both
|
||||||
|
# pipelines, right now the most recently modified pipeline is prioritized
|
||||||
|
assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
|
||||||
|
nlp2.add_pipe("ner", name="ner3", source=nlp1)
|
||||||
|
assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]
|
||||||
|
nlp2.remove_pipe("ner3")
|
||||||
|
assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
|
||||||
|
nlp2.remove_pipe("tagger2")
|
||||||
|
assert nlp2.get_pipe("tok2vec").listening_components == []
|
||||||
|
|
||||||
|
# at this point the tok2vec component corresponds to nlp2
|
||||||
|
assert nlp1.get_pipe("tok2vec").listening_components == []
|
||||||
|
|
||||||
|
# modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1
|
||||||
|
nlp1.add_pipe("sentencizer")
|
||||||
|
assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
|
||||||
|
|
||||||
|
# modifying nlp2 syncs it back to nlp2
|
||||||
|
nlp2.add_pipe("sentencizer")
|
||||||
|
assert nlp1.get_pipe("tok2vec").listening_components == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_tok2vec_listener_source_replace_listeners():
|
||||||
|
orig_config = Config().from_str(cfg_string_multi)
|
||||||
|
nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
||||||
|
assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
|
||||||
|
nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
|
||||||
|
assert nlp1.get_pipe("tok2vec").listening_components == ["ner"]
|
||||||
|
|
||||||
|
nlp2 = English()
|
||||||
|
nlp2.add_pipe("tok2vec", source=nlp1)
|
||||||
|
assert nlp2.get_pipe("tok2vec").listening_components == []
|
||||||
|
nlp2.add_pipe("tagger", source=nlp1)
|
||||||
|
assert nlp2.get_pipe("tok2vec").listening_components == []
|
||||||
|
nlp2.add_pipe("ner", name="ner2", source=nlp1)
|
||||||
|
assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"]
|
|
@ -469,6 +469,55 @@ def test_config_overrides():
|
||||||
assert nlp.pipe_names == ["tok2vec", "tagger"]
|
assert nlp.pipe_names == ["tok2vec", "tagger"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore:\\[W036")
|
||||||
|
def test_config_overrides_registered_functions():
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
nlp.add_pipe("attribute_ruler")
|
||||||
|
with make_tempdir() as d:
|
||||||
|
nlp.to_disk(d)
|
||||||
|
nlp_re1 = spacy.load(
|
||||||
|
d,
|
||||||
|
config={
|
||||||
|
"components": {
|
||||||
|
"attribute_ruler": {
|
||||||
|
"scorer": {"@scorers": "spacy.tagger_scorer.v1"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
nlp_re1.config["components"]["attribute_ruler"]["scorer"]["@scorers"]
|
||||||
|
== "spacy.tagger_scorer.v1"
|
||||||
|
)
|
||||||
|
|
||||||
|
@registry.misc("test_some_other_key")
|
||||||
|
def misc_some_other_key():
|
||||||
|
return "some_other_key"
|
||||||
|
|
||||||
|
nlp_re2 = spacy.load(
|
||||||
|
d,
|
||||||
|
config={
|
||||||
|
"components": {
|
||||||
|
"attribute_ruler": {
|
||||||
|
"scorer": {
|
||||||
|
"@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
|
||||||
|
"spans_key": {"@misc": "test_some_other_key"},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert nlp_re2.config["components"]["attribute_ruler"]["scorer"][
|
||||||
|
"spans_key"
|
||||||
|
] == {"@misc": "test_some_other_key"}
|
||||||
|
# run dummy evaluation (will return None scores) in order to test that
|
||||||
|
# the spans_key value in the nested override is working as intended in
|
||||||
|
# the config
|
||||||
|
example = Example.from_dict(nlp_re2.make_doc("a b c"), {})
|
||||||
|
scores = nlp_re2.evaluate([example])
|
||||||
|
assert "spans_some_other_key_f" in scores
|
||||||
|
|
||||||
|
|
||||||
def test_config_interpolation():
|
def test_config_interpolation():
|
||||||
config = Config().from_str(nlp_config_string, interpolate=False)
|
config = Config().from_str(nlp_config_string, interpolate=False)
|
||||||
assert config["corpora"]["train"]["path"] == "${paths.train}"
|
assert config["corpora"]["train"]["path"] == "${paths.train}"
|
||||||
|
|
|
@ -697,7 +697,6 @@ def test_string_to_list_intify(value):
|
||||||
assert string_to_list(value, intify=True) == [1, 2, 3]
|
assert string_to_list(value, intify=True) == [1, 2, 3]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="Temporarily skip before models are published")
|
|
||||||
def test_download_compatibility():
|
def test_download_compatibility():
|
||||||
spec = SpecifierSet("==" + about.__version__)
|
spec = SpecifierSet("==" + about.__version__)
|
||||||
spec.prereleases = False
|
spec.prereleases = False
|
||||||
|
@ -708,7 +707,6 @@ def test_download_compatibility():
|
||||||
assert get_minor_version(about.__version__) == get_minor_version(version)
|
assert get_minor_version(about.__version__) == get_minor_version(version)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="Temporarily skip before models are published")
|
|
||||||
def test_validate_compatibility_table():
|
def test_validate_compatibility_table():
|
||||||
spec = SpecifierSet("==" + about.__version__)
|
spec = SpecifierSet("==" + about.__version__)
|
||||||
spec.prereleases = False
|
spec.prereleases = False
|
||||||
|
|
|
@ -377,3 +377,22 @@ def test_displacy_manual_sorted_entities():
|
||||||
|
|
||||||
html = displacy.render(doc, style="ent", manual=True)
|
html = displacy.render(doc, style="ent", manual=True)
|
||||||
assert html.find("FIRST") < html.find("SECOND")
|
assert html.find("FIRST") < html.find("SECOND")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(12816)
def test_issue12816(en_vocab) -> None:
    """Test that displaCy's span visualizer escapes annotated HTML tags correctly."""
    # Create a doc containing an annotated word and an unannotated HTML tag
    doc = Doc(en_vocab, words=["test", "<TEST>"])
    doc.spans["sc"] = [Span(doc, 0, 1, label="test")]

    # Verify that the HTML tag is escaped when unannotated. The assertion has
    # to look for the entity-encoded form: checking for the raw "<TEST>" would
    # only succeed if the token text had been emitted *without* escaping.
    html = displacy.render(doc, style="span")
    assert "&lt;TEST&gt;" in html

    # Annotate the HTML tag
    doc.spans["sc"].append(Span(doc, 1, 2, label="test"))

    # Verify that the HTML tag is still escaped once it is part of a span
    html = displacy.render(doc, style="span")
    assert "&lt;TEST&gt;" in html
|
||||||
|
|
|
@ -220,6 +220,10 @@ def test_minor_version(a1, a2, b1, b2, is_match):
|
||||||
{"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
|
{"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
|
||||||
{"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
|
{"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
{"attribute_ruler.scorer.@scorers": "spacy.tagger_scorer.v1"},
|
||||||
|
{"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
|
||||||
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_dot_to_dict(dot_notation, expected):
|
def test_dot_to_dict(dot_notation, expected):
|
||||||
|
@ -228,6 +232,29 @@ def test_dot_to_dict(dot_notation, expected):
|
||||||
assert util.dict_to_dot(result) == dot_notation
|
assert util.dict_to_dot(result) == dot_notation
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "dot_notation,expected",
    [
        (
            {"token.pos": True, "token._.xyz": True},
            {"token": {"pos": True, "_": {"xyz": True}}},
        ),
        (
            {"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
            {"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
        ),
        (
            {"attribute_ruler.scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
            {"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
        ),
    ],
)
def test_dot_to_dict_overrides(dot_notation, expected):
    """Round-trip override-style dot notation through its nested-dict form."""
    nested = util.dot_to_dict(dot_notation)
    assert nested == expected
    # Flattening with for_overrides=True has to give back exactly the input,
    # including the case where the value under a dotted key is itself a dict.
    flattened = util.dict_to_dot(nested, for_overrides=True)
    assert flattened == dot_notation
|
||||||
|
|
||||||
|
|
||||||
def test_set_dot_to_object():
|
def test_set_dot_to_object():
|
||||||
config = {"foo": {"bar": 1, "baz": {"x": "y"}}, "test": {"a": {"b": "c"}}}
|
config = {"foo": {"bar": 1, "baz": {"x": "y"}}, "test": {"a": {"b": "c"}}}
|
||||||
with pytest.raises(KeyError):
|
with pytest.raises(KeyError):
|
||||||
|
|
|
@ -401,6 +401,7 @@ def test_vectors_serialize():
|
||||||
row_r = v_r.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f"))
|
row_r = v_r.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f"))
|
||||||
assert row == row_r
|
assert row == row_r
|
||||||
assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
|
assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
|
||||||
|
assert v.attr == v_r.attr
|
||||||
|
|
||||||
|
|
||||||
def test_vector_is_oov():
|
def test_vector_is_oov():
|
||||||
|
@ -645,3 +646,32 @@ def test_equality():
|
||||||
vectors1.resize((5, 9))
|
vectors1.resize((5, 9))
|
||||||
vectors2.resize((5, 9))
|
vectors2.resize((5, 9))
|
||||||
assert vectors1 == vectors2
|
assert vectors1 == vectors2
|
||||||
|
|
||||||
|
|
||||||
|
def test_vectors_attr():
    """Check vector lookups for a default table and one built with attr="LOWER"."""
    table = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")

    # default ORTH: only the exact key "A" has a vector, "a" does not
    orth_nlp = English()
    orth_nlp.vocab.vectors = Vectors(data=table, keys=["A", "B", "C"])
    orth_vocab = orth_nlp.vocab
    assert orth_vocab.strings["A"] in orth_vocab.vectors.key2row
    assert orth_vocab.strings["a"] not in orth_vocab.vectors.key2row
    assert orth_vocab["A"].has_vector is True
    assert orth_vocab["a"].has_vector is False
    assert orth_nlp("A")[0].has_vector is True
    assert orth_nlp("a")[0].has_vector is False

    # custom LOWER: the table only stores lowercase keys, yet both casings
    # of a word report a vector
    lower_nlp = English()
    lower_nlp.vocab.vectors = Vectors(
        data=table, keys=["a", "b", "c"], attr="LOWER"
    )
    lower_vocab = lower_nlp.vocab
    assert lower_vocab.strings["A"] not in lower_vocab.vectors.key2row
    assert lower_vocab.strings["a"] in lower_vocab.vectors.key2row
    assert lower_vocab["A"].has_vector is True
    assert lower_vocab["a"].has_vector is True
    assert lower_nlp("A")[0].has_vector is True
    assert lower_nlp("a")[0].has_vector is True

    # add a new vectors entry: setting it for "D" makes "d" report one too
    assert lower_vocab["D"].has_vector is False
    assert lower_vocab["d"].has_vector is False
    lower_vocab.set_vector("D", numpy.asarray([4, 5, 6]))
    assert lower_vocab["D"].has_vector is True
    assert lower_vocab["d"].has_vector is True
|
||||||
|
|
|
@ -26,24 +26,57 @@ cdef class Tokenizer:
|
||||||
|
|
||||||
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
|
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
|
||||||
cdef int _apply_special_cases(self, Doc doc) except -1
|
cdef int _apply_special_cases(self, Doc doc) except -1
|
||||||
cdef void _filter_special_spans(self, vector[SpanC] &original,
|
cdef void _filter_special_spans(
|
||||||
vector[SpanC] &filtered, int doc_len) nogil
|
self,
|
||||||
cdef object _prepare_special_spans(self, Doc doc,
|
vector[SpanC] &original,
|
||||||
vector[SpanC] &filtered)
|
vector[SpanC] &filtered,
|
||||||
cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
|
int doc_len,
|
||||||
object span_data)
|
) nogil
|
||||||
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
|
cdef object _prepare_special_spans(
|
||||||
|
self,
|
||||||
|
Doc doc,
|
||||||
|
vector[SpanC] &filtered,
|
||||||
|
)
|
||||||
|
cdef int _retokenize_special_spans(
|
||||||
|
self,
|
||||||
|
Doc doc,
|
||||||
|
TokenC* tokens,
|
||||||
|
object span_data,
|
||||||
|
)
|
||||||
|
cdef int _try_specials_and_cache(
|
||||||
|
self,
|
||||||
|
hash_t key,
|
||||||
|
Doc tokens,
|
||||||
int* has_special,
|
int* has_special,
|
||||||
bint with_special_cases) except -1
|
bint with_special_cases,
|
||||||
cdef int _tokenize(self, Doc tokens, str span, hash_t key,
|
) except -1
|
||||||
int* has_special, bint with_special_cases) except -1
|
cdef int _tokenize(
|
||||||
cdef str _split_affixes(self, str string,
|
self,
|
||||||
|
Doc tokens,
|
||||||
|
str span,
|
||||||
|
hash_t key,
|
||||||
|
int* has_special,
|
||||||
|
bint with_special_cases,
|
||||||
|
) except -1
|
||||||
|
cdef str _split_affixes(
|
||||||
|
self,
|
||||||
|
str string,
|
||||||
vector[LexemeC*] *prefixes,
|
vector[LexemeC*] *prefixes,
|
||||||
vector[LexemeC*] *suffixes, int* has_special,
|
vector[LexemeC*] *suffixes, int* has_special,
|
||||||
bint with_special_cases)
|
bint with_special_cases,
|
||||||
cdef int _attach_tokens(self, Doc tokens, str string,
|
)
|
||||||
|
cdef int _attach_tokens(
|
||||||
|
self,
|
||||||
|
Doc tokens,
|
||||||
|
str string,
|
||||||
vector[LexemeC*] *prefixes,
|
vector[LexemeC*] *prefixes,
|
||||||
vector[LexemeC*] *suffixes, int* has_special,
|
vector[LexemeC*] *suffixes, int* has_special,
|
||||||
bint with_special_cases) except -1
|
bint with_special_cases,
|
||||||
cdef int _save_cached(self, const TokenC* tokens, hash_t key,
|
) except -1
|
||||||
int* has_special, int n) except -1
|
cdef int _save_cached(
|
||||||
|
self,
|
||||||
|
const TokenC* tokens,
|
||||||
|
hash_t key,
|
||||||
|
int* has_special,
|
||||||
|
int n,
|
||||||
|
) except -1
|
||||||
|
|
|
@ -323,7 +323,7 @@ cdef class Tokenizer:
|
||||||
cdef int span_start
|
cdef int span_start
|
||||||
cdef int span_end
|
cdef int span_end
|
||||||
while i < doc.length:
|
while i < doc.length:
|
||||||
if not i in span_data:
|
if i not in span_data:
|
||||||
tokens[i + offset] = doc.c[i]
|
tokens[i + offset] = doc.c[i]
|
||||||
i += 1
|
i += 1
|
||||||
else:
|
else:
|
||||||
|
@ -394,12 +394,14 @@ cdef class Tokenizer:
|
||||||
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
|
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
|
||||||
tokens.length - orig_size)
|
tokens.length - orig_size)
|
||||||
|
|
||||||
cdef str _split_affixes(self, str string,
|
cdef str _split_affixes(
|
||||||
|
self,
|
||||||
|
str string,
|
||||||
vector[const LexemeC*] *prefixes,
|
vector[const LexemeC*] *prefixes,
|
||||||
vector[const LexemeC*] *suffixes,
|
vector[const LexemeC*] *suffixes,
|
||||||
int* has_special,
|
int* has_special,
|
||||||
bint with_special_cases):
|
bint with_special_cases
|
||||||
cdef size_t i
|
):
|
||||||
cdef str prefix
|
cdef str prefix
|
||||||
cdef str suffix
|
cdef str suffix
|
||||||
cdef str minus_pre
|
cdef str minus_pre
|
||||||
|
@ -444,10 +446,6 @@ cdef class Tokenizer:
|
||||||
vector[const LexemeC*] *suffixes,
|
vector[const LexemeC*] *suffixes,
|
||||||
int* has_special,
|
int* has_special,
|
||||||
bint with_special_cases) except -1:
|
bint with_special_cases) except -1:
|
||||||
cdef bint specials_hit = 0
|
|
||||||
cdef bint cache_hit = 0
|
|
||||||
cdef int split, end
|
|
||||||
cdef const LexemeC* const* lexemes
|
|
||||||
cdef const LexemeC* lexeme
|
cdef const LexemeC* lexeme
|
||||||
cdef str span
|
cdef str span
|
||||||
cdef int i
|
cdef int i
|
||||||
|
@ -457,9 +455,11 @@ cdef class Tokenizer:
|
||||||
if string:
|
if string:
|
||||||
if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
|
if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
|
||||||
pass
|
pass
|
||||||
elif (self.token_match and self.token_match(string)) or \
|
elif (
|
||||||
(self.url_match and \
|
(self.token_match and self.token_match(string)) or
|
||||||
self.url_match(string)):
|
(self.url_match and self.url_match(string))
|
||||||
|
):
|
||||||
|
|
||||||
# We're always saying 'no' to spaces here -- the caller will
|
# We're always saying 'no' to spaces here -- the caller will
|
||||||
# fix up the outermost one, with reference to the original.
|
# fix up the outermost one, with reference to the original.
|
||||||
# See Issue #859
|
# See Issue #859
|
||||||
|
@ -820,7 +820,7 @@ cdef class Tokenizer:
|
||||||
self.infix_finditer = None
|
self.infix_finditer = None
|
||||||
self.token_match = None
|
self.token_match = None
|
||||||
self.url_match = None
|
self.url_match = None
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
if "prefix_search" in data and isinstance(data["prefix_search"], str):
|
if "prefix_search" in data and isinstance(data["prefix_search"], str):
|
||||||
self.prefix_search = re.compile(data["prefix_search"]).search
|
self.prefix_search = re.compile(data["prefix_search"]).search
|
||||||
if "suffix_search" in data and isinstance(data["suffix_search"], str):
|
if "suffix_search" in data and isinstance(data["suffix_search"], str):
|
||||||
|
|
|
@ -31,7 +31,7 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except
|
||||||
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
|
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
|
||||||
|
|
||||||
|
|
||||||
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
|
cdef int [:, :] _get_lca_matrix(Doc, int start, int end)
|
||||||
|
|
||||||
|
|
||||||
cdef class Doc:
|
cdef class Doc:
|
||||||
|
@ -61,7 +61,6 @@ cdef class Doc:
|
||||||
cdef int length
|
cdef int length
|
||||||
cdef int max_length
|
cdef int max_length
|
||||||
|
|
||||||
|
|
||||||
cdef public object noun_chunks_iterator
|
cdef public object noun_chunks_iterator
|
||||||
|
|
||||||
cdef object __weakref__
|
cdef object __weakref__
|
||||||
|
|
|
@ -35,6 +35,7 @@ from ..attrs cimport (
|
||||||
LENGTH,
|
LENGTH,
|
||||||
MORPH,
|
MORPH,
|
||||||
NORM,
|
NORM,
|
||||||
|
ORTH,
|
||||||
POS,
|
POS,
|
||||||
SENT_START,
|
SENT_START,
|
||||||
SPACY,
|
SPACY,
|
||||||
|
@ -42,14 +43,13 @@ from ..attrs cimport (
|
||||||
attr_id_t,
|
attr_id_t,
|
||||||
)
|
)
|
||||||
from ..lexeme cimport EMPTY_LEXEME, Lexeme
|
from ..lexeme cimport EMPTY_LEXEME, Lexeme
|
||||||
from ..typedefs cimport attr_t, flags_t
|
from ..typedefs cimport attr_t
|
||||||
from .token cimport Token
|
from .token cimport Token
|
||||||
|
|
||||||
from .. import parts_of_speech, schemas, util
|
from .. import parts_of_speech, schemas, util
|
||||||
from ..attrs import IDS, intify_attr
|
from ..attrs import IDS, intify_attr
|
||||||
from ..compat import copy_reg, pickle
|
from ..compat import copy_reg
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..morphology import Morphology
|
|
||||||
from ..util import get_words_and_spaces
|
from ..util import get_words_and_spaces
|
||||||
from .retokenizer import Retokenizer
|
from .retokenizer import Retokenizer
|
||||||
from .underscore import Underscore, get_ext_args
|
from .underscore import Underscore, get_ext_args
|
||||||
|
@ -613,13 +613,26 @@ cdef class Doc:
|
||||||
"""
|
"""
|
||||||
if "similarity" in self.user_hooks:
|
if "similarity" in self.user_hooks:
|
||||||
return self.user_hooks["similarity"](self, other)
|
return self.user_hooks["similarity"](self, other)
|
||||||
if isinstance(other, (Lexeme, Token)) and self.length == 1:
|
attr = getattr(self.vocab.vectors, "attr", ORTH)
|
||||||
if self.c[0].lex.orth == other.orth:
|
cdef Token this_token
|
||||||
|
cdef Token other_token
|
||||||
|
cdef Lexeme other_lex
|
||||||
|
if len(self) == 1 and isinstance(other, Token):
|
||||||
|
this_token = self[0]
|
||||||
|
other_token = other
|
||||||
|
if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
|
||||||
return 1.0
|
return 1.0
|
||||||
elif isinstance(other, (Span, Doc)) and len(self) == len(other):
|
elif len(self) == 1 and isinstance(other, Lexeme):
|
||||||
|
this_token = self[0]
|
||||||
|
other_lex = other
|
||||||
|
if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
|
||||||
|
return 1.0
|
||||||
|
elif isinstance(other, (Doc, Span)) and len(self) == len(other):
|
||||||
similar = True
|
similar = True
|
||||||
for i in range(self.length):
|
for i in range(len(self)):
|
||||||
if self[i].orth != other[i].orth:
|
this_token = self[i]
|
||||||
|
other_token = other[i]
|
||||||
|
if Token.get_struct_attr(this_token.c, attr) != Token.get_struct_attr(other_token.c, attr):
|
||||||
similar = False
|
similar = False
|
||||||
break
|
break
|
||||||
if similar:
|
if similar:
|
||||||
|
@ -767,7 +780,7 @@ cdef class Doc:
|
||||||
# TODO:
|
# TODO:
|
||||||
# 1. Test basic data-driven ORTH gazetteer
|
# 1. Test basic data-driven ORTH gazetteer
|
||||||
# 2. Test more nuanced date and currency regex
|
# 2. Test more nuanced date and currency regex
|
||||||
cdef attr_t entity_type, kb_id, ent_id
|
cdef attr_t kb_id, ent_id
|
||||||
cdef int ent_start, ent_end
|
cdef int ent_start, ent_end
|
||||||
ent_spans = []
|
ent_spans = []
|
||||||
for ent_info in ents:
|
for ent_info in ents:
|
||||||
|
@ -975,7 +988,6 @@ cdef class Doc:
|
||||||
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||||
"""
|
"""
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
cdef attr_id_t feature
|
|
||||||
cdef np.ndarray[attr_t, ndim=2] output
|
cdef np.ndarray[attr_t, ndim=2] output
|
||||||
# Handle scalar/list inputs of strings/ints for py_attr_ids
|
# Handle scalar/list inputs of strings/ints for py_attr_ids
|
||||||
# See also #3064
|
# See also #3064
|
||||||
|
@ -987,8 +999,10 @@ cdef class Doc:
|
||||||
py_attr_ids = [py_attr_ids]
|
py_attr_ids = [py_attr_ids]
|
||||||
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
||||||
try:
|
try:
|
||||||
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
|
py_attr_ids = [
|
||||||
for id_ in py_attr_ids]
|
(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
|
||||||
|
for id_ in py_attr_ids
|
||||||
|
]
|
||||||
except KeyError as msg:
|
except KeyError as msg:
|
||||||
keys = list(IDS.keys())
|
keys = list(IDS.keys())
|
||||||
raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
|
raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
|
||||||
|
@ -1022,8 +1036,6 @@ cdef class Doc:
|
||||||
DOCS: https://spacy.io/api/doc#count_by
|
DOCS: https://spacy.io/api/doc#count_by
|
||||||
"""
|
"""
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef attr_t attr
|
|
||||||
cdef size_t count
|
|
||||||
|
|
||||||
if counts is None:
|
if counts is None:
|
||||||
counts = Counter()
|
counts = Counter()
|
||||||
|
@ -1085,7 +1097,6 @@ cdef class Doc:
|
||||||
cdef int i, col
|
cdef int i, col
|
||||||
cdef int32_t abs_head_index
|
cdef int32_t abs_head_index
|
||||||
cdef attr_id_t attr_id
|
cdef attr_id_t attr_id
|
||||||
cdef TokenC* tokens = self.c
|
|
||||||
cdef int length = len(array)
|
cdef int length = len(array)
|
||||||
if length != len(self):
|
if length != len(self):
|
||||||
raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
|
raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
|
||||||
|
@ -1505,7 +1516,6 @@ cdef class Doc:
|
||||||
attributes are inherited from the syntactic root of the span.
|
attributes are inherited from the syntactic root of the span.
|
||||||
RETURNS (Token): The first newly merged token.
|
RETURNS (Token): The first newly merged token.
|
||||||
"""
|
"""
|
||||||
cdef str tag, lemma, ent_type
|
|
||||||
attr_len = len(attributes)
|
attr_len = len(attributes)
|
||||||
span_len = len(spans)
|
span_len = len(spans)
|
||||||
if not attr_len == span_len:
|
if not attr_len == span_len:
|
||||||
|
@ -1621,7 +1631,6 @@ cdef class Doc:
|
||||||
for token in char_span[1:]:
|
for token in char_span[1:]:
|
||||||
token.is_sent_start = False
|
token.is_sent_start = False
|
||||||
|
|
||||||
|
|
||||||
for span_group in doc_json.get("spans", {}):
|
for span_group in doc_json.get("spans", {}):
|
||||||
spans = []
|
spans = []
|
||||||
for span in doc_json["spans"][span_group]:
|
for span in doc_json["spans"][span_group]:
|
||||||
|
@ -1773,7 +1782,6 @@ cdef class Doc:
|
||||||
output.fill(255)
|
output.fill(255)
|
||||||
cdef int i, j, start_idx, end_idx
|
cdef int i, j, start_idx, end_idx
|
||||||
cdef bytes byte_string
|
cdef bytes byte_string
|
||||||
cdef unsigned char utf8_char
|
|
||||||
for i, byte_string in enumerate(byte_strings):
|
for i, byte_string in enumerate(byte_strings):
|
||||||
j = 0
|
j = 0
|
||||||
start_idx = 0
|
start_idx = 0
|
||||||
|
@ -1826,8 +1834,6 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
|
||||||
|
|
||||||
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
|
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
|
||||||
# note: end is exclusive
|
# note: end is exclusive
|
||||||
cdef TokenC* head
|
|
||||||
cdef TokenC* child
|
|
||||||
cdef int i
|
cdef int i
|
||||||
# Set number of left/right children to 0. We'll increment it in the loops.
|
# Set number of left/right children to 0. We'll increment it in the loops.
|
||||||
for i in range(start, end):
|
for i in range(start, end):
|
||||||
|
@ -1927,7 +1933,7 @@ cdef int _get_tokens_lca(Token token_j, Token token_k):
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
|
||||||
cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
|
cdef int [:, :] _get_lca_matrix(Doc doc, int start, int end):
|
||||||
"""Given a doc and a start and end position defining a set of contiguous
|
"""Given a doc and a start and end position defining a set of contiguous
|
||||||
tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
|
tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
|
||||||
LCA[i, j] is the index of the lowest common ancestor among token i and j.
|
LCA[i, j] is the index of the lowest common ancestor among token i and j.
|
||||||
|
@ -1940,7 +1946,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
|
||||||
RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
|
RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
|
||||||
with shape (n, n), where n = len(doc).
|
with shape (n, n), where n = len(doc).
|
||||||
"""
|
"""
|
||||||
cdef int [:,:] lca_matrix
|
cdef int [:, :] lca_matrix
|
||||||
cdef int j, k
|
cdef int j, k
|
||||||
n_tokens= end - start
|
n_tokens= end - start
|
||||||
lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
|
lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
|
||||||
|
|
|
@ -3,7 +3,7 @@ from typing import Generator, List, Tuple
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
from cython.operator cimport dereference
|
from cython.operator cimport dereference
|
||||||
from libc.stdint cimport int32_t, int64_t
|
from libc.stdint cimport int32_t
|
||||||
from libcpp.pair cimport pair
|
from libcpp.pair cimport pair
|
||||||
from libcpp.unordered_map cimport unordered_map
|
from libcpp.unordered_map cimport unordered_map
|
||||||
from libcpp.unordered_set cimport unordered_set
|
from libcpp.unordered_set cimport unordered_set
|
||||||
|
@ -11,7 +11,6 @@ from libcpp.unordered_set cimport unordered_set
|
||||||
import weakref
|
import weakref
|
||||||
|
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
from preshed.maps cimport map_get_unless_missing
|
|
||||||
|
|
||||||
from .. import Errors
|
from .. import Errors
|
||||||
|
|
||||||
|
@ -370,7 +369,9 @@ cdef class Graph:
|
||||||
>>> assert graph.has_node((0,))
|
>>> assert graph.has_node((0,))
|
||||||
>>> assert graph.has_edge((0,), (1,3), label="agent")
|
>>> assert graph.has_edge((0,), (1,3), label="agent")
|
||||||
"""
|
"""
|
||||||
def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None):
|
def __init__(
|
||||||
|
self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None # no-cython-lint
|
||||||
|
):
|
||||||
"""Create a Graph object.
|
"""Create a Graph object.
|
||||||
|
|
||||||
doc (Doc): The Doc object the graph will refer to.
|
doc (Doc): The Doc object the graph will refer to.
|
||||||
|
@ -441,8 +442,6 @@ cdef class Graph:
|
||||||
be returned, and no new edge will be created. The weight of the edge
|
be returned, and no new edge will be created. The weight of the edge
|
||||||
will be updated if a weight is specified.
|
will be updated if a weight is specified.
|
||||||
"""
|
"""
|
||||||
label_hash = self.doc.vocab.strings.as_int(label)
|
|
||||||
weight_float = weight if weight is not None else 0.0
|
|
||||||
edge_index = add_edge(
|
edge_index = add_edge(
|
||||||
&self.c,
|
&self.c,
|
||||||
EdgeC(
|
EdgeC(
|
||||||
|
|
|
@ -94,4 +94,3 @@ cdef class MorphAnalysis:
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return self.to_json()
|
return self.to_json()
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# cython: infer_types=True, bounds_check=False, profile=True
|
# cython: infer_types=True, bounds_check=False, profile=True
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from libc.stdlib cimport free, malloc
|
from libc.string cimport memset
|
||||||
from libc.string cimport memcpy, memset
|
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from thinc.api import get_array_module
|
from thinc.api import get_array_module
|
||||||
|
@ -10,7 +9,7 @@ from ..attrs cimport MORPH, NORM
|
||||||
from ..lexeme cimport EMPTY_LEXEME, Lexeme
|
from ..lexeme cimport EMPTY_LEXEME, Lexeme
|
||||||
from ..structs cimport LexemeC, TokenC
|
from ..structs cimport LexemeC, TokenC
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start
|
from .doc cimport Doc, set_children_from_heads, token_by_start
|
||||||
from .span cimport Span
|
from .span cimport Span
|
||||||
from .token cimport Token
|
from .token cimport Token
|
||||||
|
|
||||||
|
@ -148,7 +147,7 @@ def _merge(Doc doc, merges):
|
||||||
syntactic root of the span.
|
syntactic root of the span.
|
||||||
RETURNS (Token): The first newly merged token.
|
RETURNS (Token): The first newly merged token.
|
||||||
"""
|
"""
|
||||||
cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index
|
cdef int i, merge_index, start, token_index, current_span_index, current_offset, offset, span_index
|
||||||
cdef Span span
|
cdef Span span
|
||||||
cdef const LexemeC* lex
|
cdef const LexemeC* lex
|
||||||
cdef TokenC* token
|
cdef TokenC* token
|
||||||
|
@ -166,7 +165,6 @@ def _merge(Doc doc, merges):
|
||||||
merges.sort(key=_get_start)
|
merges.sort(key=_get_start)
|
||||||
for merge_index, (span, attributes) in enumerate(merges):
|
for merge_index, (span, attributes) in enumerate(merges):
|
||||||
start = span.start
|
start = span.start
|
||||||
end = span.end
|
|
||||||
spans.append(span)
|
spans.append(span)
|
||||||
# House the new merged token where it starts
|
# House the new merged token where it starts
|
||||||
token = &doc.c[start]
|
token = &doc.c[start]
|
||||||
|
@ -204,8 +202,9 @@ def _merge(Doc doc, merges):
|
||||||
# for the merged region. To do this, we create a boolean array indicating
|
# for the merged region. To do this, we create a boolean array indicating
|
||||||
# whether the row is to be deleted, then use numpy.delete
|
# whether the row is to be deleted, then use numpy.delete
|
||||||
if doc.tensor is not None and doc.tensor.size != 0:
|
if doc.tensor is not None and doc.tensor.size != 0:
|
||||||
doc.tensor = _resize_tensor(doc.tensor,
|
doc.tensor = _resize_tensor(
|
||||||
[(m[0].start, m[0].end) for m in merges])
|
doc.tensor, [(m[0].start, m[0].end) for m in merges]
|
||||||
|
)
|
||||||
# Memorize span roots and sets dependencies of the newly merged
|
# Memorize span roots and sets dependencies of the newly merged
|
||||||
# tokens to the dependencies of their roots.
|
# tokens to the dependencies of their roots.
|
||||||
span_roots = []
|
span_roots = []
|
||||||
|
@ -346,7 +345,11 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
||||||
if to_process_tensor:
|
if to_process_tensor:
|
||||||
xp = get_array_module(doc.tensor)
|
xp = get_array_module(doc.tensor)
|
||||||
if xp is numpy:
|
if xp is numpy:
|
||||||
doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
|
doc.tensor = xp.append(
|
||||||
|
doc.tensor,
|
||||||
|
xp.zeros((nb_subtokens, doc.tensor.shape[1]), dtype="float32"),
|
||||||
|
axis=0
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
|
shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
|
||||||
resized_array = xp.zeros(shape, dtype="float32")
|
resized_array = xp.zeros(shape, dtype="float32")
|
||||||
|
@ -368,7 +371,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
||||||
token.norm = 0 # reset norm
|
token.norm = 0 # reset norm
|
||||||
if to_process_tensor:
|
if to_process_tensor:
|
||||||
# setting the tensors of the split tokens to array of zeros
|
# setting the tensors of the split tokens to array of zeros
|
||||||
doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
|
doc.tensor[token_index + i:token_index + i + 1] = \
|
||||||
|
xp.zeros((1, doc.tensor.shape[1]), dtype="float32")
|
||||||
# Update the character offset of the subtokens
|
# Update the character offset of the subtokens
|
||||||
if i != 0:
|
if i != 0:
|
||||||
token.idx = orig_token.idx + idx_offset
|
token.idx = orig_token.idx + idx_offset
|
||||||
|
@ -456,7 +460,6 @@ def normalize_token_attrs(Vocab vocab, attrs):
|
||||||
def set_token_attrs(Token py_token, attrs):
|
def set_token_attrs(Token py_token, attrs):
|
||||||
cdef TokenC* token = py_token.c
|
cdef TokenC* token = py_token.c
|
||||||
cdef const LexemeC* lex = token.lex
|
cdef const LexemeC* lex = token.lex
|
||||||
cdef Doc doc = py_token.doc
|
|
||||||
# Assign attributes
|
# Assign attributes
|
||||||
for attr_name, attr_value in attrs.items():
|
for attr_name, attr_value in attrs.items():
|
||||||
if attr_name == "_": # Set extension attributes
|
if attr_name == "_": # Set extension attributes
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
from libc.math cimport sqrt
|
|
||||||
from libcpp.memory cimport make_shared
|
from libcpp.memory cimport make_shared
|
||||||
|
|
||||||
import copy
|
import copy
|
||||||
|
@ -9,13 +8,13 @@ import numpy
|
||||||
from thinc.api import get_array_module
|
from thinc.api import get_array_module
|
||||||
|
|
||||||
from ..attrs cimport *
|
from ..attrs cimport *
|
||||||
from ..attrs cimport attr_id_t
|
from ..attrs cimport ORTH, attr_id_t
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from ..parts_of_speech cimport univ_pos_t
|
from ..structs cimport TokenC
|
||||||
from ..structs cimport LexemeC, TokenC
|
|
||||||
from ..symbols cimport dep
|
from ..symbols cimport dep
|
||||||
from ..typedefs cimport attr_t, flags_t, hash_t
|
from ..typedefs cimport attr_t
|
||||||
from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
|
from .doc cimport _get_lca_matrix, get_token_attr
|
||||||
|
from .token cimport Token
|
||||||
|
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..util import normalize_slice
|
from ..util import normalize_slice
|
||||||
|
@ -371,13 +370,26 @@ cdef class Span:
|
||||||
"""
|
"""
|
||||||
if "similarity" in self.doc.user_span_hooks:
|
if "similarity" in self.doc.user_span_hooks:
|
||||||
return self.doc.user_span_hooks["similarity"](self, other)
|
return self.doc.user_span_hooks["similarity"](self, other)
|
||||||
if len(self) == 1 and hasattr(other, "orth"):
|
attr = getattr(self.doc.vocab.vectors, "attr", ORTH)
|
||||||
if self[0].orth == other.orth:
|
cdef Token this_token
|
||||||
|
cdef Token other_token
|
||||||
|
cdef Lexeme other_lex
|
||||||
|
if len(self) == 1 and isinstance(other, Token):
|
||||||
|
this_token = self[0]
|
||||||
|
other_token = other
|
||||||
|
if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
|
||||||
|
return 1.0
|
||||||
|
elif len(self) == 1 and isinstance(other, Lexeme):
|
||||||
|
this_token = self[0]
|
||||||
|
other_lex = other
|
||||||
|
if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
|
||||||
return 1.0
|
return 1.0
|
||||||
elif isinstance(other, (Doc, Span)) and len(self) == len(other):
|
elif isinstance(other, (Doc, Span)) and len(self) == len(other):
|
||||||
similar = True
|
similar = True
|
||||||
for i in range(len(self)):
|
for i in range(len(self)):
|
||||||
if self[i].orth != getattr(other[i], "orth", None):
|
this_token = self[i]
|
||||||
|
other_token = other[i]
|
||||||
|
if Token.get_struct_attr(this_token.c, attr) != Token.get_struct_attr(other_token.c, attr):
|
||||||
similar = False
|
similar = False
|
||||||
break
|
break
|
||||||
if similar:
|
if similar:
|
||||||
|
@ -607,7 +619,6 @@ cdef class Span:
|
||||||
"""
|
"""
|
||||||
return "".join([t.text_with_ws for t in self])
|
return "".join([t.text_with_ws for t in self])
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def noun_chunks(self):
|
def noun_chunks(self):
|
||||||
"""Iterate over the base noun phrases in the span. Yields base
|
"""Iterate over the base noun phrases in the span. Yields base
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import struct
|
import struct
|
||||||
import weakref
|
import weakref
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
|
from typing import Iterable, Optional, Union
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ cdef class SpanGroup:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/spangroup
|
DOCS: https://spacy.io/api/spangroup
|
||||||
"""
|
"""
|
||||||
def __init__(self, doc, *, name="", attrs={}, spans=[]):
|
def __init__(self, doc, *, name="", attrs={}, spans=[]): # no-cython-lint
|
||||||
"""Create a SpanGroup.
|
"""Create a SpanGroup.
|
||||||
|
|
||||||
doc (Doc): The reference Doc object.
|
doc (Doc): The reference Doc object.
|
||||||
|
@ -315,7 +315,7 @@ cdef class SpanGroup:
|
||||||
|
|
||||||
other_attrs = deepcopy(other_group.attrs)
|
other_attrs = deepcopy(other_group.attrs)
|
||||||
span_group.attrs.update({
|
span_group.attrs.update({
|
||||||
key: value for key, value in other_attrs.items() \
|
key: value for key, value in other_attrs.items()
|
||||||
if key not in span_group.attrs
|
if key not in span_group.attrs
|
||||||
})
|
})
|
||||||
if len(other_group):
|
if len(other_group):
|
||||||
|
|
|
@ -26,7 +26,7 @@ cdef class Token:
|
||||||
cdef Token self = Token.__new__(Token, vocab, doc, offset)
|
cdef Token self = Token.__new__(Token, vocab, doc, offset)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
|
# cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
|
||||||
# cdef TokenC token
|
# cdef TokenC token
|
||||||
# attrs = normalize_attrs(attrs)
|
# attrs = normalize_attrs(attrs)
|
||||||
|
|
||||||
|
@ -98,12 +98,10 @@ cdef class Token:
|
||||||
elif feat_name == SENT_START:
|
elif feat_name == SENT_START:
|
||||||
token.sent_start = value
|
token.sent_start = value
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline int missing_dep(const TokenC* token) nogil:
|
cdef inline int missing_dep(const TokenC* token) nogil:
|
||||||
return token.dep == MISSING_DEP
|
return token.dep == MISSING_DEP
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline int missing_head(const TokenC* token) nogil:
|
cdef inline int missing_head(const TokenC* token) nogil:
|
||||||
return Token.missing_dep(token)
|
return Token.missing_dep(token)
|
||||||
|
|
|
@ -1,13 +1,11 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
# Compiler crashes on memory view coercion without this. Should report bug.
|
# Compiler crashes on memory view coercion without this. Should report bug.
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
from cython.view cimport array as cvarray
|
|
||||||
|
|
||||||
np.import_array()
|
np.import_array()
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
import numpy
|
|
||||||
from thinc.api import get_array_module
|
from thinc.api import get_array_module
|
||||||
|
|
||||||
from ..attrs cimport (
|
from ..attrs cimport (
|
||||||
|
@ -28,6 +26,7 @@ from ..attrs cimport (
|
||||||
LIKE_EMAIL,
|
LIKE_EMAIL,
|
||||||
LIKE_NUM,
|
LIKE_NUM,
|
||||||
LIKE_URL,
|
LIKE_URL,
|
||||||
|
ORTH,
|
||||||
)
|
)
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from ..symbols cimport conj
|
from ..symbols cimport conj
|
||||||
|
@ -216,11 +215,17 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
if "similarity" in self.doc.user_token_hooks:
|
if "similarity" in self.doc.user_token_hooks:
|
||||||
return self.doc.user_token_hooks["similarity"](self, other)
|
return self.doc.user_token_hooks["similarity"](self, other)
|
||||||
if hasattr(other, "__len__") and len(other) == 1 and hasattr(other, "__getitem__"):
|
attr = getattr(self.doc.vocab.vectors, "attr", ORTH)
|
||||||
if self.c.lex.orth == getattr(other[0], "orth", None):
|
cdef Token this_token = self
|
||||||
|
cdef Token other_token
|
||||||
|
cdef Lexeme other_lex
|
||||||
|
if isinstance(other, Token):
|
||||||
|
other_token = other
|
||||||
|
if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
|
||||||
return 1.0
|
return 1.0
|
||||||
elif hasattr(other, "orth"):
|
elif isinstance(other, Lexeme):
|
||||||
if self.c.lex.orth == other.orth:
|
other_lex = other
|
||||||
|
if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
|
||||||
return 1.0
|
return 1.0
|
||||||
if self.vocab.vectors.n_keys == 0:
|
if self.vocab.vectors.n_keys == 0:
|
||||||
warnings.warn(Warnings.W007.format(obj="Token"))
|
warnings.warn(Warnings.W007.format(obj="Token"))
|
||||||
|
@ -421,7 +426,7 @@ cdef class Token:
|
||||||
if "vector" in self.doc.user_token_hooks:
|
if "vector" in self.doc.user_token_hooks:
|
||||||
return self.doc.user_token_hooks["vector"](self)
|
return self.doc.user_token_hooks["vector"](self)
|
||||||
else:
|
else:
|
||||||
return self.vocab.get_vector(self.c.lex.orth)
|
return self.vocab.get_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vector_norm(self):
|
def vector_norm(self):
|
||||||
|
@ -528,9 +533,9 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if self.i + 1 == len(self.doc):
|
if self.i + 1 == len(self.doc):
|
||||||
return True
|
return True
|
||||||
elif self.doc[self.i+1].is_sent_start == None:
|
elif self.doc[self.i+1].is_sent_start is None:
|
||||||
return None
|
return None
|
||||||
elif self.doc[self.i+1].is_sent_start == True:
|
elif self.doc[self.i+1].is_sent_start is True:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -37,10 +37,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
|
||||||
b2a.append(set())
|
b2a.append(set())
|
||||||
# Process the alignment at the current position
|
# Process the alignment at the current position
|
||||||
if A[token_idx_a] == B[token_idx_b] and \
|
if A[token_idx_a] == B[token_idx_b] and \
|
||||||
(char_idx_a == 0 or \
|
(
|
||||||
char_to_token_a[char_idx_a - 1] < token_idx_a) and \
|
char_idx_a == 0 or
|
||||||
(char_idx_b == 0 or \
|
char_to_token_a[char_idx_a - 1] < token_idx_a
|
||||||
char_to_token_b[char_idx_b - 1] < token_idx_b):
|
) and \
|
||||||
|
(
|
||||||
|
char_idx_b == 0 or
|
||||||
|
char_to_token_b[char_idx_b - 1] < token_idx_b
|
||||||
|
):
|
||||||
# Current tokens are identical and both character offsets are the
|
# Current tokens are identical and both character offsets are the
|
||||||
# start of a token (either at the beginning of the document or the
|
# start of a token (either at the beginning of the document or the
|
||||||
# previous character belongs to a different token)
|
# previous character belongs to a different token)
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import warnings
|
|
||||||
from collections.abc import Iterable as IterableInstance
|
from collections.abc import Iterable as IterableInstance
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -168,7 +167,6 @@ cdef class Example:
|
||||||
self._y_sig = y_sig
|
self._y_sig = y_sig
|
||||||
return self._cached_alignment
|
return self._cached_alignment
|
||||||
|
|
||||||
|
|
||||||
def _get_aligned_vectorized(self, align, gold_values):
|
def _get_aligned_vectorized(self, align, gold_values):
|
||||||
# Fast path for Doc attributes/fields that are predominantly a single value,
|
# Fast path for Doc attributes/fields that are predominantly a single value,
|
||||||
# i.e., TAG, POS, MORPH.
|
# i.e., TAG, POS, MORPH.
|
||||||
|
@ -211,7 +209,6 @@ cdef class Example:
|
||||||
|
|
||||||
return output.tolist()
|
return output.tolist()
|
||||||
|
|
||||||
|
|
||||||
def _get_aligned_non_vectorized(self, align, gold_values):
|
def _get_aligned_non_vectorized(self, align, gold_values):
|
||||||
# Slower path for fields that return multiple values (resulting
|
# Slower path for fields that return multiple values (resulting
|
||||||
# in ragged arrays that cannot be vectorized trivially).
|
# in ragged arrays that cannot be vectorized trivially).
|
||||||
|
@ -228,7 +225,6 @@ cdef class Example:
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
def get_aligned(self, field, as_string=False):
|
def get_aligned(self, field, as_string=False):
|
||||||
"""Return an aligned array for a token attribute."""
|
"""Return an aligned array for a token attribute."""
|
||||||
align = self.alignment.x2y
|
align = self.alignment.x2y
|
||||||
|
@ -337,7 +333,7 @@ cdef class Example:
|
||||||
missing=None
|
missing=None
|
||||||
)
|
)
|
||||||
# Now fill the tokens we can align to O.
|
# Now fill the tokens we can align to O.
|
||||||
O = 2 # I=1, O=2, B=3
|
O = 2 # I=1, O=2, B=3 # no-cython-lint: E741
|
||||||
for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
|
for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
|
||||||
if x_tags[i] is None:
|
if x_tags[i] is None:
|
||||||
if ent_iob == O:
|
if ent_iob == O:
|
||||||
|
@ -347,7 +343,7 @@ cdef class Example:
|
||||||
return x_ents, x_tags
|
return x_ents, x_tags
|
||||||
|
|
||||||
def get_aligned_ner(self):
|
def get_aligned_ner(self):
|
||||||
x_ents, x_tags = self.get_aligned_ents_and_ner()
|
_x_ents, x_tags = self.get_aligned_ents_and_ner()
|
||||||
return x_tags
|
return x_tags
|
||||||
|
|
||||||
def get_matching_ents(self, check_label=True):
|
def get_matching_ents(self, check_label=True):
|
||||||
|
@ -405,7 +401,6 @@ cdef class Example:
|
||||||
|
|
||||||
return span_dict
|
return span_dict
|
||||||
|
|
||||||
|
|
||||||
def _links_to_dict(self):
|
def _links_to_dict(self):
|
||||||
links = {}
|
links = {}
|
||||||
for ent in self.reference.ents:
|
for ent in self.reference.ents:
|
||||||
|
@ -596,6 +591,7 @@ def _fix_legacy_dict_data(example_dict):
|
||||||
"doc_annotation": doc_dict
|
"doc_annotation": doc_dict
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _has_field(annot, field):
|
def _has_field(annot, field):
|
||||||
if field not in annot:
|
if field not in annot:
|
||||||
return False
|
return False
|
||||||
|
@ -632,6 +628,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
|
||||||
ent_types.append("")
|
ent_types.append("")
|
||||||
return ent_iobs, ent_types
|
return ent_iobs, ent_types
|
||||||
|
|
||||||
|
|
||||||
def _parse_links(vocab, words, spaces, links):
|
def _parse_links(vocab, words, spaces, links):
|
||||||
reference = Doc(vocab, words=words, spaces=spaces)
|
reference = Doc(vocab, words=words, spaces=spaces)
|
||||||
starts = {token.idx: token.i for token in reference}
|
starts = {token.idx: token.i for token in reference}
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import json
|
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
|
@ -6,7 +5,7 @@ import srsly
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..errors import Warnings
|
from ..errors import Warnings
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from .iob_utils import offsets_to_biluo_tags, tags_to_entities
|
from .iob_utils import offsets_to_biluo_tags
|
||||||
|
|
||||||
|
|
||||||
def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
||||||
|
@ -23,7 +22,13 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
||||||
json_doc = {"id": doc_id, "paragraphs": []}
|
json_doc = {"id": doc_id, "paragraphs": []}
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
raw = None if doc.has_unknown_spaces else doc.text
|
raw = None if doc.has_unknown_spaces else doc.text
|
||||||
json_para = {'raw': raw, "sentences": [], "cats": [], "entities": [], "links": []}
|
json_para = {
|
||||||
|
'raw': raw,
|
||||||
|
"sentences": [],
|
||||||
|
"cats": [],
|
||||||
|
"entities": [],
|
||||||
|
"links": []
|
||||||
|
}
|
||||||
for cat, val in doc.cats.items():
|
for cat, val in doc.cats.items():
|
||||||
json_cat = {"label": cat, "value": val}
|
json_cat = {"label": cat, "value": val}
|
||||||
json_para["cats"].append(json_cat)
|
json_para["cats"].append(json_cat)
|
||||||
|
@ -35,13 +40,17 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
||||||
if ent.kb_id_:
|
if ent.kb_id_:
|
||||||
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
|
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
|
||||||
json_para["links"].append(link_dict)
|
json_para["links"].append(link_dict)
|
||||||
biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
|
biluo_tags = offsets_to_biluo_tags(
|
||||||
|
doc, json_para["entities"], missing=ner_missing_tag
|
||||||
|
)
|
||||||
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
|
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
|
||||||
include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
|
include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
|
||||||
for j, sent in enumerate(doc.sents):
|
for j, sent in enumerate(doc.sents):
|
||||||
json_sent = {"tokens": [], "brackets": []}
|
json_sent = {"tokens": [], "brackets": []}
|
||||||
for token in sent:
|
for token in sent:
|
||||||
json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
|
json_token = {
|
||||||
|
"id": token.i, "orth": token.text, "space": token.whitespace_
|
||||||
|
}
|
||||||
if include_annotation["TAG"]:
|
if include_annotation["TAG"]:
|
||||||
json_token["tag"] = token.tag_
|
json_token["tag"] = token.tag_
|
||||||
if include_annotation["POS"]:
|
if include_annotation["POS"]:
|
||||||
|
@ -125,9 +134,14 @@ def json_to_annotations(doc):
|
||||||
else:
|
else:
|
||||||
sent_starts.append(-1)
|
sent_starts.append(-1)
|
||||||
if "brackets" in sent:
|
if "brackets" in sent:
|
||||||
brackets.extend((b["first"] + sent_start_i,
|
brackets.extend(
|
||||||
b["last"] + sent_start_i, b["label"])
|
(
|
||||||
for b in sent["brackets"])
|
b["first"] + sent_start_i,
|
||||||
|
b["last"] + sent_start_i,
|
||||||
|
b["label"]
|
||||||
|
)
|
||||||
|
for b in sent["brackets"]
|
||||||
|
)
|
||||||
|
|
||||||
example["token_annotation"] = dict(
|
example["token_annotation"] = dict(
|
||||||
ids=ids,
|
ids=ids,
|
||||||
|
@ -160,6 +174,7 @@ def json_to_annotations(doc):
|
||||||
)
|
)
|
||||||
yield example
|
yield example
|
||||||
|
|
||||||
|
|
||||||
def json_iterate(bytes utf8_str):
|
def json_iterate(bytes utf8_str):
|
||||||
# We should've made these files jsonl...But since we didn't, parse out
|
# We should've made these files jsonl...But since we didn't, parse out
|
||||||
# the docs one-by-one to reduce memory usage.
|
# the docs one-by-one to reduce memory usage.
|
||||||
|
|
|
@ -71,7 +71,8 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
|
||||||
with nlp.select_pipes(enable=resume_components):
|
with nlp.select_pipes(enable=resume_components):
|
||||||
logger.info("Resuming training for: %s", resume_components)
|
logger.info("Resuming training for: %s", resume_components)
|
||||||
nlp.resume_training(sgd=optimizer)
|
nlp.resume_training(sgd=optimizer)
|
||||||
# Make sure that listeners are defined before initializing further
|
# Make sure that internal component names are synced and listeners are
|
||||||
|
# defined before initializing further
|
||||||
nlp._link_components()
|
nlp._link_components()
|
||||||
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
|
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
|
||||||
if T["max_epochs"] == -1:
|
if T["max_epochs"] == -1:
|
||||||
|
@ -305,9 +306,14 @@ def convert_vectors(
|
||||||
truncate: int,
|
truncate: int,
|
||||||
prune: int,
|
prune: int,
|
||||||
mode: str = VectorsMode.default,
|
mode: str = VectorsMode.default,
|
||||||
|
attr: str = "ORTH",
|
||||||
) -> None:
|
) -> None:
|
||||||
vectors_loc = ensure_path(vectors_loc)
|
vectors_loc = ensure_path(vectors_loc)
|
||||||
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
||||||
|
if attr != "ORTH":
|
||||||
|
raise ValueError(
|
||||||
|
"ORTH is the only attribute supported for vectors in .npz format."
|
||||||
|
)
|
||||||
nlp.vocab.vectors = Vectors(
|
nlp.vocab.vectors = Vectors(
|
||||||
strings=nlp.vocab.strings, data=numpy.load(vectors_loc.open("rb"))
|
strings=nlp.vocab.strings, data=numpy.load(vectors_loc.open("rb"))
|
||||||
)
|
)
|
||||||
|
@ -335,11 +341,15 @@ def convert_vectors(
|
||||||
nlp.vocab.vectors = Vectors(
|
nlp.vocab.vectors = Vectors(
|
||||||
strings=nlp.vocab.strings,
|
strings=nlp.vocab.strings,
|
||||||
data=vectors_data,
|
data=vectors_data,
|
||||||
|
attr=attr,
|
||||||
**floret_settings,
|
**floret_settings,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
nlp.vocab.vectors = Vectors(
|
nlp.vocab.vectors = Vectors(
|
||||||
strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys
|
strings=nlp.vocab.strings,
|
||||||
|
data=vectors_data,
|
||||||
|
keys=vector_keys,
|
||||||
|
attr=attr,
|
||||||
)
|
)
|
||||||
nlp.vocab.deduplicate_vectors()
|
nlp.vocab.deduplicate_vectors()
|
||||||
if prune >= 1 and mode != VectorsMode.floret:
|
if prune >= 1 and mode != VectorsMode.floret:
|
||||||
|
|
|
@ -518,7 +518,7 @@ def load_model_from_path(
|
||||||
if not meta:
|
if not meta:
|
||||||
meta = get_model_meta(model_path)
|
meta = get_model_meta(model_path)
|
||||||
config_path = model_path / "config.cfg"
|
config_path = model_path / "config.cfg"
|
||||||
overrides = dict_to_dot(config)
|
overrides = dict_to_dot(config, for_overrides=True)
|
||||||
config = load_config(config_path, overrides=overrides)
|
config = load_config(config_path, overrides=overrides)
|
||||||
nlp = load_model_from_config(
|
nlp = load_model_from_config(
|
||||||
config,
|
config,
|
||||||
|
@ -1486,14 +1486,19 @@ def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]:
|
def dict_to_dot(obj: Dict[str, dict], *, for_overrides: bool = False) -> Dict[str, Any]:
|
||||||
"""Convert dot notation to a dict. For example: {"token": {"pos": True,
|
"""Convert dot notation to a dict. For example: {"token": {"pos": True,
|
||||||
"_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}.
|
"_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}.
|
||||||
|
|
||||||
values (Dict[str, dict]): The dict to convert.
|
obj (Dict[str, dict]): The dict to convert.
|
||||||
|
for_overrides (bool): Whether to enable special handling for registered
|
||||||
|
functions in overrides.
|
||||||
RETURNS (Dict[str, Any]): The key/value pairs.
|
RETURNS (Dict[str, Any]): The key/value pairs.
|
||||||
"""
|
"""
|
||||||
return {".".join(key): value for key, value in walk_dict(obj)}
|
return {
|
||||||
|
".".join(key): value
|
||||||
|
for key, value in walk_dict(obj, for_overrides=for_overrides)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def dot_to_object(config: Config, section: str):
|
def dot_to_object(config: Config, section: str):
|
||||||
|
@ -1535,13 +1540,20 @@ def set_dot_to_object(config: Config, section: str, value: Any) -> None:
|
||||||
|
|
||||||
|
|
||||||
def walk_dict(
|
def walk_dict(
|
||||||
node: Dict[str, Any], parent: List[str] = []
|
node: Dict[str, Any], parent: List[str] = [], *, for_overrides: bool = False
|
||||||
) -> Iterator[Tuple[List[str], Any]]:
|
) -> Iterator[Tuple[List[str], Any]]:
|
||||||
"""Walk a dict and yield the path and values of the leaves."""
|
"""Walk a dict and yield the path and values of the leaves.
|
||||||
|
|
||||||
|
for_overrides (bool): Whether to treat registered functions that start with
|
||||||
|
@ as final values rather than dicts to traverse.
|
||||||
|
"""
|
||||||
for key, value in node.items():
|
for key, value in node.items():
|
||||||
key_parent = [*parent, key]
|
key_parent = [*parent, key]
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict) and (
|
||||||
yield from walk_dict(value, key_parent)
|
not for_overrides
|
||||||
|
or not any(value_key.startswith("@") for value_key in value)
|
||||||
|
):
|
||||||
|
yield from walk_dict(value, key_parent, for_overrides=for_overrides)
|
||||||
else:
|
else:
|
||||||
yield (key_parent, value)
|
yield (key_parent, value)
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,8 @@
|
||||||
cimport numpy as np
|
|
||||||
from cython.operator cimport dereference as deref
|
from cython.operator cimport dereference as deref
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
from libc.stdint cimport uint32_t, uint64_t
|
||||||
from libcpp.set cimport set as cppset
|
from libcpp.set cimport set as cppset
|
||||||
from murmurhash.mrmr cimport hash128_x64
|
from murmurhash.mrmr cimport hash128_x64
|
||||||
|
|
||||||
import functools
|
|
||||||
import warnings
|
import warnings
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import cast
|
from typing import cast
|
||||||
|
@ -15,9 +13,11 @@ from thinc.api import Ops, get_array_module, get_current_ops
|
||||||
from thinc.backends import get_array_ops
|
from thinc.backends import get_array_ops
|
||||||
from thinc.types import Floats2d
|
from thinc.types import Floats2d
|
||||||
|
|
||||||
|
from .attrs cimport ORTH, attr_id_t
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
|
|
||||||
from . import util
|
from . import util
|
||||||
|
from .attrs import IDS
|
||||||
from .errors import Errors, Warnings
|
from .errors import Errors, Warnings
|
||||||
from .strings import get_string_id
|
from .strings import get_string_id
|
||||||
|
|
||||||
|
@ -63,6 +63,7 @@ cdef class Vectors:
|
||||||
cdef readonly uint32_t hash_seed
|
cdef readonly uint32_t hash_seed
|
||||||
cdef readonly unicode bow
|
cdef readonly unicode bow
|
||||||
cdef readonly unicode eow
|
cdef readonly unicode eow
|
||||||
|
cdef readonly attr_id_t attr
|
||||||
|
|
||||||
def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
|
def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
|
||||||
"""Create a new vector store.
|
"""Create a new vector store.
|
||||||
|
@ -78,6 +79,8 @@ cdef class Vectors:
|
||||||
hash_seed (int): The floret hash seed (default: 0).
|
hash_seed (int): The floret hash seed (default: 0).
|
||||||
bow (str): The floret BOW string (default: "<").
|
bow (str): The floret BOW string (default: "<").
|
||||||
eow (str): The floret EOW string (default: ">").
|
eow (str): The floret EOW string (default: ">").
|
||||||
|
attr (Union[int, str]): The token attribute for the vector keys
|
||||||
|
(default: "ORTH").
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors#init
|
DOCS: https://spacy.io/api/vectors#init
|
||||||
"""
|
"""
|
||||||
|
@ -100,10 +103,18 @@ cdef class Vectors:
|
||||||
self.hash_seed = hash_seed
|
self.hash_seed = hash_seed
|
||||||
self.bow = bow
|
self.bow = bow
|
||||||
self.eow = eow
|
self.eow = eow
|
||||||
|
if isinstance(attr, (int, long)):
|
||||||
|
self.attr = attr
|
||||||
|
else:
|
||||||
|
attr = attr.upper()
|
||||||
|
if attr == "TEXT":
|
||||||
|
attr = "ORTH"
|
||||||
|
self.attr = IDS.get(attr, ORTH)
|
||||||
|
|
||||||
if self.mode == Mode.default:
|
if self.mode == Mode.default:
|
||||||
if data is None:
|
if data is None:
|
||||||
if shape is None:
|
if shape is None:
|
||||||
shape = (0,0)
|
shape = (0, 0)
|
||||||
ops = get_current_ops()
|
ops = get_current_ops()
|
||||||
data = ops.xp.zeros(shape, dtype="f")
|
data = ops.xp.zeros(shape, dtype="f")
|
||||||
self._unset = cppset[int]({i for i in range(data.shape[0])})
|
self._unset = cppset[int]({i for i in range(data.shape[0])})
|
||||||
|
@ -246,8 +257,7 @@ cdef class Vectors:
|
||||||
return (
|
return (
|
||||||
self.shape == other.shape
|
self.shape == other.shape
|
||||||
and self.key2row == other.key2row
|
and self.key2row == other.key2row
|
||||||
and self.to_bytes(exclude=["strings"])
|
and self.to_bytes(exclude=["strings"]) == other.to_bytes(exclude=["strings"])
|
||||||
== other.to_bytes(exclude=["strings"])
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def resize(self, shape, inplace=False):
|
def resize(self, shape, inplace=False):
|
||||||
|
@ -504,11 +514,12 @@ cdef class Vectors:
|
||||||
# vectors e.g. (10000, 300)
|
# vectors e.g. (10000, 300)
|
||||||
# sims e.g. (1024, 10000)
|
# sims e.g. (1024, 10000)
|
||||||
sims = xp.dot(batch, vectors.T)
|
sims = xp.dot(batch, vectors.T)
|
||||||
best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:,-n:]
|
best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:, -n:]
|
||||||
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:]
|
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:, -n:]
|
||||||
|
|
||||||
if sort and n >= 2:
|
if sort and n >= 2:
|
||||||
sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
|
sorted_index = xp.arange(scores.shape[0])[:, None][i:i+batch_size], \
|
||||||
|
xp.argsort(scores[i:i+batch_size], axis=1)[:, ::-1]
|
||||||
scores[i:i+batch_size] = scores[sorted_index]
|
scores[i:i+batch_size] = scores[sorted_index]
|
||||||
best_rows[i:i+batch_size] = best_rows[sorted_index]
|
best_rows[i:i+batch_size] = best_rows[sorted_index]
|
||||||
|
|
||||||
|
@ -522,8 +533,12 @@ cdef class Vectors:
|
||||||
|
|
||||||
numpy_rows = get_current_ops().to_numpy(best_rows)
|
numpy_rows = get_current_ops().to_numpy(best_rows)
|
||||||
keys = xp.asarray(
|
keys = xp.asarray(
|
||||||
[[row2key[row] for row in numpy_rows[i] if row in row2key]
|
[
|
||||||
for i in range(len(queries)) ], dtype="uint64")
|
[row2key[row] for row in numpy_rows[i] if row in row2key]
|
||||||
|
for i in range(len(queries))
|
||||||
|
],
|
||||||
|
dtype="uint64"
|
||||||
|
)
|
||||||
return (keys, best_rows, scores)
|
return (keys, best_rows, scores)
|
||||||
|
|
||||||
def to_ops(self, ops: Ops):
|
def to_ops(self, ops: Ops):
|
||||||
|
@ -543,6 +558,7 @@ cdef class Vectors:
|
||||||
"hash_seed": self.hash_seed,
|
"hash_seed": self.hash_seed,
|
||||||
"bow": self.bow,
|
"bow": self.bow,
|
||||||
"eow": self.eow,
|
"eow": self.eow,
|
||||||
|
"attr": self.attr,
|
||||||
}
|
}
|
||||||
|
|
||||||
def _set_cfg(self, cfg):
|
def _set_cfg(self, cfg):
|
||||||
|
@ -553,6 +569,7 @@ cdef class Vectors:
|
||||||
self.hash_seed = cfg.get("hash_seed", 0)
|
self.hash_seed = cfg.get("hash_seed", 0)
|
||||||
self.bow = cfg.get("bow", "<")
|
self.bow = cfg.get("bow", "<")
|
||||||
self.eow = cfg.get("eow", ">")
|
self.eow = cfg.get("eow", ">")
|
||||||
|
self.attr = cfg.get("attr", ORTH)
|
||||||
|
|
||||||
def to_disk(self, path, *, exclude=tuple()):
|
def to_disk(self, path, *, exclude=tuple()):
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
@ -564,9 +581,9 @@ cdef class Vectors:
|
||||||
"""
|
"""
|
||||||
xp = get_array_module(self.data)
|
xp = get_array_module(self.data)
|
||||||
if xp is numpy:
|
if xp is numpy:
|
||||||
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
|
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) # no-cython-lint
|
||||||
else:
|
else:
|
||||||
save_array = lambda arr, file_: xp.save(file_, arr)
|
save_array = lambda arr, file_: xp.save(file_, arr) # no-cython-lint
|
||||||
|
|
||||||
def save_vectors(path):
|
def save_vectors(path):
|
||||||
# the source of numpy.save indicates that the file object is closed after use.
|
# the source of numpy.save indicates that the file object is closed after use.
|
||||||
|
|
|
@ -1,6 +1,4 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
from libc.string cimport memcpy
|
|
||||||
|
|
||||||
import functools
|
import functools
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -19,7 +17,6 @@ from .errors import Errors
|
||||||
from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop
|
from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop
|
||||||
from .lang.norm_exceptions import BASE_NORMS
|
from .lang.norm_exceptions import BASE_NORMS
|
||||||
from .lookups import Lookups
|
from .lookups import Lookups
|
||||||
from .util import registry
|
|
||||||
from .vectors import Mode as VectorsMode
|
from .vectors import Mode as VectorsMode
|
||||||
from .vectors import Vectors
|
from .vectors import Vectors
|
||||||
|
|
||||||
|
@ -50,8 +47,15 @@ cdef class Vocab:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vocab
|
DOCS: https://spacy.io/api/vocab
|
||||||
"""
|
"""
|
||||||
def __init__(self, lex_attr_getters=None, strings=None, lookups=None,
|
def __init__(
|
||||||
oov_prob=-20., writing_system=None, get_noun_chunks=None):
|
self,
|
||||||
|
lex_attr_getters=None,
|
||||||
|
strings=None,
|
||||||
|
lookups=None,
|
||||||
|
oov_prob=-20.,
|
||||||
|
writing_system=None,
|
||||||
|
get_noun_chunks=None
|
||||||
|
):
|
||||||
"""Create the vocabulary.
|
"""Create the vocabulary.
|
||||||
|
|
||||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to
|
lex_attr_getters (dict): A dictionary mapping attribute IDs to
|
||||||
|
@ -150,7 +154,6 @@ cdef class Vocab:
|
||||||
cdef LexemeC* lex
|
cdef LexemeC* lex
|
||||||
cdef hash_t key = self.strings[string]
|
cdef hash_t key = self.strings[string]
|
||||||
lex = <LexemeC*>self._by_orth.get(key)
|
lex = <LexemeC*>self._by_orth.get(key)
|
||||||
cdef size_t addr
|
|
||||||
if lex != NULL:
|
if lex != NULL:
|
||||||
assert lex.orth in self.strings
|
assert lex.orth in self.strings
|
||||||
if lex.orth != key:
|
if lex.orth != key:
|
||||||
|
@ -352,8 +355,13 @@ cdef class Vocab:
|
||||||
self[orth]
|
self[orth]
|
||||||
# Make prob negative so it sorts by rank ascending
|
# Make prob negative so it sorts by rank ascending
|
||||||
# (key2row contains the rank)
|
# (key2row contains the rank)
|
||||||
priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth)
|
priority = []
|
||||||
for lex in self if lex.orth in self.vectors.key2row]
|
cdef Lexeme lex
|
||||||
|
cdef attr_t value
|
||||||
|
for lex in self:
|
||||||
|
value = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||||
|
if value in self.vectors.key2row:
|
||||||
|
priority.append((-lex.prob, self.vectors.key2row[value], value))
|
||||||
priority.sort()
|
priority.sort()
|
||||||
indices = xp.asarray([i for (prob, i, key) in priority], dtype="uint64")
|
indices = xp.asarray([i for (prob, i, key) in priority], dtype="uint64")
|
||||||
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
|
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
|
||||||
|
@ -386,8 +394,10 @@ cdef class Vocab:
|
||||||
"""
|
"""
|
||||||
if isinstance(orth, str):
|
if isinstance(orth, str):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth)
|
||||||
if self.has_vector(orth):
|
cdef Lexeme lex = self[orth]
|
||||||
return self.vectors[orth]
|
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||||
|
if self.has_vector(key):
|
||||||
|
return self.vectors[key]
|
||||||
xp = get_array_module(self.vectors.data)
|
xp = get_array_module(self.vectors.data)
|
||||||
vectors = xp.zeros((self.vectors_length,), dtype="f")
|
vectors = xp.zeros((self.vectors_length,), dtype="f")
|
||||||
return vectors
|
return vectors
|
||||||
|
@ -403,15 +413,16 @@ cdef class Vocab:
|
||||||
"""
|
"""
|
||||||
if isinstance(orth, str):
|
if isinstance(orth, str):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth)
|
||||||
if self.vectors.is_full and orth not in self.vectors:
|
cdef Lexeme lex = self[orth]
|
||||||
|
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||||
|
if self.vectors.is_full and key not in self.vectors:
|
||||||
new_rows = max(100, int(self.vectors.shape[0]*1.3))
|
new_rows = max(100, int(self.vectors.shape[0]*1.3))
|
||||||
if self.vectors.shape[1] == 0:
|
if self.vectors.shape[1] == 0:
|
||||||
width = vector.size
|
width = vector.size
|
||||||
else:
|
else:
|
||||||
width = self.vectors.shape[1]
|
width = self.vectors.shape[1]
|
||||||
self.vectors.resize((new_rows, width))
|
self.vectors.resize((new_rows, width))
|
||||||
lex = self[orth] # Add word to vocab if necessary
|
row = self.vectors.add(key, vector=vector)
|
||||||
row = self.vectors.add(orth, vector=vector)
|
|
||||||
if row >= 0:
|
if row >= 0:
|
||||||
lex.rank = row
|
lex.rank = row
|
||||||
|
|
||||||
|
@ -426,7 +437,9 @@ cdef class Vocab:
|
||||||
"""
|
"""
|
||||||
if isinstance(orth, str):
|
if isinstance(orth, str):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth)
|
||||||
return orth in self.vectors
|
cdef Lexeme lex = self[orth]
|
||||||
|
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||||
|
return key in self.vectors
|
||||||
|
|
||||||
property lookups:
|
property lookups:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -440,7 +453,6 @@ cdef class Vocab:
|
||||||
self.lookups.get_table("lexeme_norm"),
|
self.lookups.get_table("lexeme_norm"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def to_disk(self, path, *, exclude=tuple()):
|
def to_disk(self, path, *, exclude=tuple()):
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
|
@ -453,7 +465,6 @@ cdef class Vocab:
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
path.mkdir()
|
path.mkdir()
|
||||||
setters = ["strings", "vectors"]
|
|
||||||
if "strings" not in exclude:
|
if "strings" not in exclude:
|
||||||
self.strings.to_disk(path / "strings.json")
|
self.strings.to_disk(path / "strings.json")
|
||||||
if "vectors" not in exclude:
|
if "vectors" not in exclude:
|
||||||
|
@ -472,7 +483,6 @@ cdef class Vocab:
|
||||||
DOCS: https://spacy.io/api/vocab#to_disk
|
DOCS: https://spacy.io/api/vocab#to_disk
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
getters = ["strings", "vectors"]
|
|
||||||
if "strings" not in exclude:
|
if "strings" not in exclude:
|
||||||
self.strings.from_disk(path / "strings.json") # TODO: add exclude?
|
self.strings.from_disk(path / "strings.json") # TODO: add exclude?
|
||||||
if "vectors" not in exclude:
|
if "vectors" not in exclude:
|
||||||
|
|
|
@ -303,7 +303,7 @@ mapped to a zero vector. See the documentation on
|
||||||
| `nM` | The width of the static vectors. ~~Optional[int]~~ |
|
| `nM` | The width of the static vectors. ~~Optional[int]~~ |
|
||||||
| `dropout` | Optional dropout rate. If set, it's applied per dimension over the whole batch. Defaults to `None`. ~~Optional[float]~~ |
|
| `dropout` | Optional dropout rate. If set, it's applied per dimension over the whole batch. Defaults to `None`. ~~Optional[float]~~ |
|
||||||
| `init_W` | The [initialization function](https://thinc.ai/docs/api-initializers). Defaults to [`glorot_uniform_init`](https://thinc.ai/docs/api-initializers#glorot_uniform_init). ~~Callable[[Ops, Tuple[int, ...]]], FloatsXd]~~ |
|
| `init_W` | The [initialization function](https://thinc.ai/docs/api-initializers). Defaults to [`glorot_uniform_init`](https://thinc.ai/docs/api-initializers#glorot_uniform_init). ~~Callable[[Ops, Tuple[int, ...]]], FloatsXd]~~ |
|
||||||
| `key_attr` | Defaults to `"ORTH"`. ~~str~~ |
|
| `key_attr` | This setting is ignored in spaCy v3.6+. To set a custom key attribute for vectors, configure it through [`Vectors`](/api/vectors) or [`spacy init vectors`](/api/cli#init-vectors). Defaults to `"ORTH"`. ~~str~~ |
|
||||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |
|
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |
|
||||||
|
|
||||||
### spacy.FeatureExtractor.v1 {id="FeatureExtractor"}
|
### spacy.FeatureExtractor.v1 {id="FeatureExtractor"}
|
||||||
|
|
|
@ -876,7 +876,7 @@ token-to-vector embedding component like [`Tok2Vec`](/api/tok2vec) or
|
||||||
training a pipeline with components sourced from an existing pipeline: if
|
training a pipeline with components sourced from an existing pipeline: if
|
||||||
multiple components (e.g. tagger, parser, NER) listen to the same
|
multiple components (e.g. tagger, parser, NER) listen to the same
|
||||||
token-to-vector component, but some of them are frozen and not updated, their
|
token-to-vector component, but some of them are frozen and not updated, their
|
||||||
performance may degrade significally as the token-to-vector component is updated
|
performance may degrade significantly as the token-to-vector component is updated
|
||||||
with new data. To prevent this, listeners can be replaced with a standalone
|
with new data. To prevent this, listeners can be replaced with a standalone
|
||||||
token-to-vector layer that is owned by the component and doesn't change if the
|
token-to-vector layer that is owned by the component and doesn't change if the
|
||||||
component isn't updated.
|
component isn't updated.
|
||||||
|
|
|
@ -60,7 +60,7 @@ architectures and their arguments and hyperparameters.
|
||||||
| `model` | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ |
|
| `model` | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ |
|
||||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||||
| `threshold` | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ |
|
| `threshold` | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ |
|
||||||
| `max_length` | Maximum length of the produced spans, defaults to `None` meaning unlimited length. ~~Optional[int]~~ |
|
| `max_length` | Maximum length of the produced spans, defaults to `25`. ~~Optional[int]~~ |
|
||||||
| `min_length` | Minimum length of the produced spans, defaults to `None` meaning shortest span length is 1. ~~Optional[int]~~ |
|
| `min_length` | Minimum length of the produced spans, defaults to `None` meaning shortest span length is 1. ~~Optional[int]~~ |
|
||||||
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
||||||
|
|
||||||
|
|
|
@ -59,6 +59,7 @@ modified later.
|
||||||
| `hash_seed` <Tag variant="new">3.2</Tag> | The floret hash seed (default: `0`). ~~int~~ |
|
| `hash_seed` <Tag variant="new">3.2</Tag> | The floret hash seed (default: `0`). ~~int~~ |
|
||||||
| `bow` <Tag variant="new">3.2</Tag> | The floret BOW string (default: `"<"`). ~~str~~ |
|
| `bow` <Tag variant="new">3.2</Tag> | The floret BOW string (default: `"<"`). ~~str~~ |
|
||||||
| `eow` <Tag variant="new">3.2</Tag> | The floret EOW string (default: `">"`). ~~str~~ |
|
| `eow` <Tag variant="new">3.2</Tag> | The floret EOW string (default: `">"`). ~~str~~ |
|
||||||
|
| `attr` <Tag variant="new">3.6</Tag> | The token attribute for the vector keys (default: `"ORTH"`). ~~Union[int, str]~~ |
|
||||||
|
|
||||||
## Vectors.\_\_getitem\_\_ {id="getitem",tag="method"}
|
## Vectors.\_\_getitem\_\_ {id="getitem",tag="method"}
|
||||||
|
|
||||||
|
@ -453,7 +454,8 @@ Load state from a binary string.
|
||||||
## Attributes {id="attributes"}
|
## Attributes {id="attributes"}
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `data` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
|
| `data` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
|
||||||
| `key2row` | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~ |
|
| `key2row` | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~ |
|
||||||
| `keys` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
|
| `keys` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
|
||||||
|
| `attr` <Tag variant="new">3.6</Tag> | The token attribute for the vector keys. ~~int~~ |
|
||||||
|
|
|
@ -113,7 +113,7 @@ print(doc[2].morph) # 'Case=Nom|Person=2|PronType=Prs'
|
||||||
print(doc[2].pos_) # 'PRON'
|
print(doc[2].pos_) # 'PRON'
|
||||||
```
|
```
|
||||||
|
|
||||||
## Lemmatization {id="lemmatization",model="lemmatizer",version="3"}
|
## Lemmatization {id="lemmatization",version="3"}
|
||||||
|
|
||||||
spaCy provides two pipeline components for lemmatization:
|
spaCy provides two pipeline components for lemmatization:
|
||||||
|
|
||||||
|
@ -170,7 +170,7 @@ nlp = spacy.blank("sv")
|
||||||
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||||
```
|
```
|
||||||
|
|
||||||
### Rule-based lemmatizer {id="lemmatizer-rule"}
|
### Rule-based lemmatizer {id="lemmatizer-rule",model="morphologizer"}
|
||||||
|
|
||||||
When training pipelines that include a component that assigns part-of-speech
|
When training pipelines that include a component that assigns part-of-speech
|
||||||
tags (a morphologizer or a tagger with a [POS mapping](#mappings-exceptions)), a
|
tags (a morphologizer or a tagger with a [POS mapping](#mappings-exceptions)), a
|
||||||
|
@ -194,7 +194,7 @@ information, without consulting the context of the token. The rule-based
|
||||||
lemmatizer also accepts list-based exception files. For English, these are
|
lemmatizer also accepts list-based exception files. For English, these are
|
||||||
acquired from [WordNet](https://wordnet.princeton.edu/).
|
acquired from [WordNet](https://wordnet.princeton.edu/).
|
||||||
|
|
||||||
### Trainable lemmatizer
|
### Trainable lemmatizer {id="lemmatizer-train",model="trainable_lemmatizer"}
|
||||||
|
|
||||||
The [`EditTreeLemmatizer`](/api/edittreelemmatizer) can learn form-to-lemma
|
The [`EditTreeLemmatizer`](/api/edittreelemmatizer) can learn form-to-lemma
|
||||||
transformations from a training corpus that includes lemma annotations. This
|
transformations from a training corpus that includes lemma annotations. This
|
||||||
|
|
|
@ -11,7 +11,6 @@ menu:
|
||||||
- ['Custom Functions', 'custom-functions']
|
- ['Custom Functions', 'custom-functions']
|
||||||
- ['Initialization', 'initialization']
|
- ['Initialization', 'initialization']
|
||||||
- ['Data Utilities', 'data']
|
- ['Data Utilities', 'data']
|
||||||
- ['Parallel Training', 'parallel-training']
|
|
||||||
- ['Internal API', 'api']
|
- ['Internal API', 'api']
|
||||||
---
|
---
|
||||||
|
|
||||||
|
@ -1565,77 +1564,6 @@ token-based annotations like the dependency parse or entity labels, you'll need
|
||||||
to take care to adjust the `Example` object so its annotations match and remain
|
to take care to adjust the `Example` object so its annotations match and remain
|
||||||
valid.
|
valid.
|
||||||
|
|
||||||
## Parallel & distributed training with Ray {id="parallel-training"}
|
|
||||||
|
|
||||||
> #### Installation
|
|
||||||
>
|
|
||||||
> ```bash
|
|
||||||
> $ pip install -U %%SPACY_PKG_NAME[ray]%%SPACY_PKG_FLAGS
|
|
||||||
> # Check that the CLI is registered
|
|
||||||
> $ python -m spacy ray --help
|
|
||||||
> ```
|
|
||||||
|
|
||||||
[Ray](https://ray.io/) is a fast and simple framework for building and running
|
|
||||||
**distributed applications**. You can use Ray to train spaCy on one or more
|
|
||||||
remote machines, potentially speeding up your training process. Parallel
|
|
||||||
training won't always be faster though – it depends on your batch size, models,
|
|
||||||
and hardware.
|
|
||||||
|
|
||||||
<Infobox variant="warning">
|
|
||||||
|
|
||||||
To use Ray with spaCy, you need the
|
|
||||||
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
|
|
||||||
Installing the package will automatically add the `ray` command to the spaCy
|
|
||||||
CLI.
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
|
|
||||||
[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
|
|
||||||
setup. You can optionally set the `--address` option to point to your Ray
|
|
||||||
cluster. If it's not set, Ray will run locally.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python -m spacy ray train config.cfg --n-workers 2
|
|
||||||
```
|
|
||||||
|
|
||||||
<Project id="integrations/ray">
|
|
||||||
|
|
||||||
Get started with parallel training using our project template. It trains a
|
|
||||||
simple model on a Universal Dependencies Treebank and lets you parallelize the
|
|
||||||
training with Ray.
|
|
||||||
|
|
||||||
</Project>
|
|
||||||
|
|
||||||
### How parallel training works {id="parallel-training-details"}
|
|
||||||
|
|
||||||
Each worker receives a shard of the **data** and builds a copy of the **model
|
|
||||||
and optimizer** from the [`config.cfg`](#config). It also has a communication
|
|
||||||
channel to **pass gradients and parameters** to the other workers. Additionally,
|
|
||||||
each worker is given ownership of a subset of the parameter arrays. Every
|
|
||||||
parameter array is owned by exactly one worker, and the workers are given a
|
|
||||||
mapping so they know which worker owns which parameter.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
As training proceeds, every worker will be computing gradients for **all** of
|
|
||||||
the model parameters. When they compute gradients for parameters they don't own,
|
|
||||||
they'll **send them to the worker** that does own that parameter, along with a
|
|
||||||
version identifier so that the owner can decide whether to discard the gradient.
|
|
||||||
Workers use the gradients they receive and the ones they compute locally to
|
|
||||||
update the parameters they own, and then broadcast the updated array and a new
|
|
||||||
version ID to the other workers.
|
|
||||||
|
|
||||||
This training procedure is **asynchronous** and **non-blocking**. Workers always
|
|
||||||
push their gradient increments and parameter updates, they do not have to pull
|
|
||||||
them and block on the result, so the transfers can happen in the background,
|
|
||||||
overlapped with the actual training work. The workers also do not have to stop
|
|
||||||
and wait for each other ("synchronize") at the start of each batch. This is very
|
|
||||||
useful for spaCy, because spaCy is often trained on long documents, which means
|
|
||||||
**batches can vary in size** significantly. Uneven workloads make synchronous
|
|
||||||
gradient descent inefficient, because if one batch is slow, all of the other
|
|
||||||
workers are stuck waiting for it to complete before they can continue.
|
|
||||||
|
|
||||||
## Internal training API {id="api"}
|
## Internal training API {id="api"}
|
||||||
|
|
||||||
<Infobox variant="danger">
|
<Infobox variant="danger">
|
||||||
|
|
143
website/docs/usage/v3-6.mdx
Normal file
143
website/docs/usage/v3-6.mdx
Normal file
|
@ -0,0 +1,143 @@
|
||||||
|
---
|
||||||
|
title: What's New in v3.6
|
||||||
|
teaser: New features and how to upgrade
|
||||||
|
menu:
|
||||||
|
- ['New Features', 'features']
|
||||||
|
- ['Upgrading Notes', 'upgrading']
|
||||||
|
---
|
||||||
|
|
||||||
|
## New features {id="features",hidden="true"}
|
||||||
|
|
||||||
|
spaCy v3.6 adds the new [`SpanFinder`](/api/spanfinder) component to the core
|
||||||
|
spaCy library and new trained pipelines for Slovenian.
|
||||||
|
|
||||||
|
### SpanFinder {id="spanfinder"}
|
||||||
|
|
||||||
|
The [`SpanFinder`](/api/spanfinder) component identifies potentially
|
||||||
|
overlapping, unlabeled spans by predicting span start and end tokens. It is
|
||||||
|
intended for use in combination with a component like
|
||||||
|
[`SpanCategorizer`](/api/spancategorizer) that may further filter or label the
|
||||||
|
spans. See our
|
||||||
|
[Spancat blog post](https://explosion.ai/blog/spancat#span-finder) for a more
|
||||||
|
detailed introduction to the span finder.
|
||||||
|
|
||||||
|
To train a pipeline with `span_finder` + `spancat`, remember to add
|
||||||
|
`span_finder` (and its `tok2vec` or `transformer` if required) to
|
||||||
|
`[training.annotating_components]` so that the `spancat` component can be
|
||||||
|
trained directly from its predictions:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[nlp]
|
||||||
|
pipeline = ["tok2vec","span_finder","spancat"]
|
||||||
|
|
||||||
|
[training]
|
||||||
|
annotating_components = ["tok2vec","span_finder"]
|
||||||
|
```
|
||||||
|
|
||||||
|
In practice it can be helpful to initially train the `span_finder` separately
|
||||||
|
before [sourcing](/usage/processing-pipelines#sourced-components) it (along with
|
||||||
|
its `tok2vec`) into the `spancat` pipeline for further training. Otherwise the
|
||||||
|
memory usage can spike for `spancat` in the first few training steps if the
|
||||||
|
`span_finder` makes a large number of predictions.
|
||||||
|
|
||||||
|
### Additional features and improvements {id="additional-features-and-improvements"}
|
||||||
|
|
||||||
|
- Language updates:
|
||||||
|
- Add initial support for Malay.
|
||||||
|
- Update Latin defaults to support noun chunks, update lexical/tokenizer
|
||||||
|
settings and add example sentences.
|
||||||
|
- Support `spancat_singlelabel` in `spacy debug data` CLI.
|
||||||
|
- Add `doc.spans` rendering to `spacy evaluate` CLI displaCy output.
|
||||||
|
- Support custom token/lexeme attribute for vectors.
|
||||||
|
- Add option to return scores separately keyed by component name with
|
||||||
|
`spacy evaluate --per-component`, `Language.evaluate(per_component=True)` and
|
||||||
|
`Scorer.score(per_component=True)`. This is useful when the pipeline contains
|
||||||
|
more than one of the same component like `textcat` that may have overlapping
|
||||||
|
score keys.
|
||||||
|
- Typing updates for `PhraseMatcher` and `SpanGroup`.
|
||||||
|
|
||||||
|
## Trained pipelines {id="pipelines"}
|
||||||
|
|
||||||
|
### New trained pipelines {id="new-pipelines"}
|
||||||
|
|
||||||
|
v3.6 introduces new pipelines for Slovenian, which use the trainable lemmatizer
|
||||||
|
and [floret vectors](https://github.com/explosion/floret).
|
||||||
|
|
||||||
|
| Package | UPOS | Parser LAS | NER F |
|
||||||
|
| ------------------------------------------------- | ---: | ---------: | ----: |
|
||||||
|
| [`sl_core_news_sm`](/models/sl#sl_core_news_sm) | 96.9 | 82.1 | 62.9 |
|
||||||
|
| [`sl_core_news_md`](/models/sl#sl_core_news_md) | 97.6 | 84.3 | 73.5 |
|
||||||
|
| [`sl_core_news_lg`](/models/sl#sl_core_news_lg) | 97.7 | 84.3 | 79.0 |
|
||||||
|
| [`sl_core_news_trf`](/models/sl#sl_core_news_trf) | 99.0 | 91.7 | 90.0 |
|
||||||
|
|
||||||
|
### Pipeline updates {id="pipeline-updates"}
|
||||||
|
|
||||||
|
The English pipelines have been updated to improve handling of contractions with
|
||||||
|
various apostrophes and to lemmatize "get" as a passive auxiliary.
|
||||||
|
|
||||||
|
The Danish pipeline `da_core_news_trf` has been updated to use
|
||||||
|
[`vesteinn/DanskBERT`](https://huggingface.co/vesteinn/DanskBERT) with
|
||||||
|
performance improvements across the board.
|
||||||
|
|
||||||
|
## Notes about upgrading from v3.5 {id="upgrading"}
|
||||||
|
|
||||||
|
### SpanGroup spans are now required to be from the same doc {id="spangroup-spans"}
|
||||||
|
|
||||||
|
When initializing a `SpanGroup`, there is a new check to verify that all added
|
||||||
|
spans refer to the current doc. Without this check, it was possible to run into
|
||||||
|
string store or other errors.
|
||||||
|
|
||||||
|
One place this may crop up is when creating `Example` objects for training with
|
||||||
|
custom spans:
|
||||||
|
|
||||||
|
```diff
|
||||||
|
doc = Doc(nlp.vocab, words=tokens) # predicted doc
|
||||||
|
example = Example.from_dict(doc, {"ner": iob_tags})
|
||||||
|
# use the reference doc when creating reference spans
|
||||||
|
- span = Span(doc, 0, 5, "ORG")
|
||||||
|
+ span = Span(example.reference, 0, 5, "ORG")
|
||||||
|
example.reference.spans[spans_key] = [span]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pipeline package version compatibility {id="version-compat"}
|
||||||
|
|
||||||
|
> #### Using legacy implementations
|
||||||
|
>
|
||||||
|
> In spaCy v3, you'll still be able to load and reference legacy implementations
|
||||||
|
> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
|
||||||
|
> components or architectures change and newer versions are available in the
|
||||||
|
> core library.
|
||||||
|
|
||||||
|
When you're loading a pipeline package trained with an earlier version of spaCy
|
||||||
|
v3, you will see a warning telling you that the pipeline may be incompatible.
|
||||||
|
This doesn't necessarily have to be true, but we recommend running your
|
||||||
|
pipelines against your test suite or evaluation data to make sure there are no
|
||||||
|
unexpected results.
|
||||||
|
|
||||||
|
If you're using one of the [trained pipelines](/models) we provide, you should
|
||||||
|
run [`spacy download`](/api/cli#download) to update to the latest version. To
|
||||||
|
see an overview of all installed packages and their compatibility, you can run
|
||||||
|
[`spacy validate`](/api/cli#validate).
|
||||||
|
|
||||||
|
If you've trained your own custom pipeline and you've confirmed that it's still
|
||||||
|
working as expected, you can update the spaCy version requirements in the
|
||||||
|
[`meta.json`](/api/data-formats#meta):
|
||||||
|
|
||||||
|
```diff
|
||||||
|
- "spacy_version": ">=3.5.0,<3.6.0",
|
||||||
|
+ "spacy_version": ">=3.5.0,<3.7.0",
|
||||||
|
```
|
||||||
|
|
||||||
|
### Updating v3.5 configs
|
||||||
|
|
||||||
|
To update a config from spaCy v3.5 with the new v3.6 settings, run
|
||||||
|
[`init fill-config`](/api/cli#init-fill-config):
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy init fill-config config-v3.5.cfg config-v3.6.cfg
|
||||||
|
```
|
||||||
|
|
||||||
|
In many cases ([`spacy train`](/api/cli#train),
|
||||||
|
[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
|
||||||
|
automatically, but you'll need to fill in the new settings to run
|
||||||
|
[`debug config`](/api/cli#debug-config) and [`debug data`](/api/cli#debug-data).
|
|
@ -222,7 +222,9 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"code": "la",
|
"code": "la",
|
||||||
"name": "Latin"
|
"name": "Latin",
|
||||||
|
"example": "In principio creavit Deus caelum et terram.",
|
||||||
|
"has_examples": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"code": "lb",
|
"code": "lb",
|
||||||
|
@ -339,7 +341,10 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"code": "sl",
|
"code": "sl",
|
||||||
"name": "Slovenian"
|
"name": "Slovenian",
|
||||||
|
"example": "France Prešeren je umrl 8. februarja 1849 v Kranju",
|
||||||
|
"has_examples": true,
|
||||||
|
"models": ["sl_core_news_sm", "sl_core_news_md", "sl_core_news_lg", "sl_core_news_trf"]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"code": "sq",
|
"code": "sq",
|
||||||
|
|
|
@ -14,7 +14,8 @@
|
||||||
{ "text": "New in v3.2", "url": "/usage/v3-2" },
|
{ "text": "New in v3.2", "url": "/usage/v3-2" },
|
||||||
{ "text": "New in v3.3", "url": "/usage/v3-3" },
|
{ "text": "New in v3.3", "url": "/usage/v3-3" },
|
||||||
{ "text": "New in v3.4", "url": "/usage/v3-4" },
|
{ "text": "New in v3.4", "url": "/usage/v3-4" },
|
||||||
{ "text": "New in v3.5", "url": "/usage/v3-5" }
|
{ "text": "New in v3.5", "url": "/usage/v3-5" },
|
||||||
|
{ "text": "New in v3.6", "url": "/usage/v3-6" }
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -27,7 +27,7 @@
|
||||||
"indexName": "spacy"
|
"indexName": "spacy"
|
||||||
},
|
},
|
||||||
"binderUrl": "explosion/spacy-io-binder",
|
"binderUrl": "explosion/spacy-io-binder",
|
||||||
"binderVersion": "3.5",
|
"binderVersion": "3.6",
|
||||||
"sections": [
|
"sections": [
|
||||||
{ "id": "usage", "title": "Usage Documentation", "theme": "blue" },
|
{ "id": "usage", "title": "Usage Documentation", "theme": "blue" },
|
||||||
{ "id": "models", "title": "Models Documentation", "theme": "blue" },
|
{ "id": "models", "title": "Models Documentation", "theme": "blue" },
|
||||||
|
|
|
@ -4376,7 +4376,7 @@
|
||||||
"code_example": [
|
"code_example": [
|
||||||
"import spacy",
|
"import spacy",
|
||||||
"",
|
"",
|
||||||
"nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\"])",
|
"nlp = spacy.load(\"en_core_web_sm\", exclude=[\"ner\"])",
|
||||||
"nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})",
|
"nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})",
|
||||||
"",
|
"",
|
||||||
"text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\",
|
"text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\",
|
||||||
|
|
|
@ -13,6 +13,8 @@ import 'prismjs/components/prism-json.min.js'
|
||||||
import 'prismjs/components/prism-markdown.min.js'
|
import 'prismjs/components/prism-markdown.min.js'
|
||||||
import 'prismjs/components/prism-python.min.js'
|
import 'prismjs/components/prism-python.min.js'
|
||||||
import 'prismjs/components/prism-yaml.min.js'
|
import 'prismjs/components/prism-yaml.min.js'
|
||||||
|
import 'prismjs/components/prism-docker.min.js'
|
||||||
|
import 'prismjs/components/prism-r.min.js'
|
||||||
|
|
||||||
import { isString } from './util'
|
import { isString } from './util'
|
||||||
import Link, { OptionalLink } from './link'
|
import Link, { OptionalLink } from './link'
|
||||||
|
@ -172,7 +174,7 @@ const convertLine = ({ line, prompt, lang }) => {
|
||||||
return handlePromot({ lineFlat, prompt })
|
return handlePromot({ lineFlat, prompt })
|
||||||
}
|
}
|
||||||
|
|
||||||
return lang === 'none' || !lineFlat ? (
|
return lang === 'none' || !lineFlat || !(lang in Prism.languages) ? (
|
||||||
lineFlat
|
lineFlat
|
||||||
) : (
|
) : (
|
||||||
<span
|
<span
|
||||||
|
|
|
@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
|
||||||
}
|
}
|
||||||
|
|
||||||
const navAlert = (
|
const navAlert = (
|
||||||
<Link to="/usage/v3-5" noLinkLayout>
|
<Link to="/usage/v3-6" noLinkLayout>
|
||||||
<strong>💥 Out now:</strong> spaCy v3.5
|
<strong>💥 Out now:</strong> spaCy v3.6
|
||||||
</Link>
|
</Link>
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user