mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Add edit tree lemmatizer (#10231)
* Add edit tree lemmatizer Co-authored-by: Daniël de Kok <me@danieldk.eu> * Hide edit tree lemmatizer labels * Use relative imports * Switch to single quotes in error message * Type annotation fixes Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Reformat edit_tree_lemmatizer with black * EditTreeLemmatizer.predict: take Iterable Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Validate edit trees during deserialization This change also changes the serialized representation. Rather than mirroring the deep C structure, we use a simple flat union of the match and substitution node types. * Move edit_trees to _edit_tree_internals * Fix invalid edit tree format error message * edit_tree_lemmatizer: remove outdated TODO comment * Rename factory name to trainable_lemmatizer * Ignore type instead of casting truths to List[Union[Ints1d, Floats2d, List[int], List[str]]] for thinc v8.0.14 * Switch to Tagger.v2 * Add documentation for EditTreeLemmatizer * docs: Fix 3.2 -> 3.3 somewhere * trainable_lemmatizer documentation fixes * docs: EditTreeLemmatizer is in edit_tree_lemmatizer.py Co-authored-by: Daniël de Kok <me@danieldk.eu> Co-authored-by: Daniël de Kok <me@github.danieldk.eu> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
98ed941c39
commit
85778dfcf4
1
setup.py
1
setup.py
|
@ -33,6 +33,7 @@ MOD_NAMES = [
|
|||
"spacy.ml.parser_model",
|
||||
"spacy.morphology",
|
||||
"spacy.pipeline.dep_parser",
|
||||
"spacy.pipeline._edit_tree_internals.edit_trees",
|
||||
"spacy.pipeline.morphologizer",
|
||||
"spacy.pipeline.multitask",
|
||||
"spacy.pipeline.ner",
|
||||
|
|
|
@ -524,6 +524,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
||||
|
||||
# New errors added in v3.x
|
||||
E857 = ("Entry '{name}' not found in edit tree lemmatizer labels.")
|
||||
E858 = ("The {mode} vector table does not support this operation. "
|
||||
"{alternative}")
|
||||
E859 = ("The floret vector table cannot be modified.")
|
||||
|
@ -895,6 +896,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"patterns.")
|
||||
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
|
||||
"supported values are: 'I', 'O', 'B' and ''")
|
||||
E1026 = ("Edit tree has an invalid format:\n{errors}")
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from .attributeruler import AttributeRuler
|
||||
from .dep_parser import DependencyParser
|
||||
from .edit_tree_lemmatizer import EditTreeLemmatizer
|
||||
from .entity_linker import EntityLinker
|
||||
from .ner import EntityRecognizer
|
||||
from .entityruler import EntityRuler
|
||||
|
|
0
spacy/pipeline/_edit_tree_internals/__init__.py
Normal file
0
spacy/pipeline/_edit_tree_internals/__init__.py
Normal file
93
spacy/pipeline/_edit_tree_internals/edit_trees.pxd
Normal file
93
spacy/pipeline/_edit_tree_internals/edit_trees.pxd
Normal file
|
@ -0,0 +1,93 @@
|
|||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from libcpp.unordered_map cimport unordered_map
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from ...typedefs cimport attr_t, hash_t, len_t
|
||||
from ...strings cimport StringStore
|
||||
|
||||
cdef extern from "<algorithm>" namespace "std" nogil:
|
||||
void swap[T](T& a, T& b) except + # Only available in Cython 3.
|
||||
|
||||
# An edit tree (Müller et al., 2015) is a tree structure that consists of
|
||||
# edit operations. The two types of operations are string matches
|
||||
# and string substitutions. Given an input string s and an output string t,
|
||||
# subsitution and match nodes should be interpreted as follows:
|
||||
#
|
||||
# * Substitution node: consists of an original string and substitute string.
|
||||
# If s matches the original string, then t is the substitute. Otherwise,
|
||||
# the node does not apply.
|
||||
# * Match node: consists of a prefix length, suffix length, prefix edit tree,
|
||||
# and suffix edit tree. If s is composed of a prefix, middle part, and suffix
|
||||
# with the given suffix and prefix lengths, then t is the concatenation
|
||||
# prefix_tree(prefix) + middle + suffix_tree(suffix).
|
||||
#
|
||||
# For efficiency, we represent strings in substitution nodes as integers, with
|
||||
# the actual strings stored in a StringStore. Subtrees in match nodes are stored
|
||||
# as tree identifiers (rather than pointers) to simplify serialization.
|
||||
|
||||
cdef uint32_t NULL_TREE_ID
|
||||
|
||||
cdef struct MatchNodeC:
|
||||
len_t prefix_len
|
||||
len_t suffix_len
|
||||
uint32_t prefix_tree
|
||||
uint32_t suffix_tree
|
||||
|
||||
cdef struct SubstNodeC:
|
||||
attr_t orig
|
||||
attr_t subst
|
||||
|
||||
cdef union NodeC:
|
||||
MatchNodeC match_node
|
||||
SubstNodeC subst_node
|
||||
|
||||
cdef struct EditTreeC:
|
||||
bint is_match_node
|
||||
NodeC inner
|
||||
|
||||
cdef inline EditTreeC edittree_new_match(len_t prefix_len, len_t suffix_len,
|
||||
uint32_t prefix_tree, uint32_t suffix_tree):
|
||||
cdef MatchNodeC match_node = MatchNodeC(prefix_len=prefix_len,
|
||||
suffix_len=suffix_len, prefix_tree=prefix_tree,
|
||||
suffix_tree=suffix_tree)
|
||||
cdef NodeC inner = NodeC(match_node=match_node)
|
||||
return EditTreeC(is_match_node=True, inner=inner)
|
||||
|
||||
cdef inline EditTreeC edittree_new_subst(attr_t orig, attr_t subst):
|
||||
cdef EditTreeC node
|
||||
cdef SubstNodeC subst_node = SubstNodeC(orig=orig, subst=subst)
|
||||
cdef NodeC inner = NodeC(subst_node=subst_node)
|
||||
return EditTreeC(is_match_node=False, inner=inner)
|
||||
|
||||
cdef inline uint64_t edittree_hash(EditTreeC tree):
|
||||
cdef MatchNodeC match_node
|
||||
cdef SubstNodeC subst_node
|
||||
|
||||
if tree.is_match_node:
|
||||
match_node = tree.inner.match_node
|
||||
return hash((match_node.prefix_len, match_node.suffix_len, match_node.prefix_tree, match_node.suffix_tree))
|
||||
else:
|
||||
subst_node = tree.inner.subst_node
|
||||
return hash((subst_node.orig, subst_node.subst))
|
||||
|
||||
cdef struct LCS:
|
||||
int source_begin
|
||||
int source_end
|
||||
int target_begin
|
||||
int target_end
|
||||
|
||||
cdef inline bint lcs_is_empty(LCS lcs):
|
||||
return lcs.source_begin == 0 and lcs.source_end == 0 and lcs.target_begin == 0 and lcs.target_end == 0
|
||||
|
||||
cdef class EditTrees:
|
||||
cdef vector[EditTreeC] trees
|
||||
cdef unordered_map[hash_t, uint32_t] map
|
||||
cdef StringStore strings
|
||||
|
||||
cpdef uint32_t add(self, str form, str lemma)
|
||||
cpdef str apply(self, uint32_t tree_id, str form)
|
||||
cpdef unicode tree_to_str(self, uint32_t tree_id)
|
||||
|
||||
cdef uint32_t _add(self, str form, str lemma)
|
||||
cdef _apply(self, uint32_t tree_id, str form_part, list lemma_pieces)
|
||||
cdef uint32_t _tree_id(self, EditTreeC tree)
|
305
spacy/pipeline/_edit_tree_internals/edit_trees.pyx
Normal file
305
spacy/pipeline/_edit_tree_internals/edit_trees.pyx
Normal file
|
@ -0,0 +1,305 @@
|
|||
# cython: infer_types=True, binding=True
|
||||
from cython.operator cimport dereference as deref
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.stdint cimport UINT32_MAX
|
||||
from libc.string cimport memset
|
||||
from libcpp.pair cimport pair
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from ...typedefs cimport hash_t
|
||||
|
||||
from ... import util
|
||||
from ...errors import Errors
|
||||
from ...strings import StringStore
|
||||
from .schemas import validate_edit_tree
|
||||
|
||||
|
||||
NULL_TREE_ID = UINT32_MAX
|
||||
|
||||
cdef LCS find_lcs(str source, str target):
|
||||
"""
|
||||
Find the longest common subsequence (LCS) between two strings. If there are
|
||||
multiple LCSes, only one of them is returned.
|
||||
|
||||
source (str): The first string.
|
||||
target (str): The second string.
|
||||
RETURNS (LCS): The spans of the longest common subsequences.
|
||||
"""
|
||||
cdef Py_ssize_t source_len = len(source)
|
||||
cdef Py_ssize_t target_len = len(target)
|
||||
cdef size_t longest_align = 0;
|
||||
cdef int source_idx, target_idx
|
||||
cdef LCS lcs
|
||||
cdef Py_UCS4 source_cp, target_cp
|
||||
|
||||
memset(&lcs, 0, sizeof(lcs))
|
||||
|
||||
cdef vector[size_t] prev_aligns = vector[size_t](target_len);
|
||||
cdef vector[size_t] cur_aligns = vector[size_t](target_len);
|
||||
|
||||
for (source_idx, source_cp) in enumerate(source):
|
||||
for (target_idx, target_cp) in enumerate(target):
|
||||
if source_cp == target_cp:
|
||||
if source_idx == 0 or target_idx == 0:
|
||||
cur_aligns[target_idx] = 1
|
||||
else:
|
||||
cur_aligns[target_idx] = prev_aligns[target_idx - 1] + 1
|
||||
|
||||
# Check if this is the longest alignment and replace previous
|
||||
# best alignment when this is the case.
|
||||
if cur_aligns[target_idx] > longest_align:
|
||||
longest_align = cur_aligns[target_idx]
|
||||
lcs.source_begin = source_idx - longest_align + 1
|
||||
lcs.source_end = source_idx + 1
|
||||
lcs.target_begin = target_idx - longest_align + 1
|
||||
lcs.target_end = target_idx + 1
|
||||
else:
|
||||
# No match, we start with a zero-length alignment.
|
||||
cur_aligns[target_idx] = 0
|
||||
swap(prev_aligns, cur_aligns)
|
||||
|
||||
return lcs
|
||||
|
||||
cdef class EditTrees:
|
||||
"""Container for constructing and storing edit trees."""
|
||||
def __init__(self, strings: StringStore):
|
||||
"""Create a container for edit trees.
|
||||
|
||||
strings (StringStore): the string store to use."""
|
||||
self.strings = strings
|
||||
|
||||
cpdef uint32_t add(self, str form, str lemma):
|
||||
"""Add an edit tree that rewrites the given string into the given lemma.
|
||||
|
||||
RETURNS (int): identifier of the edit tree in the container.
|
||||
"""
|
||||
# Treat two empty strings as a special case. Generating an edit
|
||||
# tree for identical strings results in a match node. However,
|
||||
# since two empty strings have a zero-length LCS, a substitution
|
||||
# node would be created. Since we do not want to clutter the
|
||||
# recursive tree construction with logic for this case, handle
|
||||
# it in this wrapper method.
|
||||
if len(form) == 0 and len(lemma) == 0:
|
||||
tree = edittree_new_match(0, 0, NULL_TREE_ID, NULL_TREE_ID)
|
||||
return self._tree_id(tree)
|
||||
|
||||
return self._add(form, lemma)
|
||||
|
||||
cdef uint32_t _add(self, str form, str lemma):
|
||||
cdef LCS lcs = find_lcs(form, lemma)
|
||||
|
||||
cdef EditTreeC tree
|
||||
cdef uint32_t tree_id, prefix_tree, suffix_tree
|
||||
if lcs_is_empty(lcs):
|
||||
tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
|
||||
else:
|
||||
# If we have a non-empty LCS, such as "gooi" in "ge[gooi]d" and "[gooi]en",
|
||||
# create edit trees for the prefix pair ("ge"/"") and the suffix pair ("d"/"en").
|
||||
prefix_tree = NULL_TREE_ID
|
||||
if lcs.source_begin != 0 or lcs.target_begin != 0:
|
||||
prefix_tree = self.add(form[:lcs.source_begin], lemma[:lcs.target_begin])
|
||||
|
||||
suffix_tree = NULL_TREE_ID
|
||||
if lcs.source_end != len(form) or lcs.target_end != len(lemma):
|
||||
suffix_tree = self.add(form[lcs.source_end:], lemma[lcs.target_end:])
|
||||
|
||||
tree = edittree_new_match(lcs.source_begin, len(form) - lcs.source_end, prefix_tree, suffix_tree)
|
||||
|
||||
return self._tree_id(tree)
|
||||
|
||||
cdef uint32_t _tree_id(self, EditTreeC tree):
|
||||
# If this tree has been constructed before, return its identifier.
|
||||
cdef hash_t hash = edittree_hash(tree)
|
||||
cdef unordered_map[hash_t, uint32_t].iterator iter = self.map.find(hash)
|
||||
if iter != self.map.end():
|
||||
return deref(iter).second
|
||||
|
||||
# The tree hasn't been seen before, store it.
|
||||
cdef uint32_t tree_id = self.trees.size()
|
||||
self.trees.push_back(tree)
|
||||
self.map.insert(pair[hash_t, uint32_t](hash, tree_id))
|
||||
|
||||
return tree_id
|
||||
|
||||
cpdef str apply(self, uint32_t tree_id, str form):
|
||||
"""Apply an edit tree to a form.
|
||||
|
||||
tree_id (uint32_t): the identifier of the edit tree to apply.
|
||||
form (str): the form to apply the edit tree to.
|
||||
RETURNS (str): the transformer form or None if the edit tree
|
||||
could not be applied to the form.
|
||||
"""
|
||||
if tree_id >= self.trees.size():
|
||||
raise IndexError("Edit tree identifier out of range")
|
||||
|
||||
lemma_pieces = []
|
||||
try:
|
||||
self._apply(tree_id, form, lemma_pieces)
|
||||
except ValueError:
|
||||
return None
|
||||
return "".join(lemma_pieces)
|
||||
|
||||
cdef _apply(self, uint32_t tree_id, str form_part, list lemma_pieces):
|
||||
"""Recursively apply an edit tree to a form, adding pieces to
|
||||
the lemma_pieces list."""
|
||||
assert tree_id <= self.trees.size()
|
||||
|
||||
cdef EditTreeC tree = self.trees[tree_id]
|
||||
cdef MatchNodeC match_node
|
||||
cdef int suffix_start
|
||||
|
||||
if tree.is_match_node:
|
||||
match_node = tree.inner.match_node
|
||||
|
||||
if match_node.prefix_len + match_node.suffix_len > len(form_part):
|
||||
raise ValueError("Edit tree cannot be applied to form")
|
||||
|
||||
suffix_start = len(form_part) - match_node.suffix_len
|
||||
|
||||
if match_node.prefix_tree != NULL_TREE_ID:
|
||||
self._apply(match_node.prefix_tree, form_part[:match_node.prefix_len], lemma_pieces)
|
||||
|
||||
lemma_pieces.append(form_part[match_node.prefix_len:suffix_start])
|
||||
|
||||
if match_node.suffix_tree != NULL_TREE_ID:
|
||||
self._apply(match_node.suffix_tree, form_part[suffix_start:], lemma_pieces)
|
||||
else:
|
||||
if form_part == self.strings[tree.inner.subst_node.orig]:
|
||||
lemma_pieces.append(self.strings[tree.inner.subst_node.subst])
|
||||
else:
|
||||
raise ValueError("Edit tree cannot be applied to form")
|
||||
|
||||
cpdef unicode tree_to_str(self, uint32_t tree_id):
|
||||
"""Return the tree as a string. The tree tree string is formatted
|
||||
like an S-expression. This is primarily useful for debugging. Match
|
||||
nodes have the following format:
|
||||
|
||||
(m prefix_len suffix_len prefix_tree suffix_tree)
|
||||
|
||||
Substitution nodes have the following format:
|
||||
|
||||
(s original substitute)
|
||||
|
||||
tree_id (uint32_t): the identifier of the edit tree.
|
||||
RETURNS (str): the tree as an S-expression.
|
||||
"""
|
||||
|
||||
if tree_id >= self.trees.size():
|
||||
raise IndexError("Edit tree identifier out of range")
|
||||
|
||||
cdef EditTreeC tree = self.trees[tree_id]
|
||||
cdef SubstNodeC subst_node
|
||||
|
||||
if not tree.is_match_node:
|
||||
subst_node = tree.inner.subst_node
|
||||
return f"(s '{self.strings[subst_node.orig]}' '{self.strings[subst_node.subst]}')"
|
||||
|
||||
cdef MatchNodeC match_node = tree.inner.match_node
|
||||
|
||||
prefix_tree = "()"
|
||||
if match_node.prefix_tree != NULL_TREE_ID:
|
||||
prefix_tree = self.tree_to_str(match_node.prefix_tree)
|
||||
|
||||
suffix_tree = "()"
|
||||
if match_node.suffix_tree != NULL_TREE_ID:
|
||||
suffix_tree = self.tree_to_str(match_node.suffix_tree)
|
||||
|
||||
return f"(m {match_node.prefix_len} {match_node.suffix_len} {prefix_tree} {suffix_tree})"
|
||||
|
||||
def from_json(self, trees: list) -> "EditTrees":
|
||||
self.trees.clear()
|
||||
|
||||
for tree in trees:
|
||||
tree = _dict2tree(tree)
|
||||
self.trees.push_back(tree)
|
||||
|
||||
self._rebuild_tree_map()
|
||||
|
||||
def from_bytes(self, bytes_data: bytes, *) -> "EditTrees":
|
||||
def deserialize_trees(tree_dicts):
|
||||
cdef EditTreeC c_tree
|
||||
for tree_dict in tree_dicts:
|
||||
c_tree = _dict2tree(tree_dict)
|
||||
self.trees.push_back(c_tree)
|
||||
|
||||
deserializers = {}
|
||||
deserializers["trees"] = lambda n: deserialize_trees(n)
|
||||
util.from_bytes(bytes_data, deserializers, [])
|
||||
|
||||
self._rebuild_tree_map()
|
||||
|
||||
return self
|
||||
|
||||
def to_bytes(self, **kwargs) -> bytes:
|
||||
tree_dicts = []
|
||||
for tree in self.trees:
|
||||
tree = _tree2dict(tree)
|
||||
tree_dicts.append(tree)
|
||||
|
||||
serializers = {}
|
||||
serializers["trees"] = lambda: tree_dicts
|
||||
|
||||
return util.to_bytes(serializers, [])
|
||||
|
||||
def to_disk(self, path, **kwargs) -> "EditTrees":
|
||||
path = util.ensure_path(path)
|
||||
with path.open("wb") as file_:
|
||||
file_.write(self.to_bytes())
|
||||
|
||||
def from_disk(self, path, **kwargs) -> "EditTrees":
|
||||
path = util.ensure_path(path)
|
||||
if path.exists():
|
||||
with path.open("rb") as file_:
|
||||
data = file_.read()
|
||||
return self.from_bytes(data)
|
||||
|
||||
return self
|
||||
|
||||
def __getitem__(self, idx):
|
||||
return _tree2dict(self.trees[idx])
|
||||
|
||||
def __len__(self):
|
||||
return self.trees.size()
|
||||
|
||||
def _rebuild_tree_map(self):
|
||||
"""Rebuild the tree hash -> tree id mapping"""
|
||||
cdef EditTreeC c_tree
|
||||
cdef uint32_t tree_id
|
||||
cdef hash_t tree_hash
|
||||
|
||||
self.map.clear()
|
||||
|
||||
for tree_id in range(self.trees.size()):
|
||||
c_tree = self.trees[tree_id]
|
||||
tree_hash = edittree_hash(c_tree)
|
||||
self.map.insert(pair[hash_t, uint32_t](tree_hash, tree_id))
|
||||
|
||||
def __reduce__(self):
|
||||
return (unpickle_edittrees, (self.strings, self.to_bytes()))
|
||||
|
||||
|
||||
def unpickle_edittrees(strings, trees_data):
|
||||
return EditTrees(strings).from_bytes(trees_data)
|
||||
|
||||
|
||||
def _tree2dict(tree):
|
||||
if tree["is_match_node"]:
|
||||
tree = tree["inner"]["match_node"]
|
||||
else:
|
||||
tree = tree["inner"]["subst_node"]
|
||||
return(dict(tree))
|
||||
|
||||
def _dict2tree(tree):
|
||||
errors = validate_edit_tree(tree)
|
||||
if errors:
|
||||
raise ValueError(Errors.E1026.format(errors="\n".join(errors)))
|
||||
|
||||
tree = dict(tree)
|
||||
if "prefix_len" in tree:
|
||||
tree = {"is_match_node": True, "inner": {"match_node": tree}}
|
||||
else:
|
||||
tree = {"is_match_node": False, "inner": {"subst_node": tree}}
|
||||
|
||||
return tree
|
44
spacy/pipeline/_edit_tree_internals/schemas.py
Normal file
44
spacy/pipeline/_edit_tree_internals/schemas.py
Normal file
|
@ -0,0 +1,44 @@
|
|||
from typing import Any, Dict, List, Union
|
||||
from collections import defaultdict
|
||||
from pydantic import BaseModel, Field, ValidationError
|
||||
from pydantic.types import StrictBool, StrictInt, StrictStr
|
||||
|
||||
|
||||
class MatchNodeSchema(BaseModel):
|
||||
prefix_len: StrictInt = Field(..., title="Prefix length")
|
||||
suffix_len: StrictInt = Field(..., title="Suffix length")
|
||||
prefix_tree: StrictInt = Field(..., title="Prefix tree")
|
||||
suffix_tree: StrictInt = Field(..., title="Suffix tree")
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
|
||||
class SubstNodeSchema(BaseModel):
|
||||
orig: Union[int, StrictStr] = Field(..., title="Original substring")
|
||||
subst: Union[int, StrictStr] = Field(..., title="Replacement substring")
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
|
||||
class EditTreeSchema(BaseModel):
|
||||
__root__: Union[MatchNodeSchema, SubstNodeSchema]
|
||||
|
||||
|
||||
def validate_edit_tree(obj: Dict[str, Any]) -> List[str]:
|
||||
"""Validate edit tree.
|
||||
|
||||
obj (Dict[str, Any]): JSON-serializable data to validate.
|
||||
RETURNS (List[str]): A list of error messages, if available.
|
||||
"""
|
||||
try:
|
||||
EditTreeSchema.parse_obj(obj)
|
||||
return []
|
||||
except ValidationError as e:
|
||||
errors = e.errors()
|
||||
data = defaultdict(list)
|
||||
for error in errors:
|
||||
err_loc = " -> ".join([str(p) for p in error.get("loc", [])])
|
||||
data[err_loc].append(error.get("msg"))
|
||||
return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] # type: ignore[arg-type]
|
379
spacy/pipeline/edit_tree_lemmatizer.py
Normal file
379
spacy/pipeline/edit_tree_lemmatizer.py
Normal file
|
@ -0,0 +1,379 @@
|
|||
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
|
||||
from typing import Sequence, Tuple, Union
|
||||
from collections import Counter
|
||||
from copy import deepcopy
|
||||
from itertools import islice
|
||||
import numpy as np
|
||||
|
||||
import srsly
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints1d, Ints2d
|
||||
|
||||
from ._edit_tree_internals.edit_trees import EditTrees
|
||||
from ._edit_tree_internals.schemas import validate_edit_tree
|
||||
from .lemmatizer import lemmatizer_score
|
||||
from .trainable_pipe import TrainablePipe
|
||||
from ..errors import Errors
|
||||
from ..language import Language
|
||||
from ..tokens import Doc
|
||||
from ..training import Example, validate_examples, validate_get_examples
|
||||
from ..vocab import Vocab
|
||||
from .. import util
|
||||
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
pretrained_vectors = null
|
||||
width = 96
|
||||
depth = 4
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
"""
|
||||
DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"trainable_lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
requires=[],
|
||||
default_config={
|
||||
"model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
|
||||
"backoff": "orth",
|
||||
"min_tree_freq": 3,
|
||||
"overwrite": False,
|
||||
"top_k": 1,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_edit_tree_lemmatizer(
|
||||
nlp: Language,
|
||||
name: str,
|
||||
model: Model,
|
||||
backoff: Optional[str],
|
||||
min_tree_freq: int,
|
||||
overwrite: bool,
|
||||
top_k: int,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Construct an EditTreeLemmatizer component."""
|
||||
return EditTreeLemmatizer(
|
||||
nlp.vocab,
|
||||
model,
|
||||
name,
|
||||
backoff=backoff,
|
||||
min_tree_freq=min_tree_freq,
|
||||
overwrite=overwrite,
|
||||
top_k=top_k,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
class EditTreeLemmatizer(TrainablePipe):
|
||||
"""
|
||||
Lemmatizer that lemmatizes each word using a predicted edit tree.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Vocab,
|
||||
model: Model,
|
||||
name: str = "trainable_lemmatizer",
|
||||
*,
|
||||
backoff: Optional[str] = "orth",
|
||||
min_tree_freq: int = 3,
|
||||
overwrite: bool = False,
|
||||
top_k: int = 1,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
):
|
||||
"""
|
||||
Construct an edit tree lemmatizer.
|
||||
|
||||
backoff (Optional[str]): backoff to use when the predicted edit trees
|
||||
are not applicable. Must be an attribute of Token or None (leave the
|
||||
lemma unset).
|
||||
min_tree_freq (int): prune trees that are applied less than this
|
||||
frequency in the training data.
|
||||
overwrite (bool): overwrite existing lemma annotations.
|
||||
top_k (int): try to apply at most the k most probable edit trees.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.name = name
|
||||
self.backoff = backoff
|
||||
self.min_tree_freq = min_tree_freq
|
||||
self.overwrite = overwrite
|
||||
self.top_k = top_k
|
||||
|
||||
self.trees = EditTrees(self.vocab.strings)
|
||||
self.tree2label: Dict[int, int] = {}
|
||||
|
||||
self.cfg: Dict[str, Any] = {"labels": []}
|
||||
self.scorer = scorer
|
||||
|
||||
def get_loss(
|
||||
self, examples: Iterable[Example], scores: List[Floats2d]
|
||||
) -> Tuple[float, List[Floats2d]]:
|
||||
validate_examples(examples, "EditTreeLemmatizer.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
|
||||
|
||||
truths = []
|
||||
for eg in examples:
|
||||
eg_truths = []
|
||||
for (predicted, gold_lemma) in zip(
|
||||
eg.predicted, eg.get_aligned("LEMMA", as_string=True)
|
||||
):
|
||||
if gold_lemma is None:
|
||||
label = -1
|
||||
else:
|
||||
tree_id = self.trees.add(predicted.text, gold_lemma)
|
||||
label = self.tree2label.get(tree_id, 0)
|
||||
eg_truths.append(label)
|
||||
|
||||
truths.append(eg_truths)
|
||||
|
||||
d_scores, loss = loss_func(scores, truths) # type: ignore
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError(Errors.E910.format(name=self.name))
|
||||
|
||||
return float(loss), d_scores
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
|
||||
n_docs = len(list(docs))
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
n_labels = len(self.cfg["labels"])
|
||||
guesses: List[Ints2d] = [
|
||||
self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
|
||||
]
|
||||
assert len(guesses) == n_docs
|
||||
return guesses
|
||||
scores = self.model.predict(docs)
|
||||
assert len(scores) == n_docs
|
||||
guesses = self._scores2guesses(docs, scores)
|
||||
assert len(guesses) == n_docs
|
||||
return guesses
|
||||
|
||||
def _scores2guesses(self, docs, scores):
|
||||
guesses = []
|
||||
for doc, doc_scores in zip(docs, scores):
|
||||
if self.top_k == 1:
|
||||
doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
|
||||
else:
|
||||
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
|
||||
|
||||
if not isinstance(doc_guesses, np.ndarray):
|
||||
doc_guesses = doc_guesses.get()
|
||||
|
||||
doc_compat_guesses = []
|
||||
for token, candidates in zip(doc, doc_guesses):
|
||||
tree_id = -1
|
||||
for candidate in candidates:
|
||||
candidate_tree_id = self.cfg["labels"][candidate]
|
||||
|
||||
if self.trees.apply(candidate_tree_id, token.text) is not None:
|
||||
tree_id = candidate_tree_id
|
||||
break
|
||||
doc_compat_guesses.append(tree_id)
|
||||
|
||||
guesses.append(np.array(doc_compat_guesses))
|
||||
|
||||
return guesses
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
|
||||
for i, doc in enumerate(docs):
|
||||
doc_tree_ids = batch_tree_ids[i]
|
||||
if hasattr(doc_tree_ids, "get"):
|
||||
doc_tree_ids = doc_tree_ids.get()
|
||||
for j, tree_id in enumerate(doc_tree_ids):
|
||||
if self.overwrite or doc[j].lemma == 0:
|
||||
# If no applicable tree could be found during prediction,
|
||||
# the special identifier -1 is used. Otherwise the tree
|
||||
# is guaranteed to be applicable.
|
||||
if tree_id == -1:
|
||||
if self.backoff is not None:
|
||||
doc[j].lemma = getattr(doc[j], self.backoff)
|
||||
else:
|
||||
lemma = self.trees.apply(tree_id, doc[j].text)
|
||||
doc[j].lemma_ = lemma
|
||||
|
||||
@property
|
||||
def labels(self) -> Tuple[int, ...]:
|
||||
"""Returns the labels currently added to the component."""
|
||||
return tuple(self.cfg["labels"])
|
||||
|
||||
@property
|
||||
def hide_labels(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def label_data(self) -> Dict:
|
||||
trees = []
|
||||
for tree_id in range(len(self.trees)):
|
||||
tree = self.trees[tree_id]
|
||||
if "orig" in tree:
|
||||
tree["orig"] = self.vocab.strings[tree["orig"]]
|
||||
if "subst" in tree:
|
||||
tree["subst"] = self.vocab.strings[tree["subst"]]
|
||||
trees.append(tree)
|
||||
return dict(trees=trees, labels=tuple(self.cfg["labels"]))
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
labels: Optional[Dict] = None,
|
||||
):
|
||||
validate_get_examples(get_examples, "EditTreeLemmatizer.initialize")
|
||||
|
||||
if labels is None:
|
||||
self._labels_from_data(get_examples)
|
||||
else:
|
||||
self._add_labels(labels)
|
||||
|
||||
# Sample for the model.
|
||||
doc_sample = []
|
||||
label_sample = []
|
||||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.x)
|
||||
gold_labels: List[List[float]] = []
|
||||
for token in example.reference:
|
||||
if token.lemma == 0:
|
||||
gold_label = None
|
||||
else:
|
||||
gold_label = self._pair2label(token.text, token.lemma_)
|
||||
|
||||
gold_labels.append(
|
||||
[
|
||||
1.0 if label == gold_label else 0.0
|
||||
for label in self.cfg["labels"]
|
||||
]
|
||||
)
|
||||
|
||||
gold_labels = cast(Floats2d, gold_labels)
|
||||
label_sample.append(self.model.ops.asarray(gold_labels, dtype="float32"))
|
||||
|
||||
self._require_labels()
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
|
||||
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
||||
deserializers = {
|
||||
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
||||
"model": lambda b: self.model.from_bytes(b),
|
||||
"vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
|
||||
"trees": lambda b: self.trees.from_bytes(b),
|
||||
}
|
||||
|
||||
util.from_bytes(bytes_data, deserializers, exclude)
|
||||
|
||||
return self
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
|
||||
serializers = {
|
||||
"cfg": lambda: srsly.json_dumps(self.cfg),
|
||||
"model": lambda: self.model.to_bytes(),
|
||||
"vocab": lambda: self.vocab.to_bytes(exclude=exclude),
|
||||
"trees": lambda: self.trees.to_bytes(),
|
||||
}
|
||||
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def to_disk(self, path, exclude=tuple()):
|
||||
path = util.ensure_path(path)
|
||||
serializers = {
|
||||
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
||||
"model": lambda p: self.model.to_disk(p),
|
||||
"vocab": lambda p: self.vocab.to_disk(p, exclude=exclude),
|
||||
"trees": lambda p: self.trees.to_disk(p),
|
||||
}
|
||||
util.to_disk(path, serializers, exclude)
|
||||
|
||||
def from_disk(self, path, exclude=tuple()):
|
||||
def load_model(p):
|
||||
try:
|
||||
with open(p, "rb") as mfile:
|
||||
self.model.from_bytes(mfile.read())
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
deserializers = {
|
||||
"cfg": lambda p: self.cfg.update(srsly.read_json(p)),
|
||||
"model": load_model,
|
||||
"vocab": lambda p: self.vocab.from_disk(p, exclude=exclude),
|
||||
"trees": lambda p: self.trees.from_disk(p),
|
||||
}
|
||||
|
||||
util.from_disk(path, deserializers, exclude)
|
||||
return self
|
||||
|
||||
def _add_labels(self, labels: Dict):
|
||||
if "labels" not in labels:
|
||||
raise ValueError(Errors.E857.format(name="labels"))
|
||||
if "trees" not in labels:
|
||||
raise ValueError(Errors.E857.format(name="trees"))
|
||||
|
||||
self.cfg["labels"] = list(labels["labels"])
|
||||
trees = []
|
||||
for tree in labels["trees"]:
|
||||
errors = validate_edit_tree(tree)
|
||||
if errors:
|
||||
raise ValueError(Errors.E1026.format(errors="\n".join(errors)))
|
||||
|
||||
tree = dict(tree)
|
||||
if "orig" in tree:
|
||||
tree["orig"] = self.vocab.strings[tree["orig"]]
|
||||
if "orig" in tree:
|
||||
tree["subst"] = self.vocab.strings[tree["subst"]]
|
||||
|
||||
trees.append(tree)
|
||||
|
||||
self.trees.from_json(trees)
|
||||
|
||||
for label, tree in enumerate(self.labels):
|
||||
self.tree2label[tree] = label
|
||||
|
||||
def _labels_from_data(self, get_examples: Callable[[], Iterable[Example]]):
|
||||
# Count corpus tree frequencies in ad-hoc storage to avoid cluttering
|
||||
# the final pipe/string store.
|
||||
vocab = Vocab()
|
||||
trees = EditTrees(vocab.strings)
|
||||
tree_freqs: Counter = Counter()
|
||||
repr_pairs: Dict = {}
|
||||
for example in get_examples():
|
||||
for token in example.reference:
|
||||
if token.lemma != 0:
|
||||
tree_id = trees.add(token.text, token.lemma_)
|
||||
tree_freqs[tree_id] += 1
|
||||
repr_pairs[tree_id] = (token.text, token.lemma_)
|
||||
|
||||
# Construct trees that make the frequency cut-off using representative
|
||||
# form - token pairs.
|
||||
for tree_id, freq in tree_freqs.items():
|
||||
if freq >= self.min_tree_freq:
|
||||
form, lemma = repr_pairs[tree_id]
|
||||
self._pair2label(form, lemma, add_label=True)
|
||||
|
||||
def _pair2label(self, form, lemma, add_label=False):
|
||||
"""
|
||||
Look up the edit tree identifier for a form/label pair. If the edit
|
||||
tree is unknown and "add_label" is set, the edit tree will be added to
|
||||
the labels.
|
||||
"""
|
||||
tree_id = self.trees.add(form, lemma)
|
||||
if tree_id not in self.tree2label:
|
||||
if not add_label:
|
||||
return None
|
||||
|
||||
self.tree2label[tree_id] = len(self.cfg["labels"])
|
||||
self.cfg["labels"].append(tree_id)
|
||||
return self.tree2label[tree_id]
|
280
spacy/tests/pipeline/test_edit_tree_lemmatizer.py
Normal file
280
spacy/tests/pipeline/test_edit_tree_lemmatizer.py
Normal file
|
@ -0,0 +1,280 @@
|
|||
import pickle
|
||||
import pytest
|
||||
from hypothesis import given
|
||||
import hypothesis.strategies as st
|
||||
from spacy import util
|
||||
from spacy.lang.en import English
|
||||
from spacy.language import Language
|
||||
from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
|
||||
from spacy.training import Example
|
||||
from spacy.strings import StringStore
|
||||
from spacy.util import make_tempdir
|
||||
|
||||
|
||||
TRAIN_DATA = [
|
||||
("She likes green eggs", {"lemmas": ["she", "like", "green", "egg"]}),
|
||||
("Eat blue ham", {"lemmas": ["eat", "blue", "ham"]}),
|
||||
]
|
||||
|
||||
PARTIAL_DATA = [
|
||||
# partial annotation
|
||||
("She likes green eggs", {"lemmas": ["", "like", "green", ""]}),
|
||||
# misaligned partial annotation
|
||||
(
|
||||
"He hates green eggs",
|
||||
{
|
||||
"words": ["He", "hat", "es", "green", "eggs"],
|
||||
"lemmas": ["", "hat", "e", "green", ""],
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def test_initialize_examples():
|
||||
nlp = Language()
|
||||
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
# you shouldn't really call this more than once, but for testing it should be fine
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.initialize(get_examples=lambda: None)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.initialize(get_examples=lambda: train_examples[0])
|
||||
with pytest.raises(TypeError):
|
||||
nlp.initialize(get_examples=lambda: [])
|
||||
with pytest.raises(TypeError):
|
||||
nlp.initialize(get_examples=train_examples)
|
||||
|
||||
|
||||
def test_initialize_from_labels():
|
||||
nlp = Language()
|
||||
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
|
||||
lemmatizer.min_tree_freq = 1
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
nlp2 = Language()
|
||||
lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
|
||||
lemmatizer2.initialize(
|
||||
get_examples=lambda: train_examples,
|
||||
labels=lemmatizer.label_data,
|
||||
)
|
||||
assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
|
||||
|
||||
|
||||
def test_no_data():
|
||||
# Test that the lemmatizer provides a nice error when there's no tagging data / labels
|
||||
TEXTCAT_DATA = [
|
||||
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
|
||||
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
|
||||
]
|
||||
nlp = English()
|
||||
nlp.add_pipe("trainable_lemmatizer")
|
||||
nlp.add_pipe("textcat")
|
||||
|
||||
train_examples = []
|
||||
for t in TEXTCAT_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
|
||||
def test_incomplete_data():
|
||||
# Test that the lemmatizer works with incomplete information
|
||||
nlp = English()
|
||||
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
|
||||
lemmatizer.min_tree_freq = 1
|
||||
train_examples = []
|
||||
for t in PARTIAL_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
for i in range(50):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
assert losses["trainable_lemmatizer"] < 0.00001
|
||||
|
||||
# test the trained model
|
||||
test_text = "She likes blue eggs"
|
||||
doc = nlp(test_text)
|
||||
assert doc[1].lemma_ == "like"
|
||||
assert doc[2].lemma_ == "blue"
|
||||
|
||||
|
||||
def test_overfitting_IO():
|
||||
nlp = English()
|
||||
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
|
||||
lemmatizer.min_tree_freq = 1
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
for i in range(50):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
assert losses["trainable_lemmatizer"] < 0.00001
|
||||
|
||||
test_text = "She likes blue eggs"
|
||||
doc = nlp(test_text)
|
||||
assert doc[0].lemma_ == "she"
|
||||
assert doc[1].lemma_ == "like"
|
||||
assert doc[2].lemma_ == "blue"
|
||||
assert doc[3].lemma_ == "egg"
|
||||
|
||||
# Check model after a {to,from}_disk roundtrip
|
||||
with util.make_tempdir() as tmp_dir:
|
||||
nlp.to_disk(tmp_dir)
|
||||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
doc2 = nlp2(test_text)
|
||||
assert doc2[0].lemma_ == "she"
|
||||
assert doc2[1].lemma_ == "like"
|
||||
assert doc2[2].lemma_ == "blue"
|
||||
assert doc2[3].lemma_ == "egg"
|
||||
|
||||
# Check model after a {to,from}_bytes roundtrip
|
||||
nlp_bytes = nlp.to_bytes()
|
||||
nlp3 = English()
|
||||
nlp3.add_pipe("trainable_lemmatizer")
|
||||
nlp3.from_bytes(nlp_bytes)
|
||||
doc3 = nlp3(test_text)
|
||||
assert doc3[0].lemma_ == "she"
|
||||
assert doc3[1].lemma_ == "like"
|
||||
assert doc3[2].lemma_ == "blue"
|
||||
assert doc3[3].lemma_ == "egg"
|
||||
|
||||
# Check model after a pickle roundtrip.
|
||||
nlp_bytes = pickle.dumps(nlp)
|
||||
nlp4 = pickle.loads(nlp_bytes)
|
||||
doc4 = nlp4(test_text)
|
||||
assert doc4[0].lemma_ == "she"
|
||||
assert doc4[1].lemma_ == "like"
|
||||
assert doc4[2].lemma_ == "blue"
|
||||
assert doc4[3].lemma_ == "egg"
|
||||
|
||||
|
||||
def test_lemmatizer_requires_labels():
|
||||
nlp = English()
|
||||
nlp.add_pipe("trainable_lemmatizer")
|
||||
with pytest.raises(ValueError):
|
||||
nlp.initialize()
|
||||
|
||||
|
||||
def test_lemmatizer_label_data():
|
||||
nlp = English()
|
||||
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
|
||||
lemmatizer.min_tree_freq = 1
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
nlp2 = English()
|
||||
lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
|
||||
lemmatizer2.initialize(
|
||||
get_examples=lambda: train_examples, labels=lemmatizer.label_data
|
||||
)
|
||||
|
||||
# Verify that the labels and trees are the same.
|
||||
assert lemmatizer.labels == lemmatizer2.labels
|
||||
assert lemmatizer.trees.to_bytes() == lemmatizer2.trees.to_bytes()
|
||||
|
||||
|
||||
def test_dutch():
|
||||
strings = StringStore()
|
||||
trees = EditTrees(strings)
|
||||
tree = trees.add("deelt", "delen")
|
||||
assert trees.tree_to_str(tree) == "(m 0 3 () (m 0 2 (s '' 'l') (s 'lt' 'n')))"
|
||||
|
||||
tree = trees.add("gedeeld", "delen")
|
||||
assert (
|
||||
trees.tree_to_str(tree) == "(m 2 3 (s 'ge' '') (m 0 2 (s '' 'l') (s 'ld' 'n')))"
|
||||
)
|
||||
|
||||
|
||||
def test_from_to_bytes():
|
||||
strings = StringStore()
|
||||
trees = EditTrees(strings)
|
||||
trees.add("deelt", "delen")
|
||||
trees.add("gedeeld", "delen")
|
||||
|
||||
b = trees.to_bytes()
|
||||
|
||||
trees2 = EditTrees(strings)
|
||||
trees2.from_bytes(b)
|
||||
|
||||
# Verify that the nodes did not change.
|
||||
assert len(trees) == len(trees2)
|
||||
for i in range(len(trees)):
|
||||
assert trees.tree_to_str(i) == trees2.tree_to_str(i)
|
||||
|
||||
# Reinserting the same trees should not add new nodes.
|
||||
trees2.add("deelt", "delen")
|
||||
trees2.add("gedeeld", "delen")
|
||||
assert len(trees) == len(trees2)
|
||||
|
||||
|
||||
def test_from_to_disk():
|
||||
strings = StringStore()
|
||||
trees = EditTrees(strings)
|
||||
trees.add("deelt", "delen")
|
||||
trees.add("gedeeld", "delen")
|
||||
|
||||
trees2 = EditTrees(strings)
|
||||
with make_tempdir() as temp_dir:
|
||||
trees_file = temp_dir / "edit_trees.bin"
|
||||
trees.to_disk(trees_file)
|
||||
trees2 = trees2.from_disk(trees_file)
|
||||
|
||||
# Verify that the nodes did not change.
|
||||
assert len(trees) == len(trees2)
|
||||
for i in range(len(trees)):
|
||||
assert trees.tree_to_str(i) == trees2.tree_to_str(i)
|
||||
|
||||
# Reinserting the same trees should not add new nodes.
|
||||
trees2.add("deelt", "delen")
|
||||
trees2.add("gedeeld", "delen")
|
||||
assert len(trees) == len(trees2)
|
||||
|
||||
|
||||
@given(st.text(), st.text())
|
||||
def test_roundtrip(form, lemma):
|
||||
strings = StringStore()
|
||||
trees = EditTrees(strings)
|
||||
tree = trees.add(form, lemma)
|
||||
assert trees.apply(tree, form) == lemma
|
||||
|
||||
|
||||
@given(st.text(alphabet="ab"), st.text(alphabet="ab"))
|
||||
def test_roundtrip_small_alphabet(form, lemma):
|
||||
# Test with small alphabets to have more overlap.
|
||||
strings = StringStore()
|
||||
trees = EditTrees(strings)
|
||||
tree = trees.add(form, lemma)
|
||||
assert trees.apply(tree, form) == lemma
|
||||
|
||||
|
||||
def test_unapplicable_trees():
|
||||
strings = StringStore()
|
||||
trees = EditTrees(strings)
|
||||
tree3 = trees.add("deelt", "delen")
|
||||
|
||||
# Replacement fails.
|
||||
assert trees.apply(tree3, "deeld") == None
|
||||
|
||||
# Suffix + prefix are too large.
|
||||
assert trees.apply(tree3, "de") == None
|
||||
|
||||
|
||||
def test_empty_strings():
|
||||
strings = StringStore()
|
||||
trees = EditTrees(strings)
|
||||
no_change = trees.add("xyz", "xyz")
|
||||
empty = trees.add("", "")
|
||||
assert no_change == empty
|
409
website/docs/api/edittreelemmatizer.md
Normal file
409
website/docs/api/edittreelemmatizer.md
Normal file
|
@ -0,0 +1,409 @@
|
|||
---
|
||||
title: EditTreeLemmatizer
|
||||
tag: class
|
||||
source: spacy/pipeline/edit_tree_lemmatizer.py
|
||||
new: 3.3
|
||||
teaser: 'Pipeline component for lemmatization'
|
||||
api_base_class: /api/pipe
|
||||
api_string_name: trainable_lemmatizer
|
||||
api_trainable: true
|
||||
---
|
||||
|
||||
A trainable component for assigning base forms to tokens. This lemmatizer uses
|
||||
**edit trees** to transform tokens into base forms. The lemmatization model
|
||||
predicts which edit tree is applicable to a token. The edit tree data structure
|
||||
and construction method used by this lemmatizer were proposed in
|
||||
[Joint Lemmatization and Morphological Tagging with Lemming](https://aclanthology.org/D15-1272.pdf)
|
||||
(Thomas Müller et al., 2015).
|
||||
|
||||
For a lookup and rule-based lemmatizer, see [`Lemmatizer`](/api/lemmatizer).
|
||||
|
||||
## Assigned Attributes {#assigned-attributes}
|
||||
|
||||
Predictions are assigned to `Token.lemma`.
|
||||
|
||||
| Location | Value |
|
||||
| -------------- | ------------------------- |
|
||||
| `Token.lemma` | The lemma (hash). ~~int~~ |
|
||||
| `Token.lemma_` | The lemma. ~~str~~ |
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
The default config is defined by the pipeline component factory and describes
|
||||
how the component should be configured. You can override its settings via the
|
||||
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||
[`config.cfg` for training](/usage/training#config). See the
|
||||
[model architectures](/api/architectures) documentation for details on the
|
||||
architectures and their arguments and hyperparameters.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.pipeline.edit_tree_lemmatizer import DEFAULT_EDIT_TREE_LEMMATIZER_MODEL
|
||||
> config = {"model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL}
|
||||
> nlp.add_pipe("trainable_lemmatizer", config=config, name="lemmatizer")
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `backoff` | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~ |
|
||||
| `min_tree_freq` | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~ |
|
||||
| `overwrite` | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
|
||||
| `top_k` | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~ |
|
||||
| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/edit_tree_lemmatizer.py
|
||||
```
|
||||
|
||||
## EditTreeLemmatizer.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> # Construction via add_pipe with default model
|
||||
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
|
||||
>
|
||||
> # Construction via create_pipe with custom model
|
||||
> config = {"model": {"@architectures": "my_tagger"}}
|
||||
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", config=config, name="lemmatizer")
|
||||
>
|
||||
> # Construction from class
|
||||
> from spacy.pipeline import EditTreeLemmatizer
|
||||
> lemmatizer = EditTreeLemmatizer(nlp.vocab, model)
|
||||
> ```
|
||||
|
||||
Create a new pipeline instance. In your application, you would normally use a
|
||||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Description |
|
||||
| --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `backoff` | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~ |
|
||||
| `min_tree_freq` | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~ |
|
||||
| `overwrite` | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
|
||||
| `top_k` | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~ |
|
||||
| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~ |
|
||||
|
||||
## EditTreeLemmatizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Apply the pipe to one document. The document is modified in place, and returned.
|
||||
This usually happens under the hood when the `nlp` object is called on a text
|
||||
and all pipeline components are applied to the `Doc` in order. Both
|
||||
[`__call__`](/api/edittreelemmatizer#call) and
|
||||
[`pipe`](/api/edittreelemmatizer#pipe) delegate to the
|
||||
[`predict`](/api/edittreelemmatizer#predict) and
|
||||
[`set_annotations`](/api/edittreelemmatizer#set_annotations) methods.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc = nlp("This is a sentence.")
|
||||
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
|
||||
> # This usually happens under the hood
|
||||
> processed = lemmatizer(doc)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## EditTreeLemmatizer.pipe {#pipe tag="method"}
|
||||
|
||||
Apply the pipe to a stream of documents. This usually happens under the hood
|
||||
when the `nlp` object is called on a text and all pipeline components are
|
||||
applied to the `Doc` in order. Both [`__call__`](/api/edittreelemmatizer#call)
|
||||
and [`pipe`](/api/edittreelemmatizer#pipe) delegate to the
|
||||
[`predict`](/api/edittreelemmatizer#predict) and
|
||||
[`set_annotations`](/api/edittreelemmatizer#set_annotations) methods.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
|
||||
> for doc in lemmatizer.pipe(docs, batch_size=50):
|
||||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## EditTreeLemmatizer.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
used to **initialize the model** of the component and can either be the full
|
||||
training data or a representative sample. Initialization includes validating the
|
||||
network,
|
||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||
setting up the label scheme based on the data. This method is typically called
|
||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
|
||||
> lemmatizer.initialize(lambda: [], nlp=nlp)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.lemmatizer]
|
||||
>
|
||||
> [initialize.components.lemmatizer.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/lemmatizer.json
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
|
||||
|
||||
## EditTreeLemmatizer.predict {#predict tag="method"}
|
||||
|
||||
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
|
||||
modifying them.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
|
||||
> tree_ids = lemmatizer.predict([doc1, doc2])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | The model's prediction for each document. |
|
||||
|
||||
## EditTreeLemmatizer.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed tree
|
||||
identifiers.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
|
||||
> tree_ids = lemmatizer.predict([doc1, doc2])
|
||||
> lemmatizer.set_annotations([doc1, doc2], tree_ids)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------------------------------------------------------- |
|
||||
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
|
||||
| `tree_ids` | The identifiers of the edit trees to apply, produced by `EditTreeLemmatizer.predict`. |
|
||||
|
||||
## EditTreeLemmatizer.update {#update tag="method"}
|
||||
|
||||
Learn from a batch of [`Example`](/api/example) objects containing the
|
||||
predictions and gold-standard annotations, and update the component's model.
|
||||
Delegates to [`predict`](/api/edittreelemmatizer#predict) and
|
||||
[`get_loss`](/api/edittreelemmatizer#get_loss).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
|
||||
> optimizer = nlp.initialize()
|
||||
> losses = lemmatizer.update(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## EditTreeLemmatizer.get_loss {#get_loss tag="method"}
|
||||
|
||||
Find the loss and gradient of loss for the batch of documents and their
|
||||
predicted scores.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
|
||||
> scores = lemmatizer.model.begin_update([eg.predicted for eg in examples])
|
||||
> loss, d_loss = lemmatizer.get_loss(examples, scores)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------- |
|
||||
| `examples` | The batch of examples. ~~Iterable[Example]~~ |
|
||||
| `scores` | Scores representing the model's predictions. |
|
||||
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
|
||||
|
||||
## EditTreeLemmatizer.create_optimizer {#create_optimizer tag="method"}

Create an optimizer for the pipeline component.

> #### Example
>
> ```python
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
> optimizer = lemmatizer.create_optimizer()
> ```

| Name        | Description                  |
| ----------- | ---------------------------- |
| **RETURNS** | The optimizer. ~~Optimizer~~ |

## EditTreeLemmatizer.use_params {#use_params tag="method, contextmanager"}

Modify the pipe's model, to use the given parameter values. At the end of the
context, the original parameters are restored.

> #### Example
>
> ```python
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
> with lemmatizer.use_params(optimizer.averages):
>     lemmatizer.to_disk("/best_model")
> ```

| Name     | Description                                        |
| -------- | -------------------------------------------------- |
| `params` | The parameter values to use in the model. ~~dict~~ |

## EditTreeLemmatizer.to_disk {#to_disk tag="method"}

Serialize the pipe to disk.

> #### Example
>
> ```python
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
> lemmatizer.to_disk("/path/to/lemmatizer")
> ```

| Name           | Description                                                                                                                                |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path`         | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ |                                                                                                                                            |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~                                                |

## EditTreeLemmatizer.from_disk {#from_disk tag="method"}

Load the pipe from disk. Modifies the object in place and returns it.

> #### Example
>
> ```python
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
> lemmatizer.from_disk("/path/to/lemmatizer")
> ```

| Name           | Description                                                                                      |
| -------------- | ------------------------------------------------------------------------------------------------ |
| `path`         | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ |                                                                                                  |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~     |
| **RETURNS**    | The modified `EditTreeLemmatizer` object. ~~EditTreeLemmatizer~~                                |

## EditTreeLemmatizer.to_bytes {#to_bytes tag="method"}

> #### Example
>
> ```python
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
> lemmatizer_bytes = lemmatizer.to_bytes()
> ```

Serialize the pipe to a bytestring.

| Name           | Description                                                                                   |
| -------------- | ----------------------------------------------------------------------------------------------- |
| _keyword-only_ |                                                                                               |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~  |
| **RETURNS**    | The serialized form of the `EditTreeLemmatizer` object. ~~bytes~~                            |

## EditTreeLemmatizer.from_bytes {#from_bytes tag="method"}

Load the pipe from a bytestring. Modifies the object in place and returns it.

> #### Example
>
> ```python
> lemmatizer_bytes = lemmatizer.to_bytes()
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
> lemmatizer.from_bytes(lemmatizer_bytes)
> ```

| Name           | Description                                                                                   |
| -------------- | ----------------------------------------------------------------------------------------------- |
| `bytes_data`   | The data to load from. ~~bytes~~                                                              |
| _keyword-only_ |                                                                                               |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~  |
| **RETURNS**    | The `EditTreeLemmatizer` object. ~~EditTreeLemmatizer~~                                      |

## EditTreeLemmatizer.labels {#labels tag="property"}

The labels currently added to the component.

<Infobox variant="warning" title="Interpretability of the labels">

The `EditTreeLemmatizer` labels are not useful by themselves, since they are
identifiers of edit trees.

</Infobox>

| Name        | Description                                            |
| ----------- | ------------------------------------------------------ |
| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |

## EditTreeLemmatizer.label_data {#label_data tag="property" new="3"}

The labels currently added to the component and their internal meta information.
This is the data generated by [`init labels`](/api/cli#init-labels) and used by
[`EditTreeLemmatizer.initialize`](/api/edittreelemmatizer#initialize) to
initialize the model with a pre-defined label set.

> #### Example
>
> ```python
> labels = lemmatizer.label_data
> lemmatizer.initialize(lambda: [], nlp=nlp, labels=labels)
> ```

| Name        | Description                                                |
| ----------- | ---------------------------------------------------------- |
| **RETURNS** | The label data added to the component. ~~Tuple[str, ...]~~ |

## Serialization fields {#serialization-fields}

During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.

> #### Example
>
> ```python
> data = lemmatizer.to_disk("/path", exclude=["vocab"])
> ```

| Name    | Description                                                     |
| ------- | --------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab).                               |
| `cfg`   | The config file. You usually don't want to exclude this.        |
| `model` | The binary model data. You usually don't want to exclude this.  |
| `trees` | The edit trees. You usually don't want to exclude this.         |

@@ -9,14 +9,15 @@ api_trainable: false
---

Component for assigning base forms to tokens using rules based on part-of-speech
tags, or lookup tables. Functionality to train the component is coming soon.
Different [`Language`](/api/language) subclasses can implement their own
lemmatizer components via
tags, or lookup tables. Different [`Language`](/api/language) subclasses can
implement their own lemmatizer components via
[language-specific factories](/usage/processing-pipelines#factories-language).
The default data used is provided by the
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
extension package.

For a trainable lemmatizer, see [`EditTreeLemmatizer`](/api/edittreelemmatizer).

<Infobox variant="warning" title="New in v3.0">

As of v3.0, the `Lemmatizer` is a **standalone pipeline component** that can be

@@ -45,10 +45,11 @@ components for different language processing tasks and also allows adding
| ----------------------------------------------- | --------------------------------------------------------------------------------------------- |
| [`AttributeRuler`](/api/attributeruler)         | Set token attributes using matcher rules.                                                      |
| [`DependencyParser`](/api/dependencyparser)     | Predict syntactic dependencies.                                                                |
| [`EditTreeLemmatizer`](/api/edittreelemmatizer) | Predict base forms of words.                                                                   |
| [`EntityLinker`](/api/entitylinker)             | Disambiguate named entities to nodes in a knowledge base.                                      |
| [`EntityRecognizer`](/api/entityrecognizer)     | Predict named entities, e.g. persons or products.                                              |
| [`EntityRuler`](/api/entityruler)               | Add entity spans to the `Doc` using token-based rules or exact phrase matches.                 |
| [`Lemmatizer`](/api/lemmatizer)                 | Determine the base forms of words.                                                             |
| [`Lemmatizer`](/api/lemmatizer)                 | Determine the base forms of words using rules and lookups.                                     |
| [`Morphologizer`](/api/morphologizer)           | Predict morphological features and coarse-grained part-of-speech tags.                         |
| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries.                                                                   |
| [`Sentencizer`](/api/sentencizer)               | Implement rule-based sentence boundary detection that doesn't require the dependency parse.    |

@@ -120,10 +120,13 @@ print(doc[2].pos_) # 'PRON'

## Lemmatization {#lemmatization model="lemmatizer" new="3"}

The [`Lemmatizer`](/api/lemmatizer) is a pipeline component that provides lookup
and rule-based lemmatization methods in a configurable component. An individual
language can extend the `Lemmatizer` as part of its
[language data](#language-data).
spaCy provides two pipeline components for lemmatization:

1. The [`Lemmatizer`](/api/lemmatizer) component provides lookup and rule-based
   lemmatization methods in a configurable component. An individual language can
   extend the `Lemmatizer` as part of its [language data](#language-data).
2. The [`EditTreeLemmatizer`](/api/edittreelemmatizer)
   <Tag variant="new">3.3</Tag> component provides a trainable lemmatizer.

```python
### {executable="true"}

@@ -197,6 +200,20 @@ information, without consulting the context of the token. The rule-based
lemmatizer also accepts list-based exception files. For English, these are
acquired from [WordNet](https://wordnet.princeton.edu/).

### Trainable lemmatizer

The [`EditTreeLemmatizer`](/api/edittreelemmatizer) can learn form-to-lemma
transformations from a training corpus that includes lemma annotations. This
removes the need to write language-specific rules and can (in many cases)
provide higher accuracies than lookup and rule-based lemmatizers.

```python
import spacy

nlp = spacy.blank("de")
nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
```

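A minimal sketch of training this component directly in Python is shown below.
The toy German sentences and their lemma annotations are made up for
illustration only; in practice you would usually train the component with
[`spacy train`](/api/cli#train) on a corpus that includes lemma annotations.

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("de")
lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")

# Toy training data: each Example pairs a Doc with its gold-standard lemmas.
train_examples = [
    Example.from_dict(nlp.make_doc("Ich mag Katzen"), {"lemmas": ["ich", "mögen", "Katze"]}),
    Example.from_dict(nlp.make_doc("Wir mögen Hunde"), {"lemmas": ["wir", "mögen", "Hund"]}),
]

# Initializing builds the edit tree label set from the examples; a few update
# steps then fit the model (far too little data for reliable predictions).
optimizer = nlp.initialize(get_examples=lambda: train_examples)
for _ in range(25):
    losses = lemmatizer.update(train_examples, sgd=optimizer, losses={})

doc = nlp("Ich mag Hunde")
print([token.lemma_ for token in doc])
```
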
## Dependency Parsing {#dependency-parse model="parser"}

spaCy features a fast and accurate syntactic dependency parser, and has a rich

@@ -1189,7 +1206,7 @@ class WhitespaceTokenizer:
            spaces = spaces[0:-1]
        else:
            spaces[-1] = False


        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.blank("en")

@@ -1269,8 +1286,8 @@ hyperparameters, pipeline and tokenizer used for constructing and training the
pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that
takes the `nlp` object and returns a tokenizer. Here, we're registering a
function called `whitespace_tokenizer` in the
[`@tokenizers` registry](/api/top-level#registry). To make sure spaCy knows how to
construct your tokenizer during training, you can pass in your Python file by
[`@tokenizers` registry](/api/top-level#registry). To make sure spaCy knows how
to construct your tokenizer during training, you can pass in your Python file by
setting `--code functions.py` when you run [`spacy train`](/api/cli#train).

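As a rough sketch of the registration the paragraph above describes (the
function body here is only illustrative and simpler than the guide's own
`WhitespaceTokenizer` example):

```python
# functions.py
import spacy
from spacy.tokens import Doc

@spacy.registry.tokenizers("whitespace_tokenizer")
def create_whitespace_tokenizer():
    def create_tokenizer(nlp):
        # Return any callable that turns a text into a Doc.
        def tokenize(text):
            words = text.split()  # split on whitespace, dropping empty strings
            return Doc(nlp.vocab, words=words)
        return tokenize
    return create_tokenizer
```
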
> #### config.cfg

@@ -303,22 +303,23 @@ available pipeline components and component functions.
> ruler = nlp.add_pipe("entity_ruler")
> ```

| String name          | Component                                            | Description                                                                                 |
| -------------------- | ---------------------------------------------------- | -------------------------------------------------------------------------------------------- |
| `tagger`             | [`Tagger`](/api/tagger)                              | Assign part-of-speech-tags.                                                                 |
| `parser`             | [`DependencyParser`](/api/dependencyparser)          | Assign dependency labels.                                                                   |
| `ner`                | [`EntityRecognizer`](/api/entityrecognizer)          | Assign named entities.                                                                      |
| `entity_linker`      | [`EntityLinker`](/api/entitylinker)                  | Assign knowledge base IDs to named entities. Should be added after the entity recognizer.   |
| `entity_ruler`       | [`EntityRuler`](/api/entityruler)                    | Assign named entities based on pattern rules and dictionaries.                              |
| `textcat`            | [`TextCategorizer`](/api/textcategorizer)            | Assign text categories: exactly one category is predicted per document.                     |
| `textcat_multilabel` | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Assign text categories in a multi-label setting: zero, one or more labels per document.     |
| `lemmatizer`         | [`Lemmatizer`](/api/lemmatizer)                      | Assign base forms to words.                                                                 |
| `morphologizer`      | [`Morphologizer`](/api/morphologizer)                | Assign morphological features and coarse-grained POS tags.                                  |
| `attribute_ruler`    | [`AttributeRuler`](/api/attributeruler)              | Assign token attribute mappings and rule-based exceptions.                                  |
| `senter`             | [`SentenceRecognizer`](/api/sentencerecognizer)      | Assign sentence boundaries.                                                                 |
| `sentencizer`        | [`Sentencizer`](/api/sentencizer)                    | Add rule-based sentence segmentation without the dependency parse.                          |
| `tok2vec`            | [`Tok2Vec`](/api/tok2vec)                            | Assign token-to-vector embeddings.                                                          |
| `transformer`        | [`Transformer`](/api/transformer)                    | Assign the tokens and outputs of a transformer model.                                       |
| String name            | Component                                            | Description                                                                                 |
| ---------------------- | ---------------------------------------------------- | -------------------------------------------------------------------------------------------- |
| `tagger`               | [`Tagger`](/api/tagger)                              | Assign part-of-speech-tags.                                                                 |
| `parser`               | [`DependencyParser`](/api/dependencyparser)          | Assign dependency labels.                                                                   |
| `ner`                  | [`EntityRecognizer`](/api/entityrecognizer)          | Assign named entities.                                                                      |
| `entity_linker`        | [`EntityLinker`](/api/entitylinker)                  | Assign knowledge base IDs to named entities. Should be added after the entity recognizer.   |
| `entity_ruler`         | [`EntityRuler`](/api/entityruler)                    | Assign named entities based on pattern rules and dictionaries.                              |
| `textcat`              | [`TextCategorizer`](/api/textcategorizer)            | Assign text categories: exactly one category is predicted per document.                     |
| `textcat_multilabel`   | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Assign text categories in a multi-label setting: zero, one or more labels per document.     |
| `lemmatizer`           | [`Lemmatizer`](/api/lemmatizer)                      | Assign base forms to words using rules and lookups.                                         |
| `trainable_lemmatizer` | [`EditTreeLemmatizer`](/api/edittreelemmatizer)      | Assign base forms to words.                                                                 |
| `morphologizer`        | [`Morphologizer`](/api/morphologizer)                | Assign morphological features and coarse-grained POS tags.                                  |
| `attribute_ruler`      | [`AttributeRuler`](/api/attributeruler)              | Assign token attribute mappings and rule-based exceptions.                                  |
| `senter`               | [`SentenceRecognizer`](/api/sentencerecognizer)      | Assign sentence boundaries.                                                                 |
| `sentencizer`          | [`Sentencizer`](/api/sentencizer)                    | Add rule-based sentence segmentation without the dependency parse.                          |
| `tok2vec`              | [`Tok2Vec`](/api/tok2vec)                            | Assign token-to-vector embeddings.                                                          |
| `transformer`          | [`Transformer`](/api/transformer)                    | Assign the tokens and outputs of a transformer model.                                       |

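As a quick, illustrative sketch (not part of the table above): the string name
in the first column is what you pass to `nlp.add_pipe`, and the object you get
back is an instance of the component class listed next to it.

```python
import spacy
from spacy.pipeline import EditTreeLemmatizer

nlp = spacy.blank("en")
# "trainable_lemmatizer" is the registered string name for EditTreeLemmatizer
lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
assert isinstance(lemmatizer, EditTreeLemmatizer)
print(nlp.pipe_names)  # ['lemmatizer']
```
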
### Disabling, excluding and modifying components {#disabling}

@@ -93,6 +93,7 @@
      "items": [
        { "text": "AttributeRuler", "url": "/api/attributeruler" },
        { "text": "DependencyParser", "url": "/api/dependencyparser" },
        { "text": "EditTreeLemmatizer", "url": "/api/edittreelemmatizer" },
        { "text": "EntityLinker", "url": "/api/entitylinker" },
        { "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
        { "text": "EntityRuler", "url": "/api/entityruler" },