Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-06 05:10:21 +03:00)

Compare commits: master...v4.0.0.dev (67 commits)
Commits (SHA1):
6348a7a4b4
b052b1b47f
a183db3cef
5e297aa20e
c2f3e699ca
2c2e66e145
fc2723925b
6ff5eb256c
b2fd9490e3
a231bf65af
b510fbd0aa
326b541312
6852adc8b7
20b63943f5
d30ba9b7b8
2f08deea2a
207565a788
f9308aae13
ca75190a3d
f5aabaf7d6
d60997febb
6b9af38eeb
60379cec65
8267aa1b65
799d226676
04fea09ffd
e79910d57e
d0fc871a1c
68b8fa2df2
cae4589f5a
a4bd890f32
0e2b7fb28b
103b24fb25
446a3ecf34
c6704f368c
d4922f25fc
e3027c65b8
5157e4e823
efdbb722c5
60c050e82b
977b847cce
4a615cacd2
698b8b495f
98a916e01a
4bce8fa755
2a558a7cdc
1eb7ce5ef7
740c33fe58
8dd1fa9896
c44d243f25
bb0e178878
1a5be63715
d757dec5c4
551e73ccfc
5d54c0e32a
e581eeac34
b2d05f9f66
1ff683a50b
ba18d2913d
851a7ca4fa
1605ef7319
7f3842f54d
2f05c6824c
10b7223021
5586fd9311
0e71bd973f
75f7c15187
@@ -5,7 +5,7 @@ requires = [
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
    "thinc>=8.1.0,<8.2.0",
    "thinc>=9.0.0.dev2,<9.1.0",
    "numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"

@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.1.0,<8.2.0
thinc>=9.0.0.dev2,<9.1.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0
setup.cfg (12)

@@ -32,14 +32,6 @@ project_urls =
zip_safe = false
include_package_data = true
python_requires = >=3.6
setup_requires =
    cython>=0.25,<3.0
    numpy>=1.15.0
    # We also need our Cython packages here to compile against
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
    thinc>=8.1.0,<8.2.0
install_requires =
    # Our libraries
    spacy-legacy>=3.0.11,<3.1.0

@@ -47,7 +39,7 @@ install_requires =
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    thinc>=8.1.0,<8.2.0
    thinc>=9.0.0.dev2,<9.1.0
    wasabi>=0.9.1,<1.2.0
    srsly>=2.4.3,<3.0.0
    catalogue>=2.0.6,<2.1.0

@@ -120,7 +112,7 @@ ja =
    sudachipy>=0.5.2,!=0.6.1
    sudachidict_core>=20211220
ko =
    natto-py>=0.9.0
    mecab-ko>=1.0.0
th =
    pythainlp>=2.0
setup.py (11)

@@ -33,13 +33,10 @@ MOD_NAMES = [
    "spacy.kb.candidate",
    "spacy.kb.kb",
    "spacy.kb.kb_in_memory",
    "spacy.ml.parser_model",
    "spacy.ml.tb_framework",
    "spacy.morphology",
    "spacy.pipeline.dep_parser",
    "spacy.pipeline._edit_tree_internals.edit_trees",
    "spacy.pipeline.morphologizer",
    "spacy.pipeline.multitask",
    "spacy.pipeline.ner",
    "spacy.pipeline.pipe",
    "spacy.pipeline.trainable_pipe",
    "spacy.pipeline.sentencizer",

@@ -47,12 +44,15 @@ MOD_NAMES = [
    "spacy.pipeline.tagger",
    "spacy.pipeline.transition_parser",
    "spacy.pipeline._parser_internals.arc_eager",
    "spacy.pipeline._parser_internals.batch",
    "spacy.pipeline._parser_internals.ner",
    "spacy.pipeline._parser_internals.nonproj",
    "spacy.pipeline._parser_internals.search",
    "spacy.pipeline._parser_internals._state",
    "spacy.pipeline._parser_internals.stateclass",
    "spacy.pipeline._parser_internals.transition_system",
    "spacy.pipeline._parser_internals._beam_utils",
    "spacy.pipeline._parser_internals._parser_utils",
    "spacy.tokenizer",
    "spacy.training.align",
    "spacy.training.gold_io",

@@ -62,12 +62,13 @@ MOD_NAMES = [
    "spacy.tokens.span_group",
    "spacy.tokens.graph",
    "spacy.tokens.morphanalysis",
    "spacy.tokens._retokenize",
    "spacy.tokens.retokenizer",
    "spacy.matcher.matcher",
    "spacy.matcher.phrasematcher",
    "spacy.matcher.dependencymatcher",
    "spacy.symbols",
    "spacy.vectors",
    "spacy.tests.parser._search",
]
COMPILE_OPTIONS = {
    "msvc": ["/Ox", "/EHsc"],
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.5.0"
__version__ = "4.0.0.dev0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
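The hunk above bumps the package version string to the v4 prerelease. A minimal check, assuming this dev branch is installed in the environment:

import spacy
from spacy import about

# The string edited above is what the installed package reports at runtime.
print(about.__version__)   # expected: "4.0.0.dev0" on this branch
print(spacy.__version__)   # same value, re-exported at the top level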
spacy/attrs.pxd (129)

@@ -1,98 +1,49 @@
# Reserve 64 values for flag features
from . cimport symbols

cdef enum attr_id_t:
    NULL_ATTR
    IS_ALPHA
    IS_ASCII
    IS_DIGIT
    IS_LOWER
    IS_PUNCT
    IS_SPACE
    IS_TITLE
    IS_UPPER
    LIKE_URL
    LIKE_NUM
    LIKE_EMAIL
    IS_STOP
    IS_OOV_DEPRECATED
    IS_BRACKET
    IS_QUOTE
    IS_LEFT_PUNCT
    IS_RIGHT_PUNCT
    IS_CURRENCY
    NULL_ATTR = 0
    IS_ALPHA = symbols.IS_ALPHA
    IS_ASCII = symbols.IS_ASCII
    IS_DIGIT = symbols.IS_DIGIT
    IS_LOWER = symbols.IS_LOWER
    IS_PUNCT = symbols.IS_PUNCT
    IS_SPACE = symbols.IS_SPACE
    IS_TITLE = symbols.IS_TITLE
    IS_UPPER = symbols.IS_UPPER
    LIKE_URL = symbols.LIKE_URL
    LIKE_NUM = symbols.LIKE_NUM
    LIKE_EMAIL = symbols.LIKE_EMAIL
    IS_STOP = symbols.IS_STOP
    IS_BRACKET = symbols.IS_BRACKET
    IS_QUOTE = symbols.IS_QUOTE
    IS_LEFT_PUNCT = symbols.IS_LEFT_PUNCT
    IS_RIGHT_PUNCT = symbols.IS_RIGHT_PUNCT
    IS_CURRENCY = symbols.IS_CURRENCY

    FLAG19 = 19
    FLAG20
    FLAG21
    FLAG22
    FLAG23
    FLAG24
    FLAG25
    FLAG26
    FLAG27
    FLAG28
    FLAG29
    FLAG30
    FLAG31
    FLAG32
    FLAG33
    FLAG34
    FLAG35
    FLAG36
    FLAG37
    FLAG38
    FLAG39
    FLAG40
    FLAG41
    FLAG42
    FLAG43
    FLAG44
    FLAG45
    FLAG46
    FLAG47
    FLAG48
    FLAG49
    FLAG50
    FLAG51
    FLAG52
    FLAG53
    FLAG54
    FLAG55
    FLAG56
    FLAG57
    FLAG58
    FLAG59
    FLAG60
    FLAG61
    FLAG62
    FLAG63
    ID = symbols.ID
    ORTH = symbols.ORTH
    LOWER = symbols.LOWER
    NORM = symbols.NORM
    SHAPE = symbols.SHAPE
    PREFIX = symbols.PREFIX
    SUFFIX = symbols.SUFFIX

    ID
    ORTH
    LOWER
    NORM
    SHAPE
    PREFIX
    SUFFIX
    LENGTH = symbols.LENGTH
    CLUSTER = symbols.CLUSTER
    LEMMA = symbols.LEMMA
    POS = symbols.POS
    TAG = symbols.TAG
    DEP = symbols.DEP
    ENT_IOB = symbols.ENT_IOB
    ENT_TYPE = symbols.ENT_TYPE
    HEAD = symbols.HEAD
    SENT_START = symbols.SENT_START
    SPACY = symbols.SPACY
    PROB = symbols.PROB

    LENGTH
    CLUSTER
    LEMMA
    POS
    TAG
    DEP
    ENT_IOB
    ENT_TYPE
    HEAD
    SENT_START
    SPACY
    PROB

    LANG
    LANG = symbols.LANG
    ENT_KB_ID = symbols.ENT_KB_ID
    MORPH
    MORPH = symbols.MORPH
    ENT_ID = symbols.ENT_ID

    IDX
    SENT_END
    IDX = symbols.IDX
spacy/attrs.pyx (120)

@@ -16,57 +16,11 @@ IDS = {
    "LIKE_NUM": LIKE_NUM,
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
    "IS_BRACKET": IS_BRACKET,
    "IS_QUOTE": IS_QUOTE,
    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
    "IS_CURRENCY": IS_CURRENCY,
    "FLAG19": FLAG19,
    "FLAG20": FLAG20,
    "FLAG21": FLAG21,
    "FLAG22": FLAG22,
    "FLAG23": FLAG23,
    "FLAG24": FLAG24,
    "FLAG25": FLAG25,
    "FLAG26": FLAG26,
    "FLAG27": FLAG27,
    "FLAG28": FLAG28,
    "FLAG29": FLAG29,
    "FLAG30": FLAG30,
    "FLAG31": FLAG31,
    "FLAG32": FLAG32,
    "FLAG33": FLAG33,
    "FLAG34": FLAG34,
    "FLAG35": FLAG35,
    "FLAG36": FLAG36,
    "FLAG37": FLAG37,
    "FLAG38": FLAG38,
    "FLAG39": FLAG39,
    "FLAG40": FLAG40,
    "FLAG41": FLAG41,
    "FLAG42": FLAG42,
    "FLAG43": FLAG43,
    "FLAG44": FLAG44,
    "FLAG45": FLAG45,
    "FLAG46": FLAG46,
    "FLAG47": FLAG47,
    "FLAG48": FLAG48,
    "FLAG49": FLAG49,
    "FLAG50": FLAG50,
    "FLAG51": FLAG51,
    "FLAG52": FLAG52,
    "FLAG53": FLAG53,
    "FLAG54": FLAG54,
    "FLAG55": FLAG55,
    "FLAG56": FLAG56,
    "FLAG57": FLAG57,
    "FLAG58": FLAG58,
    "FLAG59": FLAG59,
    "FLAG60": FLAG60,
    "FLAG61": FLAG61,
    "FLAG62": FLAG62,
    "FLAG63": FLAG63,
    "ID": ID,
    "ORTH": ORTH,
    "LOWER": LOWER,
@@ -92,12 +46,11 @@ IDS = {
}


# ATTR IDs, in order of the symbol
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
NAMES = {v: k for k, v in IDS.items()}
locals().update(IDS)


def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
def intify_attrs(stringy_attrs, strings_map=None):
    """
    Normalize a dictionary of attributes, converting them to ints.
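The hunk above replaces the position-sorted NAMES list with a dict keyed by attribute ID. A minimal sketch of the behavioural difference, using made-up IDs rather than the real symbol values:

# Minimal sketch with hypothetical IDs (the real values now come from spacy.symbols):
# as a dict, NAMES supports reverse lookup even when the IDs are sparse.
IDS = {"ORTH": 65, "LEMMA": 73}           # hypothetical, non-contiguous IDs
NAMES = {v: k for k, v in IDS.items()}    # mirrors the new attrs.pyx line
assert NAMES[73] == "LEMMA"               # lookup by ID, no dense list required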
@@ -109,75 +62,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    converted to ints.
    """
    inty_attrs = {}
    if _do_deprecated:
        if "F" in stringy_attrs:
            stringy_attrs["ORTH"] = stringy_attrs.pop("F")
        if "L" in stringy_attrs:
            stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
        if "pos" in stringy_attrs:
            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
        if "morph" in stringy_attrs:
            morphs = stringy_attrs.pop("morph")
        if "number" in stringy_attrs:
            stringy_attrs.pop("number")
        if "tenspect" in stringy_attrs:
            stringy_attrs.pop("tenspect")
        morph_keys = [
            "PunctType",
            "PunctSide",
            "Other",
            "Degree",
            "AdvType",
            "Number",
            "VerbForm",
            "PronType",
            "Aspect",
            "Tense",
            "PartType",
            "Poss",
            "Hyph",
            "ConjType",
            "NumType",
            "Foreign",
            "VerbType",
            "NounType",
            "Gender",
            "Mood",
            "Negative",
            "Tense",
            "Voice",
            "Abbr",
            "Derivation",
            "Echo",
            "Foreign",
            "NameType",
            "NounType",
            "NumForm",
            "NumValue",
            "PartType",
            "Polite",
            "StyleVariant",
            "PronType",
            "AdjType",
            "Person",
            "Variant",
            "AdpType",
            "Reflex",
            "Negative",
            "Mood",
            "Aspect",
            "Case",
            "Polarity",
            "PrepCase",
            "Animacy",  # U20
        ]
        for key in morph_keys:
            if key in stringy_attrs:
                stringy_attrs.pop(key)
            elif key.lower() in stringy_attrs:
                stringy_attrs.pop(key.lower())
            elif key.upper() in stringy_attrs:
                stringy_attrs.pop(key.upper())
    for name, value in stringy_attrs.items():
        int_key = intify_attr(name)
        if int_key is not None:
@@ -8,7 +8,6 @@ from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
from ..util import is_package, get_minor_version, run_command
from ..util import is_prerelease_version
from ..errors import OLD_MODEL_SHORTCUTS


@app.command(

@@ -61,12 +60,6 @@ def download(
        version = components[-1]
    else:
        model_name = model
        if model in OLD_MODEL_SHORTCUTS:
            msg.warn(
                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
                f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
            )
            model_name = OLD_MODEL_SHORTCUTS[model]
    compatibility = get_compatibility()
    version = get_version(model_name, compatibility)
@@ -87,12 +87,11 @@ grad_factor = 1.0
factory = "parser"

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = false
nO = null

[components.parser.model.tok2vec]

@@ -108,12 +107,11 @@ grad_factor = 1.0
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]

@@ -314,12 +312,11 @@ width = ${components.tok2vec.model.encode.width}
factory = "parser"

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null

[components.parser.model.tok2vec]

@@ -332,12 +329,11 @@ width = ${components.tok2vec.model.encode.width}
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@@ -131,13 +131,6 @@ class Warnings(metaclass=ErrorsWithCodes):
            "and make it independent. For example, `replace_listeners = "
            "[\"model.tok2vec\"]` See the documentation for details: "
            "https://spacy.io/usage/training#config-components-listeners")
    W088 = ("The pipeline component {name} implements a `begin_training` "
            "method, which won't be called by spaCy. As of v3.0, `begin_training` "
            "has been renamed to `initialize`, so you likely want to rename the "
            "component method. See the documentation for details: "
            "https://spacy.io/api/language#initialize")
    W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
            "to `nlp.initialize`.")
    W090 = ("Could not locate any {format} files in path '{path}'.")
    W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
    W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")

@@ -216,6 +209,8 @@ class Warnings(metaclass=ErrorsWithCodes):
            "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
    W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")

    W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")


class Errors(metaclass=ErrorsWithCodes):
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

@@ -251,9 +246,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "https://spacy.io/usage/models")
    E011 = ("Unknown operator: '{op}'. Options: {opts}")
    E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
    E016 = ("MultitaskObjective target should be function or one of: dep, "
            "tag, ent, dep_tag_offset, ent_tag.")
    E017 = ("Can only add unicode or bytes. Got type: {value_type}")
    E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
    E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
            "refers to an issue with the `Vocab` or `StringStore`.")
    E019 = ("Can't create transition with unknown action ID: {action}. Action "

@@ -466,13 +459,13 @@ class Errors(metaclass=ErrorsWithCodes):
            "same, but found '{nlp}' and '{vocab}' respectively.")
    E152 = ("The attribute {attr} is not supported for token patterns. "
            "Please use the option `validate=True` with the Matcher, PhraseMatcher, "
            "EntityRuler or AttributeRuler for more details.")
            "SpanRuler or AttributeRuler for more details.")
    E153 = ("The value type {vtype} is not supported for token patterns. "
            "Please use the option validate=True with Matcher, PhraseMatcher, "
            "EntityRuler or AttributeRuler for more details.")
            "SpanRuler or AttributeRuler for more details.")
    E154 = ("One of the attributes or values is not supported for token "
            "patterns. Please use the option `validate=True` with the Matcher, "
            "PhraseMatcher, or EntityRuler for more details.")
            "PhraseMatcher, or SpanRuler for more details.")
    E155 = ("The pipeline needs to include a {pipe} in order to use "
            "Matcher or PhraseMatcher with the attribute {attr}. "
            "Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "

@@ -496,7 +489,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "Current DocBin: {current}\nOther DocBin: {other}")
    E169 = ("Can't find module: {module}")
    E170 = ("Cannot apply transition {name}: invalid for the current state.")
    E171 = ("Matcher.add received invalid 'on_match' callback argument: expected "
    E171 = ("{name}.add received invalid 'on_match' callback argument: expected "
            "callable or None, but got: {arg_type}")
    E175 = ("Can't remove rule for unknown match pattern ID: {key}")
    E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")

@@ -733,13 +726,6 @@ class Errors(metaclass=ErrorsWithCodes):
            "method in component '{name}'. If you want to use this "
            "method, make sure it's overwritten on the subclass.")
    E940 = ("Found NaN values in scores.")
    E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
            "model from a shortcut, which is obsolete as of spaCy v3.0. To "
            "load the model, use its full name instead:\n\n"
            "nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
            "models, see the models directory: https://spacy.io/models. If you "
            "want to create a blank model, use spacy.blank: "
            "nlp = spacy.blank(\"{name}\")")
    E942 = ("Executing `after_{name}` callback failed. Expected the function to "
            "return an initialized nlp object but got: {value}. Maybe "
            "you forgot to return the modified object in your function?")

@@ -753,7 +739,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "loaded nlp object, but got: {source}")
    E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
            "a string value from {expected} but got: '{arg}'")
    E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
    E948 = ("`{name}.add` received invalid 'patterns' argument: expected "
            "a list, but got: {arg_type}")
    E949 = ("Unable to align tokens for the predicted and reference docs. It "
            "is only possible to align the docs when both texts are the same "

@@ -927,8 +913,6 @@ class Errors(metaclass=ErrorsWithCodes):
    E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
             "Non-UD tags should use the `tag` property.")
    E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
             "exist.")
    E1024 = ("A pattern with {attr_type} '{label}' is not present in "
             "'{component}' patterns.")
    E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "

@@ -969,14 +953,14 @@ class Errors(metaclass=ErrorsWithCodes):
    E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port)` "
             "or use `auto_switch_port=True` to pick an available port automatically.")


# Deprecated model shortcuts, only used in errors and warnings
OLD_MODEL_SHORTCUTS = {
    "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
    "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
    "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
    "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
}
    # v4 error strings
    E4000 = ("Expected a Doc as input, but got: '{type}'")
    E4001 = ("Expected input to be one of the following types: ({expected_types}), "
             "but got '{received_type}'")
    E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
    E4003 = ("Training examples for distillation must have the exact same tokens in the "
             "reference and predicted docs.")
    E4004 = ("Backprop is not supported when is_train is not set.")


# fmt: on
@@ -18,34 +18,23 @@ DEFAULT_CONFIG = """

[nlp.tokenizer]
@tokenizers = "spacy.ko.KoreanTokenizer"
mecab_args = ""
"""


@registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer():
def create_tokenizer(mecab_args: str):
    def korean_tokenizer_factory(nlp):
        return KoreanTokenizer(nlp.vocab)
        return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)

    return korean_tokenizer_factory


class KoreanTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab):
    def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
        self.vocab = vocab
        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
        self._mecab_tokenizer = None

    @property
    def mecab_tokenizer(self):
        # This is a property so that initializing a pipeline with blank:ko is
        # possible without actually requiring mecab-ko, e.g. to run
        # `spacy init vectors ko` for a pipeline that will have a different
        # tokenizer in the end. The languages need to match for the vectors
        # to be imported and there's no way to pass a custom config to
        # `init vectors`.
        if self._mecab_tokenizer is None:
            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
        return self._mecab_tokenizer
        mecab = try_mecab_import()
        self.mecab_tokenizer = mecab.Tagger(mecab_args)

    def __reduce__(self):
        return KoreanTokenizer, (self.vocab,)
@@ -68,13 +57,15 @@ class KoreanTokenizer(DummyTokenizer):
    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
            if node.is_eos():
        for line in self.mecab_tokenizer.parse(text).split("\n"):
            if line == "EOS":
                break
            surface = node.surface
            feature = node.feature
            tag, _, expr = feature.partition(",")
            lemma, _, remainder = expr.partition("/")
            surface, _, expr = line.partition("\t")
            features = expr.split("/")[0].split(",")
            tag = features[0]
            lemma = "*"
            if len(features) >= 8:
                lemma = features[7]
            if lemma == "*":
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}
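The new branch parses mecab-ko's plain-text output instead of natto-py node objects. A hedged illustration of the parsing logic above, using a made-up output line in the "surface<TAB>features" shape the code expects (the real field layout is defined by mecab-ko-dic):

# Illustrative only: the line contents are invented, the logic mirrors detailed_tokens.
line = "spaCy\tSL,*,*,*,*,*,*,*"
surface, _, expr = line.partition("\t")
features = expr.split("/")[0].split(",")
tag = features[0]                                   # "SL"
lemma = features[7] if len(features) >= 8 else "*"
if lemma == "*":
    lemma = surface                                 # fall back to the surface form
print({"surface": surface, "lemma": lemma, "tag": tag})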
@@ -97,20 +88,94 @@ class Korean(Language):
    Defaults = KoreanDefaults


def try_mecab_import() -> None:
def try_mecab_import():
    try:
        from natto import MeCab
        import mecab_ko as MeCab

        return MeCab
    except ImportError:
        raise ImportError(
            'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
            "and [natto-py](https://github.com/buruzaemon/natto-py)"
            "the python package `mecab-ko`: pip install mecab-ko"
        ) from None


@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
def create_natto_tokenizer():
    def korean_natto_tokenizer_factory(nlp):
        return KoreanNattoTokenizer(nlp.vocab)

    return korean_natto_tokenizer_factory


class KoreanNattoTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab):
        self.vocab = vocab
        self._mecab = self._try_mecab_import()  # type: ignore[func-returns-value]
        self._mecab_tokenizer = None

    @property
    def mecab_tokenizer(self):
        # This is a property so that initializing a pipeline with blank:ko is
        # possible without actually requiring mecab-ko, e.g. to run
        # `spacy init vectors ko` for a pipeline that will have a different
        # tokenizer in the end. The languages need to match for the vectors
        # to be imported and there's no way to pass a custom config to
        # `init vectors`.
        if self._mecab_tokenizer is None:
            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
        return self._mecab_tokenizer

    def __reduce__(self):
        return KoreanNattoTokenizer, (self.vocab,)

    def __call__(self, text: str) -> Doc:
        dtokens = list(self.detailed_tokens(text))
        surfaces = [dt["surface"] for dt in dtokens]
        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
        for token, dtoken in zip(doc, dtokens):
            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
            if token.tag_ in TAG_MAP:
                token.pos = TAG_MAP[token.tag_][POS]
            else:
                token.pos = X
            token.lemma_ = dtoken["lemma"]
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc

    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
            if node.is_eos():
                break
            surface = node.surface
            feature = node.feature
            tag, _, expr = feature.partition(",")
            lemma, _, remainder = expr.partition("/")
            if lemma == "*" or lemma == "":
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}

    def score(self, examples):
        validate_examples(examples, "KoreanTokenizer.score")
        return Scorer.score_tokenization(examples)

    def _try_mecab_import(self):
        try:
            from natto import MeCab

            return MeCab
        except ImportError:
            raise ImportError(
                'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
                "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
                "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
                "and [natto-py](https://github.com/buruzaemon/natto-py)"
            ) from None


def check_spaces(text, tokens):
    prev_end = -1
    start = 0
@@ -17,10 +17,6 @@ URL_PATTERN = (
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
@@ -1239,15 +1239,6 @@ class Language:
                    sgd(key, W, dW)  # type: ignore[call-arg, misc]
        return losses

    def begin_training(
        self,
        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
        *,
        sgd: Optional[Optimizer] = None,
    ) -> Optimizer:
        warnings.warn(Warnings.W089, DeprecationWarning)
        return self.initialize(get_examples, sgd=sgd)

    def initialize(
        self,
        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
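With the deprecated begin_training shim removed, initialization goes through initialize only. A minimal sketch, assuming a blank pipeline where get_examples can be omitted:

import spacy

nlp = spacy.blank("en")
optimizer = nlp.initialize()   # replaces the removed nlp.begin_training()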
@@ -5,7 +5,6 @@ from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG

from .structs cimport LexemeC
from .strings cimport StringStore
from .vocab cimport Vocab
@@ -20,7 +20,6 @@ class Lexeme:
    def vector_norm(self) -> float: ...
    vector: Floats1d
    rank: int
    sentiment: float
    @property
    def orth_(self) -> str: ...
    @property
@@ -173,19 +173,6 @@ cdef class Lexeme:
        def __set__(self, value):
            self.c.id = value

    property sentiment:
        """RETURNS (float): A scalar value indicating the positivity or
        negativity of the lexeme."""
        def __get__(self):
            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
            return sentiment_table.get(self.c.orth, 0.0)

        def __set__(self, float x):
            if "lexeme_sentiment" not in self.vocab.lookups:
                self.vocab.lookups.add_table("lexeme_sentiment")
            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
            sentiment_table[self.c.orth] = x

    @property
    def orth_(self):
        """RETURNS (str): The original verbatim text of the lexeme
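The removed property only read and wrote a "lexeme_sentiment" lookups table, so equivalent behaviour can still be reproduced by using that table directly. A minimal sketch of that workaround:

import spacy

nlp = spacy.blank("en")
# The removed Lexeme.sentiment property was a thin wrapper around this table.
nlp.vocab.lookups.add_table("lexeme_sentiment")
table = nlp.vocab.lookups.get_table("lexeme_sentiment")
table[nlp.vocab.strings["good"]] = 0.9
print(table.get(nlp.vocab.strings["good"], 0.0))  # 0.9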
@@ -165,9 +165,9 @@ cdef class DependencyMatcher:
        on_match (callable): Optional callback executed on match.
        """
        if on_match is not None and not hasattr(on_match, "__call__"):
            raise ValueError(Errors.E171.format(arg_type=type(on_match)))
        if patterns is None or not isinstance(patterns, List):  # old API
            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
            raise ValueError(Errors.E171.format(name="DependencyMatcher", arg_type=type(on_match)))
        if patterns is None or not isinstance(patterns, List):
            raise ValueError(Errors.E948.format(name="DependencyMatcher", arg_type=type(patterns)))
        for pattern in patterns:
            if len(pattern) == 0:
                raise ValueError(Errors.E012.format(key=key))
@@ -23,7 +23,7 @@ from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
from .levenshtein import levenshtein_compare
from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
from ..strings import get_string_id
from ..strings cimport get_string_id
from ..attrs import IDS
from ..util import registry

@@ -115,9 +115,9 @@ cdef class Matcher:
        """
        errors = {}
        if on_match is not None and not hasattr(on_match, "__call__"):
            raise ValueError(Errors.E171.format(arg_type=type(on_match)))
        if patterns is None or not isinstance(patterns, List):  # old API
            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
            raise ValueError(Errors.E171.format(name="Matcher", arg_type=type(on_match)))
        if patterns is None or not isinstance(patterns, List):
            raise ValueError(Errors.E948.format(name="Matcher", arg_type=type(patterns)))
        if greedy is not None and greedy not in ["FIRST", "LONGEST"]:
            raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy))
        for i, pattern in enumerate(patterns):

@@ -265,6 +265,10 @@ cdef class Matcher:
        # non-overlapping ones this `match` can be either (start, end) or
        # (start, end, alignments) depending on `with_alignments=` option.
        for key, *match in matches:
            # Adjust span matches to doc offsets
            if isinstance(doclike, Span):
                match[0] += doclike.start
                match[1] += doclike.start
            span_filter = self._filter.get(key)
            if span_filter is not None:
                pairs = pairs_by_id.get(key, [])

@@ -295,9 +299,6 @@ cdef class Matcher:
        if as_spans:
            final_results = []
            for key, start, end, *_ in final_matches:
                if isinstance(doclike, Span):
                    start += doclike.start
                    end += doclike.start
                final_results.append(Span(doc, start, end, label=key))
        elif with_alignments:
            # convert alignments List[Dict[str, int]] --> List[int]
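The last two hunks move the doc-offset adjustment from the as_spans branch to the point where matches are collected, so matches produced on a Span are reported in document coordinates in every output form. A minimal sketch of the expected behaviour after this change:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
doc = nlp("one two three four")
matcher = Matcher(nlp.vocab)
matcher.add("THREE", [[{"LOWER": "three"}]])

span = doc[2:]                      # "three four"
match_id, start, end = matcher(span)[0]
print(start, end)                   # 2 3 -> offsets into the parent Doc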
@@ -20,6 +20,15 @@ class PhraseMatcher:
            Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
        ] = ...,
    ) -> None: ...
    def _add_from_arrays(
        self,
        key: str,
        specs: List[List[int]],
        *,
        on_match: Optional[
            Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
        ] = ...,
    ) -> None: ...
    def remove(self, key: str) -> None: ...
    @overload
    def __call__(
@@ -1,4 +1,6 @@
# cython: infer_types=True, profile=True
from typing import List
from collections import defaultdict
from libc.stdint cimport uintptr_t
from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter

@@ -39,7 +41,7 @@ cdef class PhraseMatcher:
        """
        self.vocab = vocab
        self._callbacks = {}
        self._docs = {}
        self._docs = defaultdict(set)
        self._validate = validate

        self.mem = Pool()

@@ -155,66 +157,24 @@ cdef class PhraseMatcher:
        del self._callbacks[key]
        del self._docs[key]

    def add(self, key, docs, *_docs, on_match=None):
        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
        key, an on_match callback, and one or more patterns.

        Since spaCy v2.2.2, PhraseMatcher.add takes a list of patterns as the
        second argument, with the on_match callback as an optional keyword
        argument.
    def _add_from_arrays(self, key, specs, *, on_match=None):
        """Add a preprocessed list of specs, with an optional callback.

        key (str): The match ID.
        docs (list): List of `Doc` objects representing match patterns.
        specs (List[List[int]]): A list of lists of hashes to match.
        on_match (callable): Callback executed on match.
        *_docs (Doc): For backwards compatibility: list of patterns to add
            as variable arguments. Will be ignored if a list of patterns is
            provided as the second argument.

        DOCS: https://spacy.io/api/phrasematcher#add
        """
        if docs is None or hasattr(docs, "__call__"):  # old API
            on_match = docs
            docs = _docs

        _ = self.vocab[key]
        self._callbacks[key] = on_match
        self._docs.setdefault(key, set())

        cdef MapStruct* current_node
        cdef MapStruct* internal_node
        cdef void* result

        if isinstance(docs, Doc):
            raise ValueError(Errors.E179.format(key=key))
        for doc in docs:
            if len(doc) == 0:
                continue
            if isinstance(doc, Doc):
                attrs = (TAG, POS, MORPH, LEMMA, DEP)
                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
                for attr in attrs:
                    if self.attr == attr and not has_annotation[attr]:
                        if attr == TAG:
                            pipe = "tagger"
                        elif attr in (POS, MORPH):
                            pipe = "morphologizer or tagger+attribute_ruler"
                        elif attr == LEMMA:
                            pipe = "lemmatizer"
                        elif attr == DEP:
                            pipe = "parser"
                        error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
                        raise ValueError(error_msg)
                if self._validate and any(has_annotation.values()) \
                        and self.attr not in attrs:
                    string_attr = self.vocab.strings[self.attr]
                    warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
                keyword = self._convert_to_array(doc)
            else:
                keyword = doc
            self._docs[key].add(tuple(keyword))
        self._callbacks[key] = on_match
        for spec in specs:
            self._docs[key].add(tuple(spec))

            current_node = self.c_map
            for token in keyword:
            for token in spec:
                if token == self._terminal_hash:
                    warnings.warn(Warnings.W021)
                    break
@@ -233,6 +193,57 @@ cdef class PhraseMatcher:
                result = internal_node
            map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)


    def add(self, key, docs, *, on_match=None):
        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
        key, a list of one or more patterns, and (optionally) an on_match callback.

        key (str): The match ID.
        docs (list): List of `Doc` objects representing match patterns.
        on_match (callable): Callback executed on match.

        If any of the input Docs are invalid, no internal state will be updated.

        DOCS: https://spacy.io/api/phrasematcher#add
        """
        if isinstance(docs, Doc):
            raise ValueError(Errors.E179.format(key=key))
        if docs is None or not isinstance(docs, List):
            raise ValueError(Errors.E948.format(name="PhraseMatcher", arg_type=type(docs)))
        if on_match is not None and not hasattr(on_match, "__call__"):
            raise ValueError(Errors.E171.format(name="PhraseMatcher", arg_type=type(on_match)))

        _ = self.vocab[key]
        specs = []

        for doc in docs:
            if len(doc) == 0:
                continue
            if not isinstance(doc, Doc):
                raise ValueError(Errors.E4000.format(type=type(doc)))

            attrs = (TAG, POS, MORPH, LEMMA, DEP)
            has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
            for attr in attrs:
                if self.attr == attr and not has_annotation[attr]:
                    if attr == TAG:
                        pipe = "tagger"
                    elif attr in (POS, MORPH):
                        pipe = "morphologizer or tagger+attribute_ruler"
                    elif attr == LEMMA:
                        pipe = "lemmatizer"
                    elif attr == DEP:
                        pipe = "parser"
                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
                    raise ValueError(error_msg)
            if self._validate and any(has_annotation.values()) \
                    and self.attr not in attrs:
                string_attr = self.vocab.strings[self.attr]
                warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
            specs.append(self._convert_to_array(doc))

        self._add_from_arrays(key, specs, on_match=on_match)

    def __call__(self, object doclike, *, as_spans=False):
        """Find all sequences matching the supplied patterns on the `Doc`.

@@ -345,7 +356,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr):
    matcher = PhraseMatcher(vocab, attr=attr)
    for key, specs in docs.items():
        callback = callbacks.get(key, None)
        matcher.add(key, specs, on_match=callback)
        matcher._add_from_arrays(key, specs, on_match=callback)
    return matcher
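After this refactor, PhraseMatcher.add only accepts a list of Doc patterns with on_match as a keyword argument; pattern validation happens up front and the converted arrays are handed to _add_from_arrays (which unpickling also uses). A minimal sketch of the stricter public API:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", [nlp.make_doc("Barack Obama")])      # list of Docs: OK
# matcher.add("OBAMA", nlp.make_doc("Barack Obama"))      # bare Doc: raises E179
# matcher.add("OBAMA", print, [nlp.make_doc("x")])        # old positional callback: raises E948
doc = nlp("Barack Obama visited Berlin")
print(matcher(doc))                                       # [(match_id, 0, 2)]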
@@ -1,164 +0,0 @@
from thinc.api import Model, normal_init

from ..util import registry


@registry.layers("spacy.PrecomputableAffine.v1")
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
    model = Model(
        "precomputable_affine",
        forward,
        init=init,
        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
        params={"W": None, "b": None, "pad": None},
        attrs={"dropout_rate": dropout},
    )
    return model


def forward(model, X, is_train):
    nF = model.get_dim("nF")
    nO = model.get_dim("nO")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    W = model.get_param("W")
    # Preallocate array for layer output, including padding.
    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))

    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
    # change its shape to (nF, nO, nP) without breaking existing models. So
    # we'll squeeze the first dimension here.
    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)

    def backward(dY_ids):
        # This backprop is particularly tricky, because we get back a different
        # thing from what we put out. We put out an array of shape:
        # (nB, nF, nO, nP), and get back:
        # (nB, nO, nP) and ids (nB, nF)
        # The ids tell us the values of nF, so we would have:
        #
        # dYf = zeros((nB, nF, nO, nP))
        # for b in range(nB):
        #     for f in range(nF):
        #         dYf[b, ids[b, f]] += dY[b]
        #
        # However, we avoid building that array for efficiency -- and just pass
        # in the indices.
        dY, ids = dY_ids
        assert dY.ndim == 3
        assert dY.shape[1] == nO, dY.shape
        assert dY.shape[2] == nP, dY.shape
        # nB = dY.shape[0]
        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
        Xf = X[ids]
        Xf = Xf.reshape((Xf.shape[0], nF * nI))

        model.inc_grad("b", dY.sum(axis=0))
        dY = dY.reshape((dY.shape[0], nO * nP))

        Wopfi = W.transpose((1, 2, 0, 3))
        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)

        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
        # (o, p, f, i) --> (f, o, p, i)
        dWopfi = dWopfi.transpose((2, 0, 1, 3))
        model.inc_grad("W", dWopfi)
        return dXf.reshape((dXf.shape[0], nF, nI))

    return Yf, backward


def _backprop_precomputable_affine_padding(model, dY, ids):
    nB = dY.shape[0]
    nF = model.get_dim("nF")
    nP = model.get_dim("nP")
    nO = model.get_dim("nO")
    # Backprop the "padding", used as a filler for missing values.
    # Values that are missing are set to -1, and each state vector could
    # have multiple missing values. The padding has different values for
    # different missing features. The gradient of the padding vector is:
    #
    # for b in range(nB):
    #     for f in range(nF):
    #         if ids[b, f] < 0:
    #             d_pad[f] += dY[b]
    #
    # Which can be rewritten as:
    #
    # (ids < 0).T @ dY
    mask = model.ops.asarray(ids < 0, dtype="f")
    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
    return d_pad.reshape((1, nF, nO, nP))


def init(model, X=None, Y=None):
    """This is like the 'layer sequential unit variance', but instead
    of taking the actual inputs, we randomly generate whitened data.

    Why's this all so complicated? We have a huge number of inputs,
    and the maxout unit makes guessing the dynamics tricky. Instead
    we set the maxout weights to values that empirically result in
    whitened outputs given whitened inputs.
    """
    if model.has_param("W") and model.get_param("W").any():
        return

    nF = model.get_dim("nF")
    nO = model.get_dim("nO")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    W = model.ops.alloc4f(nF, nO, nP, nI)
    b = model.ops.alloc2f(nO, nP)
    pad = model.ops.alloc4f(1, nF, nO, nP)

    ops = model.ops
    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
    pad = normal_init(ops, pad.shape, mean=1.0)
    model.set_param("W", W)
    model.set_param("b", b)
    model.set_param("pad", pad)

    ids = ops.alloc((5000, nF), dtype="f")
    ids += ops.xp.random.uniform(0, 1000, ids.shape)
    ids = ops.asarray(ids, dtype="i")
    tokvecs = ops.alloc((5000, nI), dtype="f")
    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
        tokvecs.shape
    )

    def predict(ids, tokvecs):
        # nS ids. nW tokvecs. Exclude the padding array.
        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
        # need nS vectors
        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
        vectors = vectors.reshape((vectors.shape[0], nO, nP))
        vectors += b
        vectors = model.ops.asarray(vectors)
        if nP >= 2:
            return model.ops.maxout(vectors)[0]
        else:
            return vectors * (vectors >= 0)

    tol_var = 0.01
    tol_mean = 0.01
    t_max = 10
    W = model.get_param("W").copy()
    b = model.get_param("b").copy()
    for t_i in range(t_max):
        acts1 = predict(ids, tokvecs)
        var = model.ops.xp.var(acts1)
        mean = model.ops.xp.mean(acts1)
        if abs(var - 1.0) >= tol_var:
            W /= model.ops.xp.sqrt(var)
            model.set_param("W", W)
        elif abs(mean) >= tol_mean:
            b -= mean
            model.set_param("b", b)
        else:
            break
@@ -23,6 +23,7 @@ DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
    "update",
    "rehearse",
    "get_loss",
    "get_teacher_student_loss",
    "initialize",
    "begin_update",
    "finish_update",
@@ -1,17 +1,20 @@
from typing import Optional, List, cast
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
from typing import Optional, List, Tuple, Any
from thinc.types import Floats2d
from thinc.api import Model
import warnings

from ...errors import Errors
from ...errors import Errors, Warnings
from ...compat import Literal
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel
from ...tokens import Doc
from ...tokens.doc import Doc

TransitionSystem = Any  # TODO
State = Any  # TODO


@registry.architectures("spacy.TransitionBasedParser.v2")
def build_tb_parser_model(
@registry.architectures.register("spacy.TransitionBasedParser.v2")
def transition_parser_v2(
    tok2vec: Model[List[Doc], List[Floats2d]],
    state_type: Literal["parser", "ner"],
    extra_state_tokens: bool,

@@ -19,6 +22,46 @@ def build_tb_parser_model(
    maxout_pieces: int,
    use_upper: bool,
    nO: Optional[int] = None,
) -> Model:
    if not use_upper:
        warnings.warn(Warnings.W400)

    return build_tb_parser_model(
        tok2vec,
        state_type,
        extra_state_tokens,
        hidden_width,
        maxout_pieces,
        nO=nO,
    )


@registry.architectures.register("spacy.TransitionBasedParser.v3")
def transition_parser_v3(
    tok2vec: Model[List[Doc], List[Floats2d]],
    state_type: Literal["parser", "ner"],
    extra_state_tokens: bool,
    hidden_width: int,
    maxout_pieces: int,
    nO: Optional[int] = None,
) -> Model:
    return build_tb_parser_model(
        tok2vec,
        state_type,
        extra_state_tokens,
        hidden_width,
        maxout_pieces,
        nO=nO,
    )


def build_tb_parser_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    state_type: Literal["parser", "ner"],
    extra_state_tokens: bool,
    hidden_width: int,
    maxout_pieces: int,
    nO: Optional[int] = None,
) -> Model:
    """
    Build a transition-based parser model. Can apply to NER or dependency-parsing.
@@ -51,14 +94,7 @@ def build_tb_parser_model(
        feature sets (for the NER) or 13 (for the parser).
    hidden_width (int): The width of the hidden layer.
    maxout_pieces (int): How many pieces to use in the state prediction layer.
        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
        is replaced with a ReLu non-linearity if use_upper=True, and no
        non-linearity if use_upper=False.
    use_upper (bool): Whether to use an additional hidden layer after the state
        vector in order to predict the action scores. It is recommended to set
        this to False for large pretrained models such as transformers, and True
        for smaller networks. The upper layer is computed on CPU, which becomes
        a bottleneck on larger GPU-based models, where it's also less necessary.
        Recommended values are 1, 2 or 3.
    nO (int or None): The number of actions the model will predict between.
        Usually inferred from data at the beginning of training, or loaded from
        disk.
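The hunks above keep spacy.TransitionBasedParser.v2 registered as a thin wrapper that warns (W400 when use_upper=False is passed) and delegates to the same builder as the new v3 entry, so existing configs keep resolving. A small sketch, assuming this dev branch is installed; the exact registry lookup call is an assumption about the catalogue-based registry API:

import spacy

# Both architecture names resolve to registered builders; per the diff, v2
# ignores use_upper and forwards its remaining arguments, exactly like v3.
v2 = spacy.registry.architectures.get("spacy.TransitionBasedParser.v2")
v3 = spacy.registry.architectures.get("spacy.TransitionBasedParser.v3")
print(v2.__name__, v3.__name__)   # transition_parser_v2 transition_parser_v3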
@@ -69,106 +105,11 @@ def build_tb_parser_model(
        nr_feature_tokens = 6 if extra_state_tokens else 3
    else:
        raise ValueError(Errors.E917.format(value=state_type))
    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
    tok2vec = chain(
        tok2vec,
        list2array(),
        Linear(hidden_width, t2v_width),
    return TransitionModel(
        tok2vec=tok2vec,
        state_tokens=nr_feature_tokens,
        hidden_width=hidden_width,
        maxout_pieces=maxout_pieces,
        nO=nO,
        unseen_classes=set(),
    )
    tok2vec.set_dim("nO", hidden_width)
    lower = _define_lower(
        nO=hidden_width if use_upper else nO,
        nF=nr_feature_tokens,
        nI=tok2vec.get_dim("nO"),
        nP=maxout_pieces,
    )
    upper = None
    if use_upper:
        with use_ops("cpu"):
            # Initialize weights at zero, as it's a classification layer.
            upper = _define_upper(nO=nO, nI=None)
    return TransitionModel(tok2vec, lower, upper, resize_output)


def _define_upper(nO, nI):
    return Linear(nO=nO, nI=nI, init_W=zero_init)


def _define_lower(nO, nF, nI, nP):
    return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)


def resize_output(model, new_nO):
    if model.attrs["has_upper"]:
        return _resize_upper(model, new_nO)
    return _resize_lower(model, new_nO)


def _resize_upper(model, new_nO):
    upper = model.get_ref("upper")
    if upper.has_dim("nO") is None:
        upper.set_dim("nO", new_nO)
        return model
    elif new_nO == upper.get_dim("nO"):
        return model

    smaller = upper
    nI = smaller.maybe_get_dim("nI")
    with use_ops("cpu"):
        larger = _define_upper(nO=new_nO, nI=nI)
    # it could be that the model is not initialized yet, then skip this bit
    if smaller.has_param("W"):
        larger_W = larger.ops.alloc2f(new_nO, nI)
        larger_b = larger.ops.alloc1f(new_nO)
        smaller_W = smaller.get_param("W")
        smaller_b = smaller.get_param("b")
        # Weights are stored in (nr_out, nr_in) format, so we're basically
        # just adding rows here.
        if smaller.has_dim("nO"):
            old_nO = smaller.get_dim("nO")
            larger_W[:old_nO] = smaller_W
            larger_b[:old_nO] = smaller_b
            for i in range(old_nO, new_nO):
                model.attrs["unseen_classes"].add(i)

        larger.set_param("W", larger_W)
        larger.set_param("b", larger_b)
    model._layers[-1] = larger
    model.set_ref("upper", larger)
    return model


def _resize_lower(model, new_nO):
    lower = model.get_ref("lower")
    if lower.has_dim("nO") is None:
        lower.set_dim("nO", new_nO)
        return model

    smaller = lower
    nI = smaller.maybe_get_dim("nI")
    nF = smaller.maybe_get_dim("nF")
    nP = smaller.maybe_get_dim("nP")
    larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
    # it could be that the model is not initialized yet, then skip this bit
    if smaller.has_param("W"):
        larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
        larger_b = larger.ops.alloc2f(new_nO, nP)
        larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
        smaller_W = smaller.get_param("W")
        smaller_b = smaller.get_param("b")
        smaller_pad = smaller.get_param("pad")
        # Copy the old weights and padding into the new layer
        if smaller.has_dim("nO"):
            old_nO = smaller.get_dim("nO")
            larger_W[:, 0:old_nO, :, :] = smaller_W
            larger_pad[:, :, 0:old_nO, :] = smaller_pad
            larger_b[0:old_nO, :] = smaller_b
            for i in range(old_nO, new_nO):
                model.attrs["unseen_classes"].add(i)

        larger.set_param("W", larger_W)
        larger.set_param("b", larger_b)
        larger.set_param("pad", larger_pad)
    model._layers[1] = larger
    model.set_ref("lower", larger)
    return model
@@ -7,7 +7,7 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from ...tokens import Doc
from ...util import registry
from ...errors import Errors
from ...ml import _character_embed
from ...ml import character_embed
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener

@@ -226,7 +226,7 @@ def CharacterEmbed(
    if feature is None:
        raise ValueError(Errors.E911.format(feat=feature))
    char_embed = chain(
        _character_embed.CharacterEmbed(nM=nM, nC=nC),
        character_embed.CharacterEmbed(nM=nM, nC=nC),
        cast(Model[List[Floats2d], Ragged], list2ragged()),
    )
    feature_extractor: Model[List[Doc], Ragged] = chain(
@@ -1,49 +0,0 @@
from libc.string cimport memset, memcpy
from thinc.backends.cblas cimport CBlas
from ..typedefs cimport weight_t, hash_t
from ..pipeline._parser_internals._state cimport StateC


cdef struct SizesC:
    int states
    int classes
    int hiddens
    int pieces
    int feats
    int embed_width


cdef struct WeightsC:
    const float* feat_weights
    const float* feat_bias
    const float* hidden_bias
    const float* hidden_weights
    const float* seen_classes


cdef struct ActivationsC:
    int* token_ids
    float* unmaxed
    float* scores
    float* hiddens
    int* is_valid
    int _curr_size
    int _max_size


cdef WeightsC get_c_weights(model) except *

cdef SizesC get_c_sizes(model, int batch_size) except *

cdef ActivationsC alloc_activations(SizesC n) nogil

cdef void free_activations(const ActivationsC* A) nogil

cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
        const WeightsC* W, SizesC n) nogil

cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil

cdef void cpu_log_loss(float* d_scores,
        const float* costs, const int* is_valid, const float* scores, int O) nogil

spacy/ml/parser_model.pyx (deleted)
@@ -1,492 +0,0 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
cimport numpy as np
from libc.math cimport exp
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from thinc.backends.linalg cimport Vec, VecVec
from thinc.backends.cblas cimport saxpy, sgemm

import numpy
import numpy.random
from thinc.api import Model, CupyOps, NumpyOps, get_ops

from .. import util
from ..errors import Errors
from ..typedefs cimport weight_t, class_t, hash_t
from ..pipeline._parser_internals.stateclass cimport StateClass


cdef WeightsC get_c_weights(model) except *:
    cdef WeightsC output
    cdef precompute_hiddens state2vec = model.state2vec
    output.feat_weights = state2vec.get_feat_weights()
    output.feat_bias = <const float*>state2vec.bias.data
    cdef np.ndarray vec2scores_W
    cdef np.ndarray vec2scores_b
    if model.vec2scores is None:
        output.hidden_weights = NULL
        output.hidden_bias = NULL
    else:
        vec2scores_W = model.vec2scores.get_param("W")
        vec2scores_b = model.vec2scores.get_param("b")
        output.hidden_weights = <const float*>vec2scores_W.data
        output.hidden_bias = <const float*>vec2scores_b.data
    cdef np.ndarray class_mask = model._class_mask
    output.seen_classes = <const float*>class_mask.data
    return output


cdef SizesC get_c_sizes(model, int batch_size) except *:
    cdef SizesC output
    output.states = batch_size
    if model.vec2scores is None:
        output.classes = model.state2vec.get_dim("nO")
    else:
        output.classes = model.vec2scores.get_dim("nO")
    output.hiddens = model.state2vec.get_dim("nO")
    output.pieces = model.state2vec.get_dim("nP")
    output.feats = model.state2vec.get_dim("nF")
    output.embed_width = model.tokvecs.shape[1]
    return output


cdef ActivationsC alloc_activations(SizesC n) nogil:
    cdef ActivationsC A
    memset(&A, 0, sizeof(A))
    resize_activations(&A, n)
    return A


cdef void free_activations(const ActivationsC* A) nogil:
    free(A.token_ids)
    free(A.scores)
    free(A.unmaxed)
    free(A.hiddens)
    free(A.is_valid)


cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
    if n.states <= A._max_size:
        A._curr_size = n.states
        return
    if A._max_size == 0:
        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
        A._max_size = n.states
    else:
        A.token_ids = <int*>realloc(A.token_ids,
            n.states * n.feats * sizeof(A.token_ids[0]))
        A.scores = <float*>realloc(A.scores,
            n.states * n.classes * sizeof(A.scores[0]))
        A.unmaxed = <float*>realloc(A.unmaxed,
            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
        A.hiddens = <float*>realloc(A.hiddens,
            n.states * n.hiddens * sizeof(A.hiddens[0]))
        A.is_valid = <int*>realloc(A.is_valid,
            n.states * n.classes * sizeof(A.is_valid[0]))
        A._max_size = n.states
    A._curr_size = n.states


cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
        const WeightsC* W, SizesC n) nogil:
    cdef double one = 1.0
    resize_activations(A, n)
    for i in range(n.states):
        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
    sum_state_features(cblas, A.unmaxed,
        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
    for i in range(n.states):
        VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
            W.feat_bias, 1., n.hiddens * n.pieces)
        for j in range(n.hiddens):
            index = i * n.hiddens * n.pieces + j * n.pieces
            which = Vec.arg_max(&A.unmaxed[index], n.pieces)
            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
    memset(A.scores, 0, n.states * n.classes * sizeof(float))
    if W.hidden_weights == NULL:
        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
    else:
        # Compute hidden-to-output
        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
            1.0, <const float *>A.hiddens, n.hiddens,
            <const float *>W.hidden_weights, n.hiddens,
            0.0, A.scores, n.classes)
        # Add bias
        for i in range(n.states):
            VecVec.add_i(&A.scores[i*n.classes],
                W.hidden_bias, 1., n.classes)
    # Set unseen classes to minimum value
    i = 0
    min_ = A.scores[0]
    for i in range(1, n.states * n.classes):
        if A.scores[i] < min_:
            min_ = A.scores[i]
    for i in range(n.states):
        for j in range(n.classes):
            if not W.seen_classes[j]:
                A.scores[i*n.classes+j] = min_


cdef void sum_state_features(CBlas cblas, float* output,
        const float* cached, const int* token_ids, int B, int F, int O) nogil:
    cdef int idx, b, f, i
    cdef const float* feature
    padding = cached
    cached += F * O
    cdef int id_stride = F*O
    cdef float one = 1.
    for b in range(B):
        for f in range(F):
            if token_ids[f] < 0:
                feature = &padding[f*O]
            else:
                idx = token_ids[f] * id_stride + f*O
                feature = &cached[idx]
            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
        token_ids += F


cdef void cpu_log_loss(float* d_scores,
        const float* costs, const int* is_valid, const float* scores,
        int O) nogil:
    """Do multi-label log loss"""
    cdef double max_, gmax, Z, gZ
    best = arg_max_if_gold(scores, costs, is_valid, O)
    guess = Vec.arg_max(scores, O)
    if best == -1 or guess == -1:
        # These shouldn't happen, but if they do, we want to make sure we don't
        # cause an OOB access.
        return
    Z = 1e-10
    gZ = 1e-10
    max_ = scores[guess]
    gmax = scores[best]
    for i in range(O):
        Z += exp(scores[i] - max_)
        if costs[i] <= costs[best]:
            gZ += exp(scores[i] - gmax)
    for i in range(O):
        if costs[i] <= costs[best]:
            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
        else:
            d_scores[i] = exp(scores[i]-max_) / Z


cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
        const int* is_valid, int n) nogil:
    # Find minimum cost
    cdef float cost = 1
    for i in range(n):
        if is_valid[i] and costs[i] < cost:
            cost = costs[i]
    # Now find best-scoring with that cost
    cdef int best = -1
    for i in range(n):
        if costs[i] <= cost and is_valid[i]:
            if best == -1 or scores[i] > scores[best]:
                best = i
    return best


cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
    cdef int best = -1
    for i in range(n):
        if is_valid[i] >= 1:
            if best == -1 or scores[i] > scores[best]:
                best = i
    return best
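
# Illustrative numpy check (not part of the diff) of the gradient that
# cpu_log_loss computes above: a softmax over all classes minus a softmax
# restricted to the zero-cost ("gold") classes, ignoring the 1e-10 epsilons.
import numpy as np

scores = np.array([2.0, 1.0, 0.5])
costs = np.array([0.0, 1.0, 0.0])   # classes 0 and 2 are zero-cost ("gold")
p = np.exp(scores - scores.max())
p /= p.sum()
gold = costs <= costs.min()
g = np.where(gold, np.exp(scores - scores[gold].max()), 0.0)
g /= g.sum()
d_scores = p - g                    # non-gold classes keep the plain softmax term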


class ParserStepModel(Model):
    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
            dropout=0.1):
        Model.__init__(self, name="parser_step_model", forward=step_forward)
        self.attrs["has_upper"] = has_upper
        self.attrs["dropout_rate"] = dropout
        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
        if layers[1].get_dim("nP") >= 2:
            activation = "maxout"
        elif has_upper:
            activation = None
        else:
            activation = "relu"
        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
                                            activation=activation, train=train)
        if has_upper:
            self.vec2scores = layers[-1]
        else:
            self.vec2scores = None
        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
        self.backprops = []
        self._class_mask = numpy.zeros((self.nO,), dtype='f')
        self._class_mask.fill(1)
        if unseen_classes is not None:
            for class_ in unseen_classes:
                self._class_mask[class_] = 0.

    def clear_memory(self):
        del self.tokvecs
        del self.bp_tokvecs
        del self.state2vec
        del self.backprops
        del self._class_mask

    @property
    def nO(self):
        if self.attrs["has_upper"]:
            return self.vec2scores.get_dim("nO")
        else:
            return self.state2vec.get_dim("nO")

    def class_is_unseen(self, class_):
        return self._class_mask[class_]

    def mark_class_unseen(self, class_):
        self._class_mask[class_] = 0

    def mark_class_seen(self, class_):
        self._class_mask[class_] = 1

    def get_token_ids(self, states):
        cdef StateClass state
        states = [state for state in states if not state.is_final()]
        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
                                          dtype='i', order='C')
        ids.fill(-1)
        c_ids = <int*>ids.data
        for state in states:
            state.c.set_context_tokens(c_ids, ids.shape[1])
            c_ids += ids.shape[1]
        return ids

    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
        if isinstance(self.state2vec.ops, CupyOps) \
        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
            # Move token_ids and d_vector to GPU, asynchronously
            self.backprops.append((
                util.get_async(self.cuda_stream, token_ids),
                util.get_async(self.cuda_stream, d_vector),
                get_d_tokvecs
            ))
        else:
            self.backprops.append((token_ids, d_vector, get_d_tokvecs))

    def finish_steps(self, golds):
        # Add a padding vector to the d_tokvecs gradient, so that missing
        # values don't affect the real gradient.
        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
        # Tells CUDA to block, so our async copies complete.
        if self.cuda_stream is not None:
            self.cuda_stream.synchronize()
        for ids, d_vector, bp_vector in self.backprops:
            d_state_features = bp_vector((d_vector, ids))
            ids = ids.flatten()
            d_state_features = d_state_features.reshape(
                (ids.size, d_state_features.shape[2]))
            self.ops.scatter_add(d_tokvecs, ids,
                d_state_features)
        # Padded -- see update()
        self.bp_tokvecs(d_tokvecs[:-1])
        return d_tokvecs


NUMPY_OPS = NumpyOps()


def step_forward(model: ParserStepModel, states, is_train):
    token_ids = model.get_token_ids(states)
    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
    mask = None
    if model.attrs["has_upper"]:
        dropout_rate = model.attrs["dropout_rate"]
        if is_train and dropout_rate > 0:
            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
            vector *= mask
        scores, get_d_vector = model.vec2scores(vector, is_train)
    else:
        scores = NumpyOps().asarray(vector)
        get_d_vector = lambda d_scores: d_scores
    # If the class is unseen, make sure its score is minimum
    scores[:, model._class_mask == 0] = numpy.nanmin(scores)

    def backprop_parser_step(d_scores):
        # Zero vectors for unseen classes
        d_scores *= model._class_mask
        d_vector = get_d_vector(d_scores)
        if mask is not None:
            d_vector *= mask
        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
        return None
    return scores, backprop_parser_step


cdef class precompute_hiddens:
    """Allow a model to be "primed" by pre-computing input features in bulk.

    This is used for the parser, where we want to take a batch of documents,
    and compute vectors for each (token, position) pair. These vectors can then
    be reused, especially for beam-search.

    Let's say we're using 12 features for each state, e.g. word at start of
    buffer, three words on stack, their children, etc. In the normal arc-eager
    system, a document of length N is processed in 2*N states. This means we'll
    create 2*N*12 feature vectors --- but if we pre-compute, we only need
    N*12 vector computations. The saving for beam-search is much better:
    if we have a beam of k, we'll normally make 2*N*12*K computations --
    so we can save the factor k. This also gives a nice CPU/GPU division:
    we can do all our hard maths up front, packed into large multiplications,
    and do the hard-to-program parsing on the CPU.
    """
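    # Worked numbers for the docstring above (illustrative, not in the diff):
    # with N=100 tokens and 12 features per state, stepwise computation builds
    # 2*100*12 = 2400 feature vectors; precomputing needs only 100*12 = 1200.
    # With a beam of k=8 the stepwise count grows to 2*100*12*8 = 19200, while
    # the precomputed table stays at 1200 -- a factor-k saving.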
    cdef readonly int nF, nO, nP
    cdef bint _is_synchronized
    cdef public object ops
    cdef public object numpy_ops
    cdef np.ndarray _features
    cdef np.ndarray _cached
    cdef np.ndarray bias
    cdef object _cuda_stream
    cdef object _bp_hiddens
    cdef object activation

    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
                 activation="maxout", train=False):
        gpu_cached, bp_features = lower_model(tokvecs, train)
        cdef np.ndarray cached
        if not isinstance(gpu_cached, numpy.ndarray):
            # Note the passing of cuda_stream here: it lets
            # cupy make the copy asynchronously.
            # We then have to block before first use.
            cached = gpu_cached.get(stream=cuda_stream)
        else:
            cached = gpu_cached
        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
        else:
            self.bias = lower_model.get_param("b")
        self.nF = cached.shape[1]
        if lower_model.has_dim("nP"):
            self.nP = lower_model.get_dim("nP")
        else:
            self.nP = 1
        self.nO = cached.shape[2]
        self.ops = lower_model.ops
        self.numpy_ops = NumpyOps()
        assert activation in (None, "relu", "maxout")
        self.activation = activation
        self._is_synchronized = False
        self._cuda_stream = cuda_stream
        self._cached = cached
        self._bp_hiddens = bp_features

    cdef const float* get_feat_weights(self) except NULL:
        if not self._is_synchronized and self._cuda_stream is not None:
            self._cuda_stream.synchronize()
            self._is_synchronized = True
        return <float*>self._cached.data

    def has_dim(self, name):
        if name == "nF":
            return self.nF if self.nF is not None else True
        elif name == "nP":
            return self.nP if self.nP is not None else True
        elif name == "nO":
            return self.nO if self.nO is not None else True
        else:
            return False

    def get_dim(self, name):
        if name == "nF":
            return self.nF
        elif name == "nP":
            return self.nP
        elif name == "nO":
            return self.nO
        else:
            raise ValueError(Errors.E1033.format(name=name))

    def set_dim(self, name, value):
        if name == "nF":
            self.nF = value
        elif name == "nP":
            self.nP = value
        elif name == "nO":
            self.nO = value
        else:
            raise ValueError(Errors.E1033.format(name=name))

    def __call__(self, X, bint is_train):
        if is_train:
            return self.begin_update(X)
        else:
            return self.predict(X), lambda X: X

    def predict(self, X):
        return self.begin_update(X)[0]

    def begin_update(self, token_ids):
        cdef np.ndarray state_vector = numpy.zeros(
            (token_ids.shape[0], self.nO, self.nP), dtype='f')
        # This is tricky, but (assuming GPU available);
        # - Input to forward on CPU
        # - Output from forward on CPU
        # - Input to backward on GPU!
        # - Output from backward on GPU
        bp_hiddens = self._bp_hiddens

        cdef CBlas cblas
        if isinstance(self.ops, CupyOps):
            cblas = NUMPY_OPS.cblas()
        else:
            cblas = self.ops.cblas()

        feat_weights = self.get_feat_weights()
        cdef int[:, ::1] ids = token_ids
        sum_state_features(cblas, <float*>state_vector.data,
            feat_weights, &ids[0,0],
            token_ids.shape[0], self.nF, self.nO*self.nP)
        state_vector += self.bias
        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

        def backward(d_state_vector_ids):
            d_state_vector, token_ids = d_state_vector_ids
            d_state_vector = bp_nonlinearity(d_state_vector)
            d_tokens = bp_hiddens((d_state_vector, token_ids))
            return d_tokens
        return state_vector, backward

    def _nonlinearity(self, state_vector):
        if self.activation == "maxout":
            return self._maxout_nonlinearity(state_vector)
        else:
            return self._relu_nonlinearity(state_vector)

    def _maxout_nonlinearity(self, state_vector):
        state_vector, mask = self.numpy_ops.maxout(state_vector)
        # We're outputting to CPU, but we need this variable on GPU for the
        # backward pass.
        mask = self.ops.asarray(mask)

        def backprop_maxout(d_best):
            return self.ops.backprop_maxout(d_best, mask, self.nP)

        return state_vector, backprop_maxout

    def _relu_nonlinearity(self, state_vector):
        state_vector = state_vector.reshape((state_vector.shape[0], -1))
        mask = state_vector >= 0.
        state_vector *= mask
        # We're outputting to CPU, but we need this variable on GPU for the
        # backward pass.
        mask = self.ops.asarray(mask)

        def backprop_relu(d_best):
            d_best *= mask
            return d_best.reshape((d_best.shape + (1,)))

        return state_vector, backprop_relu

spacy/ml/tb_framework.pxd (new file)
@@ -0,0 +1,28 @@
from libc.stdint cimport int8_t


cdef struct SizesC:
    int states
    int classes
    int hiddens
    int pieces
    int feats
    int embed_width
    int tokens


cdef struct WeightsC:
    const float* feat_weights
    const float* feat_bias
    const float* hidden_bias
    const float* hidden_weights
    const int8_t* seen_mask


cdef struct ActivationsC:
    int* token_ids
    float* unmaxed
    float* hiddens
    int* is_valid
    int _curr_size
    int _max_size

spacy/ml/tb_framework.py (deleted)
@@ -1,50 +0,0 @@
from thinc.api import Model, noop
from .parser_model import ParserStepModel
from ..util import registry


@registry.layers("spacy.TransitionModel.v1")
def TransitionModel(
    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
):
    """Set up a stepwise transition-based model"""
    if upper is None:
        has_upper = False
        upper = noop()
    else:
        has_upper = True
    # don't define nO for this object, because we can't dynamically change it
    return Model(
        name="parser_model",
        forward=forward,
        dims={"nI": tok2vec.maybe_get_dim("nI")},
        layers=[tok2vec, lower, upper],
        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
        init=init,
        attrs={
            "has_upper": has_upper,
            "unseen_classes": set(unseen_classes),
            "resize_output": resize_output,
        },
    )


def forward(model, X, is_train):
    step_model = ParserStepModel(
        X,
        model.layers,
        unseen_classes=model.attrs["unseen_classes"],
        train=is_train,
        has_upper=model.attrs["has_upper"],
    )

    return step_model, step_model.finish_steps


def init(model, X=None, Y=None):
    model.get_ref("tok2vec").initialize(X=X)
    lower = model.get_ref("lower")
    lower.initialize()
    if model.attrs["has_upper"]:
        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
        model.get_ref("upper").initialize(X=statevecs)

spacy/ml/tb_framework.pyx (new file)
@@ -0,0 +1,621 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
from typing import List, Tuple, Any, Optional, TypeVar, cast
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from libcpp.vector cimport vector
import numpy
cimport numpy as np
from thinc.api import Model, normal_init, chain, list2array, Linear
from thinc.api import uniform_init, glorot_uniform_init, zero_init
from thinc.api import NumpyOps
from thinc.backends.cblas cimport CBlas, saxpy, sgemm
from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
from thinc.types import Ints1d, Ints2d

from ..errors import Errors
from ..pipeline._parser_internals import _beam_utils
from ..pipeline._parser_internals.batch import GreedyBatch
from ..pipeline._parser_internals._parser_utils cimport arg_max
from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
from ..pipeline._parser_internals.transition_system cimport TransitionSystem
from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
from ..tokens.doc import Doc
from ..util import registry


State = Any  # TODO


@registry.layers("spacy.TransitionModel.v2")
def TransitionModel(
    *,
    tok2vec: Model[List[Doc], List[Floats2d]],
    beam_width: int = 1,
    beam_density: float = 0.0,
    state_tokens: int,
    hidden_width: int,
    maxout_pieces: int,
    nO: Optional[int] = None,
    unseen_classes=set(),
) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
    """Set up a transition-based parsing model, using a maxout hidden
    layer and a linear output layer.
    """
    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
    tok2vec_projected.set_dim("nO", hidden_width)

    # FIXME: we use `output` as a container for the output layer's
    # weights and biases. Thinc optimizers cannot handle resizing
    # of parameters. So, when the parser model is resized, we
    # construct a new `output` layer, which has a different key in
    # the optimizer. Once the optimizer supports parameter resizing,
    # we can replace the `output` layer by `output_W` and `output_b`
    # parameters in this model.
    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)

    return Model(
        name="parser_model",
        forward=forward,
        init=init,
        layers=[tok2vec_projected, output],
        refs={
            "tok2vec": tok2vec_projected,
            "output": output,
        },
        params={
            "hidden_W": None,  # Floats2d W for the hidden layer
            "hidden_b": None,  # Floats1d bias for the hidden layer
            "hidden_pad": None,  # Floats1d padding for the hidden layer
        },
        dims={
            "nO": None,  # Output size
            "nP": maxout_pieces,
            "nH": hidden_width,
            "nI": tok2vec_projected.maybe_get_dim("nO"),
            "nF": state_tokens,
        },
        attrs={
            "beam_width": beam_width,
            "beam_density": beam_density,
            "unseen_classes": set(unseen_classes),
            "resize_output": resize_output,
        },
    )
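
# Minimal usage sketch (not part of the diff; `my_tok2vec` is an assumed
# placeholder for any Model[List[Doc], List[Floats2d]] built elsewhere):
#
#     from spacy.util import registry
#
#     build = registry.layers.get("spacy.TransitionModel.v2")
#     model = build(tok2vec=my_tok2vec, state_tokens=6,
#                   hidden_width=64, maxout_pieces=2)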

def resize_output(model: Model, new_nO: int) -> Model:
    old_nO = model.maybe_get_dim("nO")
    output = model.get_ref("output")
    if old_nO is None:
        model.set_dim("nO", new_nO)
        output.set_dim("nO", new_nO)
        output.initialize()
        return model
    elif new_nO <= old_nO:
        return model
    elif output.has_param("W"):
        nH = model.get_dim("nH")
        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
        new_output.initialize()
        new_W = new_output.get_param("W")
        new_b = new_output.get_param("b")
        old_W = output.get_param("W")
        old_b = output.get_param("b")
        new_W[:old_nO] = old_W  # type: ignore
        new_b[:old_nO] = old_b  # type: ignore
        for i in range(old_nO, new_nO):
            model.attrs["unseen_classes"].add(i)
        model.layers[-1] = new_output
        model.set_ref("output", new_output)
        # TODO: Avoid this private intrusion
        model._dims["nO"] = new_nO
    return model
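
# Sketch (not part of the diff): growing a live model by two classes; copied
# rows keep their trained weights and the added class indices start unseen.
#
#     model.attrs["resize_output"](model, model.get_dim("nO") + 2)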

def init(
    model,
    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
):
    if X is not None:
        docs, moves = X
        model.get_ref("tok2vec").initialize(X=docs)
    else:
        model.get_ref("tok2vec").initialize()
    inferred_nO = _infer_nO(Y)
    if inferred_nO is not None:
        current_nO = model.maybe_get_dim("nO")
        if current_nO is None or current_nO != inferred_nO:
            model.attrs["resize_output"](model, inferred_nO)
    nO = model.get_dim("nO")
    nP = model.get_dim("nP")
    nH = model.get_dim("nH")
    nI = model.get_dim("nI")
    nF = model.get_dim("nF")
    ops = model.ops

    Wl = ops.alloc2f(nH * nP, nF * nI)
    bl = ops.alloc1f(nH * nP)
    padl = ops.alloc1f(nI)
    # Wl = zero_init(ops, Wl.shape)
    Wl = glorot_uniform_init(ops, Wl.shape)
    padl = uniform_init(ops, padl.shape)  # type: ignore
    # TODO: Experiment with whether better to initialize output_W
    model.set_param("hidden_W", Wl)
    model.set_param("hidden_b", bl)
    model.set_param("hidden_pad", padl)
    # model = _lsuv_init(model)
    return model


class TransitionModelInputs:
    """
    Input to transition model.
    """

    # dataclass annotation is not yet supported in Cython 0.29.x,
    # so, we'll do something close to it.

    actions: Optional[List[Ints1d]]
    docs: List[Doc]
    max_moves: int
    moves: TransitionSystem
    states: Optional[List[State]]

    __slots__ = [
        "actions",
        "docs",
        "max_moves",
        "moves",
        "states",
    ]

    def __init__(
        self,
        docs: List[Doc],
        moves: TransitionSystem,
        actions: Optional[List[Ints1d]]=None,
        max_moves: int=0,
        states: Optional[List[State]]=None):
        """
        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
        docs (List[Doc]): Docs to predict transition sequences for.
        max_moves: (int): the maximum number of moves to apply, values less
            than 1 will apply moves to states until they are final states.
        moves (TransitionSystem): the transition system to use when predicting
            the transition sequences.
        states (Optional[List[States]]): the initial states to predict the
            transition sequences for. When absent, the initial states are
            initialized from the provided Docs.
        """
        self.actions = actions
        self.docs = docs
        self.moves = moves
        self.max_moves = max_moves
        self.states = states
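
# Sketch (not part of the diff; assumes an `nlp` pipeline with a trained
# parser and an initialized `model` as above):
#
#     docs = [nlp.make_doc("She ate the pizza")]
#     moves = nlp.get_pipe("parser").moves
#     inputs = TransitionModelInputs(docs=docs, moves=moves)
#     (states, scores), backprop = model(inputs, is_train=False)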

def forward(model, inputs: TransitionModelInputs, is_train: bool):
    docs = inputs.docs
    moves = inputs.moves
    actions = inputs.actions

    beam_width = model.attrs["beam_width"]
    hidden_pad = model.get_param("hidden_pad")
    tok2vec = model.get_ref("tok2vec")

    states = moves.init_batch(docs) if inputs.states is None else inputs.states
    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
    seen_mask = _get_seen_mask(model)

    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
        # Note: max_moves is only used during training, so we don't need to
        # pass it to the greedy inference path.
        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
    else:
        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
            feats, backprop_feats, seen_mask, is_train, actions=actions,
            max_moves=inputs.max_moves)


def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
        np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
    cdef vector[StateC*] c_states
    cdef StateClass state
    for state in states:
        if not state.is_final():
            c_states.push_back(state.c)
    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
    # Precomputed features have rows for each token, plus one for padding.
    cdef int n_tokens = feats.shape[0] - 1
    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
    cdef CBlas cblas = model.ops.cblas()
    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)

    def backprop(dY):
        raise ValueError(Errors.E4004)

    return (states, scores), backprop

cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
        WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
    cdef int i, j
    cdef vector[StateC *] unfinished
    cdef ActivationsC activations = _alloc_activations(sizes)
    cdef np.ndarray step_scores
    cdef np.ndarray step_actions

    scores = []
    while sizes.states >= 1:
        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
        step_actions = actions[0] if actions is not None else None
        with nogil:
            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
            if actions is None:
                # Validate actions, argmax, take action.
                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
                    sizes.states)
            else:
                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
            for i in range(sizes.states):
                if not states[i].is_final():
                    unfinished.push_back(states[i])
            for i in range(unfinished.size()):
                states[i] = unfinished[i]
            sizes.states = unfinished.size()
        scores.append(step_scores)
        unfinished.clear()
        actions = actions[1:] if actions is not None else None
    _free_activations(&activations)

    return scores


def _forward_fallback(
    model: Model,
    moves: TransitionSystem,
    states: List[StateClass],
    tokvecs, backprop_tok2vec,
    feats,
    backprop_feats,
    seen_mask,
    is_train: bool,
    actions: Optional[List[Ints1d]]=None,
    max_moves: int=0):
    nF = model.get_dim("nF")
    output = model.get_ref("output")
    hidden_b = model.get_param("hidden_b")
    nH = model.get_dim("nH")
    nP = model.get_dim("nP")

    beam_width = model.attrs["beam_width"]
    beam_density = model.attrs["beam_density"]

    ops = model.ops

    all_ids = []
    all_which = []
    all_statevecs = []
    all_scores = []
    if beam_width == 1:
        batch = GreedyBatch(moves, states, None)
    else:
        batch = _beam_utils.BeamBatch(
            moves, states, None, width=beam_width, density=beam_density
        )
    arange = ops.xp.arange(nF)
    n_moves = 0
    while not batch.is_done:
        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
        for i, state in enumerate(batch.get_unfinished_states()):
            state.set_context_tokens(ids, i, nF)
        # Sum the state features, add the bias and apply the activation (maxout)
        # to create the state vectors.
        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
        preacts2f += hidden_b
        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
        statevecs, which = ops.maxout(preacts)
        # We don't use output's backprop, since we want to backprop for
        # all states at once, rather than a single state.
        scores = output.predict(statevecs)
        scores[:, seen_mask] = ops.xp.nanmin(scores)
        # Transition the states, filtering out any that are finished.
        cpu_scores = ops.to_numpy(scores)
        if actions is None:
            batch.advance(cpu_scores)
        else:
            batch.advance_with_actions(actions[0])
            actions = actions[1:]
        all_scores.append(scores)
        if is_train:
            # Remember intermediate results for the backprop.
            all_ids.append(ids)
            all_statevecs.append(statevecs)
            all_which.append(which)
        if n_moves >= max_moves >= 1:
            break
        n_moves += 1

    def backprop_parser(d_states_d_scores):
        ids = ops.xp.vstack(all_ids)
        which = ops.xp.vstack(all_which)
        statevecs = ops.xp.vstack(all_statevecs)
        _, d_scores = d_states_d_scores
        if model.attrs.get("unseen_classes"):
            # If we have a negative gradient (i.e. the probability should
            # increase) on any classes we filtered out as unseen, mark
            # them as seen.
            for clas in set(model.attrs["unseen_classes"]):
                if (d_scores[:, clas] < 0).any():
                    model.attrs["unseen_classes"].remove(clas)
        d_scores *= seen_mask == False
        # Calculate the gradients for the parameters of the output layer.
        # The weight gemm is (nS, nO) @ (nS, nH).T
        output.inc_grad("b", d_scores.sum(axis=0))
        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
        # Now calculate d_statevecs, by backproping through the output linear layer.
        # This gemm is (nS, nO) @ (nO, nH)
        output_W = output.get_param("W")
        d_statevecs = ops.gemm(d_scores, output_W)
        # Backprop through the maxout activation
        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
        # We don't need to backprop the summation, because we pass back the IDs instead
        d_state_features = backprop_feats((d_preacts2f, ids))
        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
        ops.scatter_add(d_tokvecs, ids, d_state_features)
        model.inc_grad("hidden_pad", d_tokvecs[-1])
        return (backprop_tok2vec(d_tokvecs[:-1]), None)

    return (list(batch), all_scores), backprop_parser


def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
    for class_ in model.attrs.get("unseen_classes", set()):
        mask[class_] = True
    return mask


def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
    W: Floats2d = model.get_param("hidden_W")
    nF = model.get_dim("nF")
    nH = model.get_dim("nH")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
    W3f = W3f.transpose((1, 0, 2))
    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
    assert X.shape == (X.shape[0], nI), X.shape
    Yf_ = model.ops.gemm(X, W2f, trans2=True)
    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)

    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
        # This backprop is particularly tricky, because we get back a different
        # thing from what we put out. We put out an array of shape:
        # (nB, nF, nH, nP), and get back:
        # (nB, nH, nP) and ids (nB, nF)
        # The ids tell us the values of nF, so we would have:
        #
        # dYf = zeros((nB, nF, nH, nP))
        # for b in range(nB):
        #     for f in range(nF):
        #         dYf[b, ids[b, f]] += dY[b]
        #
        # However, we avoid building that array for efficiency -- and just pass
        # in the indices.
        dY, ids = dY_ids
        dXf = model.ops.gemm(dY, W)
        Xf = X[ids].reshape((ids.shape[0], -1))
        dW = model.ops.gemm(dY, Xf, trans1=True)
        model.inc_grad("hidden_W", dW)
        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)

    return Yf, backward
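
# Shape walk-through for the layer above (illustrative, not in the diff):
# with nF=6, nH=64, nP=2, nI=96 and 10 padded token rows, `hidden_W` is
# (128, 576), Yf comes out as (10, 6, 128), and backward maps an (nS, 128)
# gradient plus (nS, 6) ids back to an (nS, 6, 96) token-vector gradient.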

def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
    if Y is None:
        return None
    _, scores = Y
    if len(scores) == 0:
        return None
    assert scores[0].shape[0] >= 1
    assert len(scores[0].shape) == 2
    return scores[0].shape[1]


def _lsuv_init(model: Model):
    """This is like the 'layer sequential unit variance', but instead
    of taking the actual inputs, we randomly generate whitened data.

    Why's this all so complicated? We have a huge number of inputs,
    and the maxout unit makes guessing the dynamics tricky. Instead
    we set the maxout weights to values that empirically result in
    whitened outputs given whitened inputs.
    """
    W = model.maybe_get_param("hidden_W")
    if W is not None and W.any():
        return

    nF = model.get_dim("nF")
    nH = model.get_dim("nH")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    W = model.ops.alloc4f(nF, nH, nP, nI)
    b = model.ops.alloc2f(nH, nP)
    pad = model.ops.alloc4f(1, nF, nH, nP)

    ops = model.ops
    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
    pad = normal_init(ops, pad.shape, mean=1.0)
    model.set_param("W", W)
    model.set_param("b", b)
    model.set_param("pad", pad)

    ids = ops.alloc_f((5000, nF), dtype="f")
    ids += ops.xp.random.uniform(0, 1000, ids.shape)
    ids = ops.asarray(ids, dtype="i")
    tokvecs = ops.alloc_f((5000, nI), dtype="f")
    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
        tokvecs.shape
    )

    def predict(ids, tokvecs):
        # nS ids. nW tokvecs. Exclude the padding array.
        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
        # need nS vectors
        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
        vectors3f += b
        return model.ops.maxout(vectors3f)[0]

    tol_var = 0.01
    tol_mean = 0.01
    t_max = 10
    W = cast(Floats4d, model.get_param("hidden_W").copy())
    b = cast(Floats2d, model.get_param("hidden_b").copy())
    for t_i in range(t_max):
        acts1 = predict(ids, tokvecs)
        var = model.ops.xp.var(acts1)
        mean = model.ops.xp.mean(acts1)
        if abs(var - 1.0) >= tol_var:
            W /= model.ops.xp.sqrt(var)
            model.set_param("hidden_W", W)
        elif abs(mean) >= tol_mean:
            b -= mean
            model.set_param("hidden_b", b)
        else:
            break
    return model


cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
    output = model.get_ref("output")
    cdef np.ndarray hidden_b = model.get_param("hidden_b")
    cdef np.ndarray output_W = output.get_param("W")
    cdef np.ndarray output_b = output.get_param("b")

    cdef WeightsC weights
    weights.feat_weights = feats
    weights.feat_bias = <const float*>hidden_b.data
    weights.hidden_weights = <const float *> output_W.data
    weights.hidden_bias = <const float *> output_b.data
    weights.seen_mask = <const int8_t*> seen_mask.data

    return weights


cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
    cdef SizesC sizes
    sizes.states = batch_size
    sizes.classes = model.get_dim("nO")
    sizes.hiddens = model.get_dim("nH")
    sizes.pieces = model.get_dim("nP")
    sizes.feats = model.get_dim("nF")
    sizes.embed_width = model.get_dim("nI")
    sizes.tokens = tokens
    return sizes


cdef ActivationsC _alloc_activations(SizesC n) nogil:
    cdef ActivationsC A
    memset(&A, 0, sizeof(A))
    _resize_activations(&A, n)
    return A


cdef void _free_activations(const ActivationsC* A) nogil:
    free(A.token_ids)
    free(A.unmaxed)
    free(A.hiddens)
    free(A.is_valid)


cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
    if n.states <= A._max_size:
        A._curr_size = n.states
        return
    if A._max_size == 0:
        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
        A._max_size = n.states
    else:
        A.token_ids = <int*>realloc(A.token_ids,
            n.states * n.feats * sizeof(A.token_ids[0]))
        A.unmaxed = <float*>realloc(A.unmaxed,
            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
        A.hiddens = <float*>realloc(A.hiddens,
            n.states * n.hiddens * sizeof(A.hiddens[0]))
        A.is_valid = <int*>realloc(A.is_valid,
            n.states * n.classes * sizeof(A.is_valid[0]))
        A._max_size = n.states
    A._curr_size = n.states


cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
    _resize_activations(A, n)
    for i in range(n.states):
        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
    for i in range(n.states):
        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
        for j in range(n.hiddens):
            index = i * n.hiddens * n.pieces + j * n.pieces
            which = arg_max(&A.unmaxed[index], n.pieces)
            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
    if W.hidden_weights == NULL:
        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
    else:
        # Compute hidden-to-output
        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
            1.0, <const float *>A.hiddens, n.hiddens,
            <const float *>W.hidden_weights, n.hiddens,
            0.0, scores, n.classes)
        # Add bias
        for i in range(n.states):
            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
    # Set unseen classes to minimum value
    i = 0
    min_ = scores[0]
    for i in range(1, n.states * n.classes):
        if scores[i] < min_:
            min_ = scores[i]
    for i in range(n.states):
        for j in range(n.classes):
            if W.seen_mask[j]:
                scores[i*n.classes+j] = min_


cdef void _sum_state_features(CBlas cblas, float* output,
        const float* cached, const int* token_ids, SizesC n) nogil:
    cdef int idx, b, f, i
    cdef const float* feature
    cdef int B = n.states
    cdef int O = n.hiddens * n.pieces
    cdef int F = n.feats
    cdef int T = n.tokens
    padding = cached + (T * F * O)
    cdef int id_stride = F*O
    cdef float one = 1.
    for b in range(B):
        for f in range(F):
            if token_ids[f] < 0:
                feature = &padding[f*O]
            else:
                idx = token_ids[f] * id_stride + f*O
                feature = &cached[idx]
            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
        token_ids += F

spacy/morphology.pxd
@@ -1,23 +1,41 @@
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
cimport numpy as np
from libc.stdint cimport uint64_t
from libc.stdint cimport uint32_t, uint64_t
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from libcpp.memory cimport shared_ptr

from .structs cimport MorphAnalysisC
from .strings cimport StringStore
from .typedefs cimport attr_t, hash_t


cdef cppclass Feature:
    hash_t field
    hash_t value

    __init__():
        this.field = 0
        this.value = 0


cdef cppclass MorphAnalysisC:
    hash_t key
    vector[Feature] features

    __init__():
        this.key = 0

cdef class Morphology:
    cdef readonly Pool mem
    cdef readonly StringStore strings
    cdef PreshMap tags # Keyed by hash, value is pointer to tag
    cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags

    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
    cdef int insert(self, MorphAnalysisC tag) except -1
    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
    cdef void _intern_morph_tag(self, hash_t tag_key, feats)
    cdef hash_t _add(self, features)
    cdef str _normalize_features(self, features)
    cdef str get_morph_str(self, hash_t morph_key)
    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)


cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
cdef list list_features(const MorphAnalysisC* morph)
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil
cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil

spacy/morphology.pyx
@@ -1,10 +1,10 @@
# cython: infer_types
import numpy
import warnings
from typing import Union, Tuple, List, Dict, Optional
from cython.operator cimport dereference as deref
from libcpp.memory cimport shared_ptr

from .attrs cimport POS

from .parts_of_speech import IDS as POS_IDS
from .errors import Warnings
from . import symbols

@@ -24,134 +24,187 @@ cdef class Morphology:
    EMPTY_MORPH = symbols.NAMES[symbols._]

    def __init__(self, StringStore strings):
        self.mem = Pool()
        self.strings = strings
        self.tags = PreshMap()

    def __reduce__(self):
        tags = set([self.get(self.strings[s]) for s in self.strings])
        tags -= set([""])
        return (unpickle_morphology, (self.strings, sorted(tags)), None, None)

    def add(self, features):
    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash):
        match = self.tags.find(tag_hash)
        if match != self.tags.const_end():
            return deref(match).second
        else:
            return shared_ptr[MorphAnalysisC]()

    def _normalize_attr(self, attr_key : Union[int, str], attr_value : Union[int, str]) -> Optional[Tuple[str, Union[str, List[str]]]]:
        if isinstance(attr_key, (int, str)) and isinstance(attr_value, (int, str)):
            attr_key = self.strings.as_string(attr_key)
            attr_value = self.strings.as_string(attr_value)

            # Preserve multiple values as a list
            if self.VALUE_SEP in attr_value:
                values = attr_value.split(self.VALUE_SEP)
                values.sort()
                attr_value = values
        else:
            warnings.warn(Warnings.W100.format(feature={attr_key: attr_value}))
            return None

        return attr_key, attr_value

    def _str_to_normalized_feat_dict(self, feats: str) -> Dict[str, str]:
        if not feats or feats == self.EMPTY_MORPH:
            return {}

        out = []
        for feat in feats.split(self.FEATURE_SEP):
            field, values = feat.split(self.FIELD_SEP, 1)
            normalized_attr = self._normalize_attr(field, values)
            if normalized_attr is None:
                continue
            out.append((normalized_attr[0], normalized_attr[1]))
        out.sort(key=lambda x: x[0])
        return dict(out)

    def _dict_to_normalized_feat_dict(self, feats: Dict[Union[int, str], Union[int, str]]) -> Dict[str, str]:
        out = []
        for field, values in feats.items():
            normalized_attr = self._normalize_attr(field, values)
            if normalized_attr is None:
                continue
            out.append((normalized_attr[0], normalized_attr[1]))
        out.sort(key=lambda x: x[0])
        return dict(out)


    def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
        norm_feats_string = self.FEATURE_SEP.join([
            self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
            for field, values in feats.items()
        ])
        return norm_feats_string or self.EMPTY_MORPH


    cdef hash_t _add(self, features):
        """Insert a morphological analysis in the morphology table, if not
        already present. The morphological analysis may be provided in the UD
        FEATS format as a string or in the tag map dict format.
        Returns the hash of the new analysis.
        """
        cdef MorphAnalysisC* tag_ptr
        cdef hash_t tag_hash = 0
        cdef shared_ptr[MorphAnalysisC] tag
        if isinstance(features, str):
            if features == "":
                features = self.EMPTY_MORPH
            tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
            if tag_ptr != NULL:
                return tag_ptr.key
            features = self.feats_to_dict(features)
        if not isinstance(features, dict):

            tag_hash = self.strings[features]
            tag = self._lookup_tag(tag_hash)
            if tag:
                return deref(tag).key

            features = self._str_to_normalized_feat_dict(features)
        elif isinstance(features, dict):
            features = self._dict_to_normalized_feat_dict(features)
        else:
            warnings.warn(Warnings.W100.format(feature=features))
            features = {}
        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
        # intified ("Field", "Field=Value") pairs
        field_feature_pairs = []
        for field in sorted(string_features):
            values = string_features[field]
            for value in values.split(self.VALUE_SEP):
                field_feature_pairs.append((
                    self.strings.add(field),
                    self.strings.add(field + self.FIELD_SEP + value),
                ))
        cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)

        # the hash key for the tag is either the hash of the normalized UFEATS
        # string or the hash of an empty placeholder
        norm_feats_string = self.normalize_features(features)
        tag.key = self.strings.add(norm_feats_string)
        self.insert(tag)
        return tag.key
        norm_feats_string = self._normalized_feat_dict_to_str(features)
        tag_hash = self.strings.add(norm_feats_string)
        tag = self._lookup_tag(tag_hash)
        if tag:
            return deref(tag).key

    def normalize_features(self, features):
        self._intern_morph_tag(tag_hash, features)
        return tag_hash

    cdef void _intern_morph_tag(self, hash_t tag_key, feats):
        # intified ("Field", "Field=Value") pairs where fields with multiple values have
        # been split into individual tuples, e.g.:
        # [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
        #  ("Field2", "Field2=Value3")]
        field_feature_pairs = []

        # Feat dict is normalized at this point.
        for field, values in feats.items():
            field_key = self.strings.add(field)
            if isinstance(values, list):
                for value in values:
                    value_key = self.strings.add(field + self.FIELD_SEP + value)
                    field_feature_pairs.append((field_key, value_key))
            else:
                # We could box scalar values into a list and use a common
                # code path to generate features but that incurs a small
                # but measurable allocation/iteration overhead (as this
                # branch is taken often enough).
                value_key = self.strings.add(field + self.FIELD_SEP + values)
                field_feature_pairs.append((field_key, value_key))

        num_features = len(field_feature_pairs)
        cdef shared_ptr[MorphAnalysisC] tag = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
        deref(tag).key = tag_key
        deref(tag).features.resize(num_features)

        for i in range(num_features):
            deref(tag).features[i].field = field_feature_pairs[i][0]
            deref(tag).features[i].value = field_feature_pairs[i][1]

        self.tags[tag_key] = tag

    cdef str get_morph_str(self, hash_t morph_key):
        cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
        if not tag:
            return ""
        else:
            return self.strings[deref(tag).key]

    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
        return self._lookup_tag(morph_key)

    cdef str _normalize_features(self, features):
        """Create a normalized FEATS string from a features string or dict.

        features (Union[dict, str]): Features as dict or UFEATS string.
        RETURNS (str): Features as normalized UFEATS string.
        """
        if isinstance(features, str):
            features = self.feats_to_dict(features)
        if not isinstance(features, dict):
            features = self._str_to_normalized_feat_dict(features)
        elif isinstance(features, dict):
            features = self._dict_to_normalized_feat_dict(features)
        else:
            warnings.warn(Warnings.W100.format(feature=features))
            features = {}
        features = self.normalize_attrs(features)
        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
        # normalized UFEATS string with sorted fields and values
        norm_feats_string = self.FEATURE_SEP.join(sorted([
            self.FIELD_SEP.join([field, values])
            for field, values in string_features.items()
        ]))
        return norm_feats_string or self.EMPTY_MORPH

    def normalize_attrs(self, attrs):
        """Convert attrs dict so that POS is always by ID, other features are
        by string. Values separated by VALUE_SEP are sorted.
        """
        out = {}
        attrs = dict(attrs)
        for key, value in attrs.items():
            # convert POS value to ID
            if key == POS or (isinstance(key, str) and key.upper() == "POS"):
                if isinstance(value, str) and value.upper() in POS_IDS:
                    value = POS_IDS[value.upper()]
                elif isinstance(value, int) and value not in POS_IDS.values():
                    warnings.warn(Warnings.W100.format(feature={key: value}))
                    continue
                out[POS] = value
            # accept any string or ID fields and values and convert to strings
            elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
                key = self.strings.as_string(key)
                value = self.strings.as_string(value)
                # sort values
                if self.VALUE_SEP in value:
                    value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
                out[key] = value
            else:
                warnings.warn(Warnings.W100.format(feature={key: value}))
        return out
        return self._normalized_feat_dict_to_str(features)

    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
        """Creates a MorphAnalysisC from a list of intified
        ("Field", "Field=Value") tuples where fields with multiple values have
        been split into individual tuples, e.g.:
        [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
         ("Field2", "Field2=Value3")]
        """
        cdef MorphAnalysisC tag
        tag.length = len(field_feature_pairs)
        if tag.length > 0:
            tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
            tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
            for i, (field, feature) in enumerate(field_feature_pairs):
                tag.fields[i] = field
                tag.features[i] = feature
        return tag
    def add(self, features):
        return self._add(features)

    cdef int insert(self, MorphAnalysisC tag) except -1:
        cdef hash_t key = tag.key
        if self.tags.get(key) == NULL:
            tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
            tag_ptr[0] = tag
            self.tags.set(key, <void*>tag_ptr)
    def get(self, morph_key):
        return self.get_morph_str(morph_key)

    def get(self, hash_t morph):
        tag = <MorphAnalysisC*>self.tags.get(morph)
        if tag == NULL:
            return ""
        else:
            return self.strings[tag.key]
    def normalize_features(self, features):
        return self._normalize_features(features)

    @staticmethod
    def feats_to_dict(feats):
    def feats_to_dict(feats, *, sort_values=True):
        if not feats or feats == Morphology.EMPTY_MORPH:
            return {}
        return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
                [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}

        out = {}
        for feat in feats.split(Morphology.FEATURE_SEP):
            field, values = feat.split(Morphology.FIELD_SEP, 1)
            if sort_values:
                values = values.split(Morphology.VALUE_SEP)
                values.sort()
                values = Morphology.VALUE_SEP.join(values)

            out[field] = values
        return out
|
||||
@staticmethod
|
||||
def dict_to_feats(feats_dict):
|
||||
|
@ -160,34 +213,34 @@ cdef class Morphology:
|
|||
return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
|
||||
|
||||
|
||||
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
|
||||
cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
|
||||
cdef int i
|
||||
for i in range(morph.length):
|
||||
if morph.features[i] == feature:
|
||||
for i in range(deref(morph).features.size()):
|
||||
if deref(morph).features[i].value == feature:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
cdef list list_features(const MorphAnalysisC* morph):
|
||||
cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
|
||||
cdef int i
|
||||
features = []
|
||||
for i in range(morph.length):
|
||||
features.append(morph.features[i])
|
||||
for i in range(deref(morph).features.size()):
|
||||
features.append(deref(morph).features[i].value)
|
||||
return features
|
||||
|
||||
|
||||
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field):
|
||||
cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64")
|
||||
cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
|
||||
cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
|
||||
n = get_n_by_field(<uint64_t*>results.data, morph, field)
|
||||
return results[:n]
|
||||
|
||||
|
||||
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil:
|
||||
cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
|
||||
cdef int n_results = 0
|
||||
cdef int i
|
||||
for i in range(morph.length):
|
||||
if morph.fields[i] == field:
|
||||
results[n_results] = morph.features[i]
|
||||
for i in range(deref(morph).features.size()):
|
||||
if deref(morph).features[i].field == field:
|
||||
results[n_results] = deref(morph).features[i].value
|
||||
n_results += 1
|
||||
return n_results
|
||||
|
||||
|
|
|
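The hunk above splits feature normalization into separate helpers for string and dict input, while keeping the UFEATS string convention (fields joined by "|", field and values by "=", multiple values by ","). As a rough pure-Python sketch of that convention, mirroring what feats_to_dict and dict_to_feats in the diff do (the constant names below are illustrative, not taken from the diff):

# Illustrative sketch of UFEATS normalization (assumed separators: "|", "=", ",").
FEATURE_SEP = "|"
FIELD_SEP = "="
VALUE_SEP = ","

def feats_to_dict(feats, sort_values=True):
    # "Number=Sing|Case=Nom" -> {"Number": "Sing", "Case": "Nom"}
    if not feats:
        return {}
    out = {}
    for feat in feats.split(FEATURE_SEP):
        field, values = feat.split(FIELD_SEP, 1)
        if sort_values:
            values = VALUE_SEP.join(sorted(values.split(VALUE_SEP)))
        out[field] = values
    return out

def dict_to_feats(feats_dict):
    # Sort fields and values so equivalent analyses produce identical strings.
    return FEATURE_SEP.join(
        sorted(FIELD_SEP.join([field, VALUE_SEP.join(sorted(values.split(VALUE_SEP)))])
               for field, values in feats_dict.items())
    )

assert dict_to_feats(feats_to_dict("Number=Sing|Case=Nom")) == "Case=Nom|Number=Sing"
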
@@ -3,22 +3,22 @@ from . cimport symbols
cpdef enum univ_pos_t:
NO_TAG = 0
ADJ = symbols.ADJ
ADP
ADV
AUX
CONJ
CCONJ # U20
DET
INTJ
NOUN
NUM
PART
PRON
PROPN
PUNCT
SCONJ
SYM
VERB
X
EOL
SPACE
ADP = symbols.ADP
ADV = symbols.ADV
AUX = symbols.AUX
CONJ = symbols.CONJ
CCONJ = symbols.CCONJ # U20
DET = symbols.DET
INTJ = symbols.INTJ
NOUN = symbols.NOUN
NUM = symbols.NUM
PART = symbols.PART
PRON = symbols.PRON
PROPN = symbols.PROPN
PUNCT = symbols.PUNCT
SCONJ = symbols.SCONJ
SYM = symbols.SYM
VERB = symbols.VERB
X = symbols.X
EOL = symbols.EOL
SPACE = symbols.SPACE

@@ -1,9 +1,8 @@
from .attributeruler import AttributeRuler
from .attribute_ruler import AttributeRuler
from .dep_parser import DependencyParser
from .edit_tree_lemmatizer import EditTreeLemmatizer
from .entity_linker import EntityLinker
from .ner import EntityRecognizer
from .entityruler import EntityRuler
from .lemmatizer import Lemmatizer
from .morphologizer import Morphologizer
from .pipe import Pipe

@@ -23,7 +22,6 @@ __all__ = [
"DependencyParser",
"EntityLinker",
"EntityRecognizer",
"EntityRuler",
"Morphologizer",
"Lemmatizer",
"MultiLabel_TextCategorizer",

@@ -1,6 +1,6 @@
from ...typedefs cimport class_t, hash_t

# These are passed as callbacks to thinc.search.Beam
# These are passed as callbacks to .search.Beam
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1

cdef int check_final_state(void* _state, void* extra_args) except -1

@@ -3,17 +3,17 @@
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.extra.search cimport MaxViolation

from ...typedefs cimport hash_t, class_t
from .transition_system cimport TransitionSystem, Transition
from ...errors import Errors
from .batch cimport Batch
from .search cimport Beam, MaxViolation
from .search import MaxViolation
from .stateclass cimport StateC, StateClass


# These are passed as callbacks to thinc.search.Beam
# These are passed as callbacks to .search.Beam
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateC*>_dest
src = <StateC*>_src

@@ -27,7 +27,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
return state.is_final()


cdef class BeamBatch(object):
cdef class BeamBatch(Batch):
cdef public TransitionSystem moves
cdef public object states
cdef public object docs

spacy/pipeline/_parser_internals/_parser_utils.pxd (new file, 2 lines)
@@ -0,0 +1,2 @@
cdef int arg_max(const float* scores, const int n_classes) nogil
cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
spacy/pipeline/_parser_internals/_parser_utils.pyx (new file, 22 lines)
@@ -0,0 +1,22 @@
# cython: infer_types=True

cdef inline int arg_max(const float* scores, const int n_classes) nogil:
if n_classes == 2:
return 0 if scores[0] > scores[1] else 1
cdef int i
cdef int best = 0
cdef float mode = scores[0]
for i in range(1, n_classes):
if scores[i] > mode:
mode = scores[i]
best = i
return best


cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
cdef int best = -1
for i in range(n):
if is_valid[i] >= 1:
if best == -1 or scores[i] > scores[best]:
best = i
return best
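The new _parser_utils helpers pick the best-scoring class, optionally restricted to the transitions flagged valid. A minimal NumPy sketch of the same logic (the function name and shapes here are illustrative, not part of the diff):

import numpy as np

def arg_max_if_valid(scores, is_valid):
    # Return the index of the highest score among entries flagged valid, or -1.
    scores = np.asarray(scores, dtype="float32")
    is_valid = np.asarray(is_valid, dtype=bool)
    if not is_valid.any():
        return -1
    masked = np.where(is_valid, scores, -np.inf)
    return int(masked.argmax())

print(arg_max_if_valid([0.2, 0.9, 0.4], [1, 0, 1]))  # -> 2, since index 1 is invalid
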
@@ -6,7 +6,6 @@ cimport libcpp
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from libcpp.set cimport set
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from murmurhash.mrmr cimport hash64

from ...vocab cimport EMPTY_LEXEME

@@ -26,7 +25,7 @@ cdef struct ArcC:


cdef cppclass StateC:
int* _heads
vector[int] _heads
const TokenC* _sent
vector[int] _stack
vector[int] _rebuffer

@@ -34,31 +33,34 @@ cdef cppclass StateC:
unordered_map[int, vector[ArcC]] _left_arcs
unordered_map[int, vector[ArcC]] _right_arcs
vector[libcpp.bool] _unshiftable
vector[int] history
set[int] _sent_starts
TokenC _empty_token
int length
int offset
int _b_i

__init__(const TokenC* sent, int length) nogil:
__init__(const TokenC* sent, int length) nogil except +:
this._heads.resize(length, -1)
this._unshiftable.resize(length, False)

# Reserve memory ahead of time to minimize allocations during parsing.
# The initial capacity set here ideally reflects the expected average-case/majority usage.
cdef int init_capacity = 32
this._stack.reserve(init_capacity)
this._rebuffer.reserve(init_capacity)
this._ents.reserve(init_capacity)
this._left_arcs.reserve(init_capacity)
this._right_arcs.reserve(init_capacity)
this.history.reserve(init_capacity)

this._sent = sent
this._heads = <int*>calloc(length, sizeof(int))
if not (this._sent and this._heads):
with gil:
PyErr_SetFromErrno(MemoryError)
PyErr_CheckSignals()
this.offset = 0
this.length = length
this._b_i = 0
for i in range(length):
this._heads[i] = -1
this._unshiftable.push_back(0)
memset(&this._empty_token, 0, sizeof(TokenC))
this._empty_token.lex = &EMPTY_LEXEME

__dealloc__():
free(this._heads)

void set_context_tokens(int* ids, int n) nogil:
cdef int i, j
if n == 1:

@@ -131,19 +133,20 @@ cdef cppclass StateC:
ids[i] = -1

int S(int i) nogil const:
if i >= this._stack.size():
cdef int stack_size = this._stack.size()
if i >= stack_size or i < 0:
return -1
elif i < 0:
return -1
return this._stack.at(this._stack.size() - (i+1))
else:
return this._stack[stack_size - (i+1)]

int B(int i) nogil const:
cdef int buf_size = this._rebuffer.size()
if i < 0:
return -1
elif i < this._rebuffer.size():
return this._rebuffer.at(this._rebuffer.size() - (i+1))
elif i < buf_size:
return this._rebuffer[buf_size - (i+1)]
else:
b_i = this._b_i + (i - this._rebuffer.size())
b_i = this._b_i + (i - buf_size)
if b_i >= this.length:
return -1
else:

@@ -242,7 +245,7 @@ cdef cppclass StateC:
return 0
elif this._sent[word].sent_start == 1:
return 1
elif this._sent_starts.count(word) >= 1:
elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
return 1
else:
return 0

@@ -327,7 +330,7 @@ cdef cppclass StateC:
if item >= this._unshiftable.size():
return 0
else:
return this._unshiftable.at(item)
return this._unshiftable[item]

void set_reshiftable(int item) nogil:
if item < this._unshiftable.size():

@@ -347,6 +350,9 @@ cdef cppclass StateC:
this._heads[child] = head

void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
cdef vector[ArcC]* arcs
cdef ArcC* arc

arcs_it = heads_arcs.find(h_i)
if arcs_it == heads_arcs.end():
return

@@ -355,12 +361,12 @@ cdef cppclass StateC:
if arcs.size() == 0:
return

arc = arcs.back()
arc = &arcs.back()
if arc.head == h_i and arc.child == c_i:
arcs.pop_back()
else:
for i in range(arcs.size()-1):
arc = arcs.at(i)
arc = &deref(arcs)[i]
if arc.head == h_i and arc.child == c_i:
arc.head = -1
arc.child = -1

@@ -400,10 +406,11 @@ cdef cppclass StateC:
this._rebuffer = src._rebuffer
this._sent_starts = src._sent_starts
this._unshiftable = src._unshiftable
memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
this._heads = src._heads
this._ents = src._ents
this._left_arcs = src._left_arcs
this._right_arcs = src._right_arcs
this._b_i = src._b_i
this.offset = src.offset
this._empty_token = src._empty_token
this.history = src.history

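The StateC changes above move the head array and stack/buffer bookkeeping onto C++ vectors and add bounds checks to the S(i) and B(i) accessors. A rough Python sketch of that indexing convention (stack and buffer viewed from the top, -1 for out-of-range), under the simplifying assumption that the stack stores token indices with the most recent push last and ignoring the _rebuffer/_b_i split:

class ParseStateSketch:
    def __init__(self, n_tokens):
        self.stack = []                       # pushed token indices, top at the end
        self.buffer = list(range(n_tokens))   # tokens still to be processed

    def S(self, i):
        # S(0) is the top of the stack, S(1) the one below it, etc.
        if i < 0 or i >= len(self.stack):
            return -1
        return self.stack[len(self.stack) - (i + 1)]

    def B(self, i):
        # B(0) is the first token of the buffer.
        if i < 0 or i >= len(self.buffer):
            return -1
        return self.buffer[i]

state = ParseStateSketch(3)
state.stack.append(0)
print(state.S(0), state.B(0))  # -> 0 0
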
@@ -15,7 +15,7 @@ from ...training.example cimport Example
from .stateclass cimport StateClass
from ._state cimport StateC, ArcC
from ...errors import Errors
from thinc.extra.search cimport Beam
from .search cimport Beam

cdef weight_t MIN_SCORE = -90000
cdef attr_t SUBTOK_LABEL = hash_string('subtok')

@@ -773,6 +773,8 @@ cdef class ArcEager(TransitionSystem):
return list(arcs)

def has_gold(self, Example eg, start=0, end=None):
if end is not None and end < 0:
end = None
for word in eg.y[start:end]:
if word.dep != 0:
return True

@@ -858,6 +860,7 @@ cdef class ArcEager(TransitionSystem):
state.print_state()
)))
action.do(state.c, action.label)
state.c.history.push_back(i)
break
else:
failed = False

spacy/pipeline/_parser_internals/batch.pxd (new file, 2 lines)
@@ -0,0 +1,2 @@
cdef class Batch:
pass
spacy/pipeline/_parser_internals/batch.pyx (new file, 52 lines)
@@ -0,0 +1,52 @@
from typing import Any

TransitionSystem = Any # TODO

cdef class Batch:
def advance(self, scores):
raise NotImplementedError

def get_states(self):
raise NotImplementedError

@property
def is_done(self):
raise NotImplementedError

def get_unfinished_states(self):
raise NotImplementedError

def __getitem__(self, i):
raise NotImplementedError

def __len__(self):
raise NotImplementedError


class GreedyBatch(Batch):
def __init__(self, moves: TransitionSystem, states, golds):
self._moves = moves
self._states = states
self._next_states = [s for s in states if not s.is_final()]

def advance(self, scores):
self._next_states = self._moves.transition_states(self._next_states, scores)

def advance_with_actions(self, actions):
self._next_states = self._moves.apply_actions(self._next_states, actions)

def get_states(self):
return self._states

@property
def is_done(self):
return all(s.is_final() for s in self._states)

def get_unfinished_states(self):
return [st for st in self._states if not st.is_final()]

def __getitem__(self, i):
return self._states[i]

def __len__(self):
return len(self._states)
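GreedyBatch above wraps a set of parser states and advances all unfinished ones with each batch of scores. A hedged sketch of how such a batch might be driven from Python, assuming a scoring callable that returns one row per unfinished state; the model object and the exact init_batch call are placeholders, not APIs confirmed by this diff:

def greedy_parse(moves, model, docs):
    # moves.init_batch / transition_states correspond to TransitionSystem
    # methods used by GreedyBatch; `model` is a stand-in scoring callable.
    states = moves.init_batch(docs)
    batch = GreedyBatch(moves, states, golds=None)
    while not batch.is_done:
        unfinished = batch.get_unfinished_states()
        scores = model(unfinished)        # assumed shape: (len(unfinished), n_actions)
        batch.advance(scores)
    return batch.get_states()
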
@@ -1,10 +1,11 @@
import os
import random
from libc.stdint cimport int32_t
from libcpp.memory cimport shared_ptr
from libcpp.vector cimport vector
from cymem.cymem cimport Pool

from collections import Counter
from thinc.extra.search cimport Beam

from ...tokens.doc cimport Doc
from ...tokens.span import Span

@@ -15,6 +16,7 @@ from ...attrs cimport IS_SPACE
from ...structs cimport TokenC, SpanC
from ...training import split_bilu_label
from ...training.example cimport Example
from .search cimport Beam
from .stateclass cimport StateClass
from ._state cimport StateC
from .transition_system cimport Transition, do_func_t

@@ -43,9 +45,7 @@ MOVE_NAMES[OUT] = 'O'

cdef struct GoldNERStateC:
Transition* ner
SpanC* negs
int32_t length
int32_t nr_neg
vector[shared_ptr[SpanC]] negs


cdef class BiluoGold:

@@ -78,8 +78,6 @@ cdef GoldNERStateC create_gold_state(
negs = []
assert example.x.length > 0
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
gs.nr_neg = len(negs)
ner_ents, ner_tags = example.get_aligned_ents_and_ner()
for i, ner_tag in enumerate(ner_tags):
gs.ner[i] = moves.lookup_transition(ner_tag)

@@ -93,8 +91,8 @@ cdef GoldNERStateC create_gold_state(
# In order to handle negative samples, we need to maintain the full
# (start, end, label) triple. If we break it down to the 'isnt B-LOC'
# thing, we'll get blocked if there's an incorrect prefix.
for i, neg in enumerate(negs):
gs.negs[i] = neg.c
for neg in negs:
gs.negs.push_back(neg.c)
return gs


@@ -158,7 +156,7 @@ cdef class BiluoPushDown(TransitionSystem):
if token.ent_type:
labels.add(token.ent_type_)
return labels


def move_name(self, int move, attr_t label):
if move == OUT:
return 'O'

@@ -308,6 +306,8 @@ cdef class BiluoPushDown(TransitionSystem):
for span in eg.y.spans.get(neg_key, []):
if span.start >= start and span.end <= end:
return True
if end is not None and end < 0:
end = None
for word in eg.y[start:end]:
if word.ent_iob != 0:
return True

@@ -411,6 +411,8 @@ cdef class Begin:
cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label

cdef shared_ptr[SpanC] span

if g_act == MISSING:
pass
elif g_act == BEGIN:

@@ -428,8 +430,8 @@ cdef class Begin:
# be correct or not. However, we can at least tell whether we're
# going to be opening an entity where there's only one possible
# L.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0:
for span in gold.negs:
if span.get().label == label and span.get().start == b0:
cost += 1
break
return cost

@@ -574,8 +576,9 @@ cdef class Last:
# If we have negative-example entities, integrate them into the objective,
# by marking actions that close an entity that we know is incorrect
# as costly.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and (span.end-1) == b0 and span.start == ent_start:
cdef shared_ptr[SpanC] span
for span in gold.negs:
if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
cost += 1
break
return cost

@@ -639,12 +642,13 @@ cdef class Unit:
# This is fairly straight-forward for U- entities, as we have a single
# action
cdef int b0 = s.B(0)
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0 and span.end == (b0+1):
cdef shared_ptr[SpanC] span
for span in gold.negs:
if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
cost += 1
break
return cost


cdef class Out:

spacy/pipeline/_parser_internals/search.pxd (new file, 89 lines)
@@ -0,0 +1,89 @@
from cymem.cymem cimport Pool

from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
from libcpp.pair cimport pair
from libcpp.queue cimport priority_queue
from libcpp.vector cimport vector

from ...typedefs cimport class_t, weight_t, hash_t

ctypedef pair[weight_t, size_t] Entry
ctypedef priority_queue[Entry] Queue


ctypedef int (*trans_func_t)(void* dest, void* src, class_t clas, void* x) except -1

ctypedef void* (*init_func_t)(Pool mem, int n, void* extra_args) except NULL

ctypedef int (*del_func_t)(Pool mem, void* state, void* extra_args) except -1

ctypedef int (*finish_func_t)(void* state, void* extra_args) except -1

ctypedef hash_t (*hash_func_t)(void* state, void* x) except 0


cdef struct _State:
void* content
class_t* hist
weight_t score
weight_t loss
int i
int t
bint is_done


cdef class Beam:
cdef Pool mem
cdef class_t nr_class
cdef class_t width
cdef class_t size
cdef public weight_t min_density
cdef int t
cdef readonly bint is_done
cdef list histories
cdef list _parent_histories
cdef weight_t** scores
cdef int** is_valid
cdef weight_t** costs
cdef _State* _parents
cdef _State* _states
cdef del_func_t del_func

cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1

cdef inline void* at(self, int i) nogil:
return self._states[i].content

cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1
cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
void* extra_args) except -1
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1


cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
self.scores[i][j] = score
self.is_valid[i][j] = is_valid
self.costs[i][j] = cost

cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
const weight_t* costs) except -1
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1


cdef class MaxViolation:
cdef Pool mem
cdef weight_t cost
cdef weight_t delta
cdef readonly weight_t p_score
cdef readonly weight_t g_score
cdef readonly double Z
cdef readonly double gZ
cdef class_t n
cdef readonly list p_hist
cdef readonly list g_hist
cdef readonly list p_probs
cdef readonly list g_probs

cpdef int check(self, Beam pred, Beam gold) except -1
cpdef int check_crf(self, Beam pred, Beam gold) except -1
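The Beam declaration above keeps, for each of the `width` candidate states, a row of per-class scores, validity flags and costs, and expands them into the best scoring continuations. As a loose illustration of that expansion step (keep the k best (state, action) pairs by total score), here is a plain-Python heap-based sketch; none of these names come from the diff, and score_actions is an assumed helper:

import heapq

def expand_beam(beam, k):
    # beam: list of (total_score, history) pairs for the current step.
    # Each candidate expands into per-action continuations; keep the best k.
    candidates = []
    for total, history in beam:
        for action, score in score_actions(history):   # assumed scoring helper
            candidates.append((total + score, history + [action]))
    return heapq.nlargest(k, candidates, key=lambda item: item[0])
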
306
spacy/pipeline/_parser_internals/search.pyx
Normal file
306
spacy/pipeline/_parser_internals/search.pyx
Normal file
|
@ -0,0 +1,306 @@
|
|||
# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
|
||||
cimport cython
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.math cimport log, exp
|
||||
import math
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
|
||||
|
||||
cdef class Beam:
|
||||
def __init__(self, class_t nr_class, class_t width, weight_t min_density=0.0):
|
||||
assert nr_class != 0
|
||||
assert width != 0
|
||||
self.nr_class = nr_class
|
||||
self.width = width
|
||||
self.min_density = min_density
|
||||
self.size = 1
|
||||
self.t = 0
|
||||
self.mem = Pool()
|
||||
self.del_func = NULL
|
||||
self._parents = <_State*>self.mem.alloc(self.width, sizeof(_State))
|
||||
self._states = <_State*>self.mem.alloc(self.width, sizeof(_State))
|
||||
cdef int i
|
||||
self.histories = [[] for i in range(self.width)]
|
||||
self._parent_histories = [[] for i in range(self.width)]
|
||||
|
||||
self.scores = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
|
||||
self.is_valid = <int**>self.mem.alloc(self.width, sizeof(weight_t*))
|
||||
self.costs = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
|
||||
for i in range(self.width):
|
||||
self.scores[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
|
||||
self.is_valid[i] = <int*>self.mem.alloc(self.nr_class, sizeof(int))
|
||||
self.costs[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
|
||||
|
||||
def __len__(self):
|
||||
return self.size
|
||||
|
||||
property score:
|
||||
def __get__(self):
|
||||
return self._states[0].score
|
||||
|
||||
property min_score:
|
||||
def __get__(self):
|
||||
return self._states[self.size-1].score
|
||||
|
||||
property loss:
|
||||
def __get__(self):
|
||||
return self._states[0].loss
|
||||
|
||||
property probs:
|
||||
def __get__(self):
|
||||
return _softmax([self._states[i].score for i in range(self.size)])
|
||||
|
||||
property scores:
|
||||
def __get__(self):
|
||||
return [self._states[i].score for i in range(self.size)]
|
||||
|
||||
property histories:
|
||||
def __get__(self):
|
||||
return self.histories
|
||||
|
||||
cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
|
||||
const weight_t* costs) except -1:
|
||||
cdef int j
|
||||
for j in range(self.nr_class):
|
||||
self.scores[i][j] = scores[j]
|
||||
self.is_valid[i][j] = is_valid[j]
|
||||
self.costs[i][j] = costs[j]
|
||||
|
||||
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
|
||||
cdef int i, j
|
||||
for i in range(self.width):
|
||||
memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
|
||||
memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
|
||||
memcpy(self.costs[i], costs[i], sizeof(int) * self.nr_class)
|
||||
|
||||
cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1:
|
||||
for i in range(self.width):
|
||||
self._states[i].content = init_func(self.mem, n, extra_args)
|
||||
self._parents[i].content = init_func(self.mem, n, extra_args)
|
||||
self.del_func = del_func
|
||||
|
||||
def __dealloc__(self):
|
||||
if self.del_func == NULL:
|
||||
return
|
||||
|
||||
for i in range(self.width):
|
||||
self.del_func(self.mem, self._states[i].content, NULL)
|
||||
self.del_func(self.mem, self._parents[i].content, NULL)
|
||||
|
||||
@cython.cdivision(True)
|
||||
cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
|
||||
void* extra_args) except -1:
|
||||
cdef weight_t** scores = self.scores
|
||||
cdef int** is_valid = self.is_valid
|
||||
cdef weight_t** costs = self.costs
|
||||
|
||||
cdef Queue* q = new Queue()
|
||||
self._fill(q, scores, is_valid)
|
||||
# For a beam of width k, we only ever need 2k state objects. How?
|
||||
# Each transition takes a parent and a class and produces a new state.
|
||||
# So, we don't need the whole history --- just the parent. So at
|
||||
# each step, we take a parent, and apply one or more extensions to
|
||||
# it.
|
||||
self._parents, self._states = self._states, self._parents
|
||||
self._parent_histories, self.histories = self.histories, self._parent_histories
|
||||
cdef weight_t score
|
||||
cdef int p_i
|
||||
cdef int i = 0
|
||||
cdef class_t clas
|
||||
cdef _State* parent
|
||||
cdef _State* state
|
||||
cdef hash_t key
|
||||
cdef PreshMap seen_states = PreshMap(self.width)
|
||||
cdef uint64_t is_seen
|
||||
cdef uint64_t one = 1
|
||||
while i < self.width and not q.empty():
|
||||
data = q.top()
|
||||
p_i = data.second / self.nr_class
|
||||
clas = data.second % self.nr_class
|
||||
score = data.first
|
||||
q.pop()
|
||||
parent = &self._parents[p_i]
|
||||
# Indicates terminal state reached; i.e. state is done
|
||||
if parent.is_done:
|
||||
# Now parent will not be changed, so we don't have to copy.
|
||||
# Once finished, should also be unbranching.
|
||||
self._states[i], parent[0] = parent[0], self._states[i]
|
||||
parent.i = self._states[i].i
|
||||
parent.t = self._states[i].t
|
||||
parent.is_done = self._states[i].t
|
||||
self._states[i].score = score
|
||||
self.histories[i] = list(self._parent_histories[p_i])
|
||||
i += 1
|
||||
else:
|
||||
state = &self._states[i]
|
||||
# The supplied transition function should adjust the destination
|
||||
# state to be the result of applying the class to the source state
|
||||
transition_func(state.content, parent.content, clas, extra_args)
|
||||
key = hash_func(state.content, extra_args) if hash_func is not NULL else 0
|
||||
is_seen = <uint64_t>seen_states.get(key)
|
||||
if key == 0 or key == 1 or not is_seen:
|
||||
if key != 0 and key != 1:
|
||||
seen_states.set(key, <void*>one)
|
||||
state.score = score
|
||||
state.loss = parent.loss + costs[p_i][clas]
|
||||
self.histories[i] = list(self._parent_histories[p_i])
|
||||
self.histories[i].append(clas)
|
||||
i += 1
|
||||
del q
|
||||
self.size = i
|
||||
assert self.size >= 1
|
||||
for i in range(self.width):
|
||||
memset(self.scores[i], 0, sizeof(weight_t) * self.nr_class)
|
||||
memset(self.costs[i], 0, sizeof(weight_t) * self.nr_class)
|
||||
memset(self.is_valid[i], 0, sizeof(int) * self.nr_class)
|
||||
self.t += 1
|
||||
|
||||
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1:
|
||||
cdef int i
|
||||
for i in range(self.size):
|
||||
if not self._states[i].is_done:
|
||||
self._states[i].is_done = finish_func(self._states[i].content, extra_args)
|
||||
for i in range(self.size):
|
||||
if not self._states[i].is_done:
|
||||
self.is_done = False
|
||||
break
|
||||
else:
|
||||
self.is_done = True
|
||||
|
||||
@cython.cdivision(True)
|
||||
cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1:
|
||||
"""Populate the queue from a k * n matrix of scores, where k is the
|
||||
beam-width, and n is the number of classes.
|
||||
"""
|
||||
cdef Entry entry
|
||||
cdef weight_t score
|
||||
cdef _State* s
|
||||
cdef int i, j, move_id
|
||||
assert self.size >= 1
|
||||
cdef vector[Entry] entries
|
||||
for i in range(self.size):
|
||||
s = &self._states[i]
|
||||
move_id = i * self.nr_class
|
||||
if s.is_done:
|
||||
# Update score by path average, following TACL '13 paper.
|
||||
if self.histories[i]:
|
||||
entry.first = s.score + (s.score / self.t)
|
||||
else:
|
||||
entry.first = s.score
|
||||
entry.second = move_id
|
||||
entries.push_back(entry)
|
||||
else:
|
||||
for j in range(self.nr_class):
|
||||
if is_valid[i][j]:
|
||||
entry.first = s.score + scores[i][j]
|
||||
entry.second = move_id + j
|
||||
entries.push_back(entry)
|
||||
cdef double max_, Z, cutoff
|
||||
if self.min_density == 0.0:
|
||||
for i in range(entries.size()):
|
||||
q.push(entries[i])
|
||||
elif not entries.empty():
|
||||
max_ = entries[0].first
|
||||
Z = 0.
|
||||
cutoff = 0.
|
||||
# Softmax into probabilities, so we can prune
|
||||
for i in range(entries.size()):
|
||||
if entries[i].first > max_:
|
||||
max_ = entries[i].first
|
||||
for i in range(entries.size()):
|
||||
Z += exp(entries[i].first-max_)
|
||||
cutoff = (1. / Z) * self.min_density
|
||||
for i in range(entries.size()):
|
||||
prob = exp(entries[i].first-max_) / Z
|
||||
if prob >= cutoff:
|
||||
q.push(entries[i])
|
||||
|
||||
|
||||
cdef class MaxViolation:
|
||||
def __init__(self):
|
||||
self.p_score = 0.0
|
||||
self.g_score = 0.0
|
||||
self.Z = 0.0
|
||||
self.gZ = 0.0
|
||||
self.delta = -1
|
||||
self.cost = 0
|
||||
self.p_hist = []
|
||||
self.g_hist = []
|
||||
self.p_probs = []
|
||||
self.g_probs = []
|
||||
|
||||
cpdef int check(self, Beam pred, Beam gold) except -1:
|
||||
cdef _State* p = &pred._states[0]
|
||||
cdef _State* g = &gold._states[0]
|
||||
cdef weight_t d = p.score - g.score
|
||||
if p.loss >= 1 and (self.cost == 0 or d > self.delta):
|
||||
self.cost = p.loss
|
||||
self.delta = d
|
||||
self.p_hist = list(pred.histories[0])
|
||||
self.g_hist = list(gold.histories[0])
|
||||
self.p_score = p.score
|
||||
self.g_score = g.score
|
||||
self.Z = 1e-10
|
||||
self.gZ = 1e-10
|
||||
for i in range(pred.size):
|
||||
if pred._states[i].loss > 0:
|
||||
self.Z += exp(pred._states[i].score)
|
||||
for i in range(gold.size):
|
||||
if gold._states[i].loss == 0:
|
||||
prob = exp(gold._states[i].score)
|
||||
self.Z += prob
|
||||
self.gZ += prob
|
||||
|
||||
cpdef int check_crf(self, Beam pred, Beam gold) except -1:
|
||||
d = pred.score - gold.score
|
||||
seen_golds = set([tuple(gold.histories[i]) for i in range(gold.size)])
|
||||
if pred.loss > 0 and (self.cost == 0 or d > self.delta):
|
||||
p_hist = []
|
||||
p_scores = []
|
||||
g_hist = []
|
||||
g_scores = []
|
||||
for i in range(pred.size):
|
||||
if pred._states[i].loss > 0:
|
||||
p_scores.append(pred._states[i].score)
|
||||
p_hist.append(list(pred.histories[i]))
|
||||
# This can happen from non-monotonic actions
|
||||
# If we find a better gold analysis this way, be sure to keep it.
|
||||
elif pred._states[i].loss <= 0 \
|
||||
and tuple(pred.histories[i]) not in seen_golds:
|
||||
g_scores.append(pred._states[i].score)
|
||||
g_hist.append(list(pred.histories[i]))
|
||||
for i in range(gold.size):
|
||||
if gold._states[i].loss == 0:
|
||||
g_scores.append(gold._states[i].score)
|
||||
g_hist.append(list(gold.histories[i]))
|
||||
|
||||
all_probs = _softmax(p_scores + g_scores)
|
||||
p_probs = all_probs[:len(p_scores)]
|
||||
g_probs_all = all_probs[len(p_scores):]
|
||||
g_probs = _softmax(g_scores)
|
||||
|
||||
self.cost = pred.loss
|
||||
self.delta = d
|
||||
self.p_hist = p_hist
|
||||
self.g_hist = g_hist
|
||||
# TODO: These variables are misnamed! These are the gradients of the loss.
|
||||
self.p_probs = p_probs
|
||||
# Intuition here:
|
||||
# The gradient of the loss is:
|
||||
# P(model) - P(truth)
|
||||
# Normally, P(truth) is 1 for the gold
|
||||
# But, if we want to do the "partial credit" scheme, we want
|
||||
# to create a distribution over the gold, proportional to the scores
|
||||
# awarded.
|
||||
self.g_probs = [x-y for x, y in zip(g_probs_all, g_probs)]
|
||||
|
||||
|
||||
def _softmax(nums):
|
||||
if not nums:
|
||||
return []
|
||||
max_ = max(nums)
|
||||
nums = [(exp(n-max_) if n is not None else None) for n in nums]
|
||||
Z = sum(n for n in nums if n is not None)
|
||||
return [(n/Z if n is not None else None) for n in nums]
|
|
@@ -20,6 +20,10 @@ cdef class StateClass:
if self._borrowed != 1:
del self.c

@property
def history(self):
return list(self.c.history)

@property
def stack(self):
return [self.S(i) for i in range(self.c.stack_depth())]

@@ -176,3 +180,6 @@ cdef class StateClass:

def clone(self, StateClass src):
self.c.clone(src.c)

def set_context_tokens(self, int[:, :] output, int row, int n_feats):
self.c.set_context_tokens(&output[row, 0], n_feats)

@@ -53,3 +53,10 @@ cdef class TransitionSystem:

cdef int set_costs(self, int* is_valid, weight_t* costs,
const StateC* state, gold) except -1


cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
int batch_size) nogil

cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
int nr_class, int batch_size) nogil

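The new c_apply_actions and c_transition_batch declarations above batch the per-state greedy update. A small Python sketch of what one such update does for a single state (choose the best valid action, apply it, record it in the history); the method names mirror the Cython code but the objects and get_valid helper are stand-ins, not APIs from the diff:

def transition_state(moves, state, scores):
    # scores: one float per action class for this state
    is_valid = moves.get_valid(state)           # stand-in for moves.set_valid(...)
    guess = arg_max_if_valid(scores, is_valid)  # see the helper sketched earlier
    if guess == -1:
        # No valid action left: force the state to finish rather than loop forever.
        state.force_final()
        return
    action = moves.actions[guess]
    action.do(state, action.label)
    state.history.append(guess)
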
@ -1,6 +1,8 @@
|
|||
# cython: infer_types=True
|
||||
from __future__ import print_function
|
||||
from cymem.cymem cimport Pool
|
||||
from libc.stdlib cimport calloc, free
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from collections import Counter
|
||||
import srsly
|
||||
|
@ -10,6 +12,7 @@ from ...typedefs cimport weight_t, attr_t
|
|||
from ...tokens.doc cimport Doc
|
||||
from ...structs cimport TokenC
|
||||
from .stateclass cimport StateClass
|
||||
from ._parser_utils cimport arg_max_if_valid
|
||||
|
||||
from ...errors import Errors
|
||||
from ... import util
|
||||
|
@ -73,7 +76,18 @@ cdef class TransitionSystem:
|
|||
offset += len(doc)
|
||||
return states
|
||||
|
||||
def follow_history(self, doc, history):
|
||||
cdef int clas
|
||||
cdef StateClass state = StateClass(doc)
|
||||
for clas in history:
|
||||
action = self.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(clas)
|
||||
return state
|
||||
|
||||
def get_oracle_sequence(self, Example example, _debug=False):
|
||||
if not self.has_gold(example):
|
||||
return []
|
||||
states, golds, _ = self.init_gold_batch([example])
|
||||
if not states:
|
||||
return []
|
||||
|
@ -85,6 +99,8 @@ cdef class TransitionSystem:
|
|||
return self.get_oracle_sequence_from_state(state, gold)
|
||||
|
||||
def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
|
||||
if state.is_final():
|
||||
return []
|
||||
cdef Pool mem = Pool()
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
assert self.n_moves > 0
|
||||
|
@ -110,6 +126,7 @@ cdef class TransitionSystem:
|
|||
"S0 head?", str(state.has_head(state.S(0))),
|
||||
)))
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(i)
|
||||
break
|
||||
else:
|
||||
if _debug:
|
||||
|
@ -137,6 +154,28 @@ cdef class TransitionSystem:
|
|||
raise ValueError(Errors.E170.format(name=name))
|
||||
action = self.lookup_transition(name)
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(action.clas)
|
||||
|
||||
def apply_actions(self, states, const int[::1] actions):
|
||||
assert len(states) == actions.shape[0]
|
||||
cdef StateClass state
|
||||
cdef vector[StateC*] c_states
|
||||
c_states.resize(len(states))
|
||||
cdef int i
|
||||
for (i, state) in enumerate(states):
|
||||
c_states[i] = state.c
|
||||
c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
|
||||
return [state for state in states if not state.c.is_final()]
|
||||
|
||||
def transition_states(self, states, float[:, ::1] scores):
|
||||
assert len(states) == scores.shape[0]
|
||||
cdef StateClass state
|
||||
cdef float* c_scores = &scores[0, 0]
|
||||
cdef vector[StateC*] c_states
|
||||
for state in states:
|
||||
c_states.push_back(state.c)
|
||||
c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
|
||||
return [state for state in states if not state.c.is_final()]
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
raise NotImplementedError
|
||||
|
@ -250,3 +289,35 @@ cdef class TransitionSystem:
|
|||
self.cfg.update(msg['cfg'])
|
||||
self.initialize_actions(labels)
|
||||
return self
|
||||
|
||||
|
||||
cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
|
||||
int batch_size) nogil:
|
||||
cdef int i
|
||||
cdef Transition action
|
||||
cdef StateC* state
|
||||
for i in range(batch_size):
|
||||
state = states[i]
|
||||
action = moves.c[actions[i]]
|
||||
action.do(state, action.label)
|
||||
state.history.push_back(action.clas)
|
||||
|
||||
|
||||
cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil:
|
||||
is_valid = <int*>calloc(moves.n_moves, sizeof(int))
|
||||
cdef int i, guess
|
||||
cdef Transition action
|
||||
for i in range(batch_size):
|
||||
moves.set_valid(is_valid, states[i])
|
||||
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
|
||||
if guess == -1:
|
||||
# This shouldn't happen, but it's hard to raise an error here,
|
||||
# and we don't want to infinite loop. So, force to end state.
|
||||
states[i].force_final()
|
||||
else:
|
||||
action = moves.c[guess]
|
||||
action.do(states[i], action.label)
|
||||
states[i].history.push_back(guess)
|
||||
free(is_valid)
|
||||
|
||||
|
|
|
@@ -11,7 +11,7 @@ from ..matcher import Matcher
from ..scorer import Scorer
from ..symbols import IDS
from ..tokens import Doc, Span
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
from ..vocab import Vocab
from ..util import SimpleFrozenList, registry
from .. import util
@ -4,8 +4,8 @@ from typing import Optional, Iterable, Callable
|
|||
from thinc.api import Model, Config
|
||||
|
||||
from ._parser_internals.transition_system import TransitionSystem
|
||||
from .transition_parser cimport Parser
|
||||
from ._parser_internals.arc_eager cimport ArcEager
|
||||
from .transition_parser import Parser
|
||||
from ._parser_internals.arc_eager import ArcEager
|
||||
|
||||
from .functions import merge_subtokens
|
||||
from ..language import Language
|
||||
|
@ -18,12 +18,11 @@ from ..util import registry
|
|||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
|
@ -123,6 +122,7 @@ def make_parser(
|
|||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"beam_parser",
|
||||
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
|
||||
|
@ -228,6 +228,7 @@ def parser_score(examples, **kwargs):
|
|||
|
||||
DOCS: https://spacy.io/api/dependencyparser#score
|
||||
"""
|
||||
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
|
@ -235,8 +236,11 @@ def parser_score(examples, **kwargs):
|
|||
dep = getattr(token, attr)
|
||||
dep = token.vocab.strings.as_string(dep).lower()
|
||||
return dep
|
||||
|
||||
results = {}
|
||||
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
|
||||
results.update(
|
||||
Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
|
||||
)
|
||||
kwargs.setdefault("getter", dep_getter)
|
||||
kwargs.setdefault("ignore_labels", ("p", "punct"))
|
||||
results.update(Scorer.score_deps(examples, "dep", **kwargs))
|
||||
|
@ -249,11 +253,12 @@ def make_parser_scorer():
|
|||
return parser_score
|
||||
|
||||
|
||||
cdef class DependencyParser(Parser):
|
||||
class DependencyParser(Parser):
|
||||
"""Pipeline component for dependency parsing.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser
|
||||
"""
|
||||
|
||||
TransitionSystem = ArcEager
|
||||
|
||||
def __init__(
|
||||
|
@ -273,8 +278,7 @@ cdef class DependencyParser(Parser):
|
|||
incorrect_spans_key=None,
|
||||
scorer=parser_score,
|
||||
):
|
||||
"""Create a DependencyParser.
|
||||
"""
|
||||
"""Create a DependencyParser."""
|
||||
super().__init__(
|
||||
vocab,
|
||||
model,
|
|
@ -1,12 +1,13 @@
|
|||
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
|
||||
from typing import cast, Any, Callable, Dict, Iterable, List, Optional, Union
|
||||
from typing import Tuple
|
||||
from collections import Counter
|
||||
from itertools import islice
|
||||
import numpy as np
|
||||
|
||||
import srsly
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints1d, Ints2d
|
||||
from thinc.api import Config, Model
|
||||
from thinc.types import ArrayXd, Floats2d, Ints1d
|
||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||
|
||||
from ._edit_tree_internals.edit_trees import EditTrees
|
||||
from ._edit_tree_internals.schemas import validate_edit_tree
|
||||
|
@ -20,6 +21,9 @@ from ..vocab import Vocab
|
|||
from .. import util
|
||||
|
||||
|
||||
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
||||
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -48,6 +52,7 @@ DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["mo
|
|||
"overwrite": False,
|
||||
"top_k": 1,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
|
@ -60,6 +65,7 @@ def make_edit_tree_lemmatizer(
|
|||
overwrite: bool,
|
||||
top_k: int,
|
||||
scorer: Optional[Callable],
|
||||
save_activations: bool,
|
||||
):
|
||||
"""Construct an EditTreeLemmatizer component."""
|
||||
return EditTreeLemmatizer(
|
||||
|
@ -71,6 +77,7 @@ def make_edit_tree_lemmatizer(
|
|||
overwrite=overwrite,
|
||||
top_k=top_k,
|
||||
scorer=scorer,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
|
@ -90,6 +97,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
overwrite: bool = False,
|
||||
top_k: int = 1,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
save_activations: bool = False,
|
||||
):
|
||||
"""
|
||||
Construct an edit tree lemmatizer.
|
||||
|
@ -101,6 +109,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
frequency in the training data.
|
||||
overwrite (bool): overwrite existing lemma annotations.
|
||||
top_k (int): try to apply at most the k most probable edit trees.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -115,12 +124,15 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
self.cfg: Dict[str, Any] = {"labels": []}
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
||||
def get_loss(
|
||||
self, examples: Iterable[Example], scores: List[Floats2d]
|
||||
) -> Tuple[float, List[Floats2d]]:
|
||||
validate_examples(examples, "EditTreeLemmatizer.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
|
||||
loss_func = LegacySequenceCategoricalCrossentropy(
|
||||
normalize=False, missing_value=-1
|
||||
)
|
||||
|
||||
truths = []
|
||||
for eg in examples:
|
||||
|
@ -143,19 +155,43 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
return float(loss), d_scores
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
|
||||
def get_teacher_student_loss(
|
||||
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
|
||||
) -> Tuple[float, List[Floats2d]]:
|
||||
"""Calculate the loss and its gradient for a batch of student
|
||||
scores, relative to teacher scores.
|
||||
|
||||
teacher_scores: Scores representing the teacher model's predictions.
|
||||
student_scores: Scores representing the student model's predictions.
|
||||
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
|
||||
"""
|
||||
loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
|
||||
d_scores, loss = loss_func(student_scores, teacher_scores)
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError(Errors.E910.format(name=self.name))
|
||||
return float(loss), d_scores
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
||||
n_docs = len(list(docs))
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
n_labels = len(self.cfg["labels"])
|
||||
guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
|
||||
guesses: List[Ints1d] = [
|
||||
self.model.ops.alloc((0,), dtype="i") for doc in docs
|
||||
]
|
||||
scores: List[Floats2d] = [
|
||||
self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
|
||||
]
|
||||
assert len(guesses) == n_docs
|
||||
return guesses
|
||||
return {"probabilities": scores, "tree_ids": guesses}
|
||||
scores = self.model.predict(docs)
|
||||
assert len(scores) == n_docs
|
||||
guesses = self._scores2guesses(docs, scores)
|
||||
assert len(guesses) == n_docs
|
||||
return guesses
|
||||
return {"probabilities": scores, "tree_ids": guesses}
|
||||
|
||||
def _scores2guesses(self, docs, scores):
|
||||
guesses = []
|
||||
|
@ -183,8 +219,13 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
return guesses
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
|
||||
batch_tree_ids = activations["tree_ids"]
|
||||
for i, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
for act_name, acts in activations.items():
|
||||
doc.activations[self.name][act_name] = acts[i]
|
||||
doc_tree_ids = batch_tree_ids[i]
|
||||
if hasattr(doc_tree_ids, "get"):
|
||||
doc_tree_ids = doc_tree_ids.get()
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
from typing import Optional, Iterable, Callable, Dict, Union, List, Any
|
||||
from thinc.types import Floats2d
|
||||
from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
|
||||
from typing import cast
|
||||
from numpy import dtype
|
||||
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
|
||||
from pathlib import Path
|
||||
from itertools import islice
|
||||
import srsly
|
||||
|
@ -21,6 +23,11 @@ from ..util import SimpleFrozenList, registry
|
|||
from .. import util
|
||||
from ..scorer import Scorer
|
||||
|
||||
|
||||
ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
|
||||
|
||||
KNOWLEDGE_BASE_IDS = "kb_ids"
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
|
||||
|
@ -59,6 +66,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"use_gold_ents": True,
|
||||
"candidates_batch_size": 1,
|
||||
"threshold": None,
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={
|
||||
"nel_micro_f": 1.0,
|
||||
|
@ -85,6 +93,7 @@ def make_entity_linker(
|
|||
use_gold_ents: bool,
|
||||
candidates_batch_size: int,
|
||||
threshold: Optional[float] = None,
|
||||
save_activations: bool,
|
||||
):
|
||||
"""Construct an EntityLinker component.
|
||||
|
||||
|
@ -107,6 +116,7 @@ def make_entity_linker(
|
|||
candidates_batch_size (int): Size of batches for entity candidate generation.
|
||||
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
|
||||
prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
|
||||
if not model.attrs.get("include_span_maker", False):
|
||||
|
@ -140,6 +150,7 @@ def make_entity_linker(
|
|||
use_gold_ents=use_gold_ents,
|
||||
candidates_batch_size=candidates_batch_size,
|
||||
threshold=threshold,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
|
@ -180,6 +191,7 @@ class EntityLinker(TrainablePipe):
|
|||
use_gold_ents: bool,
|
||||
candidates_batch_size: int,
|
||||
threshold: Optional[float] = None,
|
||||
save_activations: bool = False,
|
||||
) -> None:
|
||||
"""Initialize an entity linker.
|
||||
|
||||
|
@ -234,6 +246,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.use_gold_ents = use_gold_ents
|
||||
self.candidates_batch_size = candidates_batch_size
|
||||
self.threshold = threshold
|
||||
self.save_activations = save_activations
|
||||
|
||||
if candidates_batch_size < 1:
|
||||
raise ValueError(Errors.E1044)
|
||||
|
@ -422,7 +435,7 @@ class EntityLinker(TrainablePipe):
|
|||
loss = loss / len(entity_encodings)
|
||||
return float(loss), out
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> List[str]:
|
||||
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
Returns the KB IDs for each entity in each doc, including NIL if there is
|
||||
no prediction.
|
||||
|
@ -435,13 +448,20 @@ class EntityLinker(TrainablePipe):
|
|||
self.validate_kb()
|
||||
entity_count = 0
|
||||
final_kb_ids: List[str] = []
|
||||
xp = self.model.ops.xp
|
||||
ops = self.model.ops
|
||||
xp = ops.xp
|
||||
docs_ents: List[Ragged] = []
|
||||
docs_scores: List[Ragged] = []
|
||||
if not docs:
|
||||
return final_kb_ids
|
||||
return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
for i, doc in enumerate(docs):
|
||||
for doc in docs:
|
||||
doc_ents: List[Ints1d] = []
|
||||
doc_scores: List[Floats1d] = []
|
||||
if len(doc) == 0:
|
||||
docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
|
||||
docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
|
||||
continue
|
||||
sentences = [s for s in doc.sents]
|
||||
|
||||
|
@ -489,14 +509,32 @@ class EntityLinker(TrainablePipe):
|
|||
if ent.label_ in self.labels_discard:
|
||||
# ignoring this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=[0.0],
|
||||
ents=[0],
|
||||
)
|
||||
else:
|
||||
candidates = list(batch_candidates[j])
|
||||
if not candidates:
|
||||
# no prediction possible for this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=[0.0],
|
||||
ents=[0],
|
||||
)
|
||||
elif len(candidates) == 1 and self.threshold is None:
|
||||
# shortcut for efficiency reasons: take the 1 candidate
|
||||
final_kb_ids.append(candidates[0].entity_)
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=[1.0],
|
||||
ents=[candidates[0].entity_],
|
||||
)
|
||||
else:
|
||||
random.shuffle(candidates)
|
||||
# set all prior probabilities to 0 if incl_prior=False
|
||||
|
@ -530,28 +568,48 @@ class EntityLinker(TrainablePipe):
|
|||
or scores.max() >= self.threshold
|
||||
else EntityLinker.NIL
|
||||
)
|
||||
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=scores,
|
||||
ents=[c.entity for c in candidates],
|
||||
)
|
||||
self._add_doc_activations(
|
||||
docs_scores=docs_scores,
|
||||
docs_ents=docs_ents,
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
)
|
||||
if not (len(final_kb_ids) == entity_count):
|
||||
err = Errors.E147.format(
|
||||
method="predict", msg="result variables not of equal length"
|
||||
)
|
||||
raise RuntimeError(err)
|
||||
return final_kb_ids
|
||||
return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
|
||||
activations (ActivationsT): The activations used for setting annotations, produced
|
||||
by EntityLinker.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#set_annotations
|
||||
"""
|
||||
kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
|
||||
count_ents = len([ent for doc in docs for ent in doc.ents])
|
||||
if count_ents != len(kb_ids):
|
||||
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
||||
i = 0
|
||||
overwrite = self.cfg["overwrite"]
|
||||
for doc in docs:
|
||||
for j, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
for act_name, acts in activations.items():
|
||||
if act_name != KNOWLEDGE_BASE_IDS:
|
||||
# We only copy activations that are Ragged.
|
||||
doc.activations[self.name][act_name] = cast(Ragged, acts[j])
|
||||
|
||||
for ent in doc.ents:
|
||||
kb_id = kb_ids[i]
|
||||
i += 1
|
||||
|
@ -650,3 +708,32 @@ class EntityLinker(TrainablePipe):
|
|||
|
||||
def add_label(self, label):
|
||||
raise NotImplementedError
|
||||
|
||||
def _add_doc_activations(
|
||||
self,
|
||||
*,
|
||||
docs_scores: List[Ragged],
|
||||
docs_ents: List[Ragged],
|
||||
doc_scores: List[Floats1d],
|
||||
doc_ents: List[Ints1d],
|
||||
):
|
||||
if not self.save_activations:
|
||||
return
|
||||
ops = self.model.ops
|
||||
lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
|
||||
docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
|
||||
docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))
|
||||
|
||||
def _add_activations(
|
||||
self,
|
||||
*,
|
||||
doc_scores: List[Floats1d],
|
||||
doc_ents: List[Ints1d],
|
||||
scores: Sequence[float],
|
||||
ents: Sequence[int],
|
||||
):
|
||||
if not self.save_activations:
|
||||
return
|
||||
ops = self.model.ops
|
||||
doc_scores.append(ops.asarray1f(scores))
|
||||
doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
|
||||
|
|
|
@@ -1,541 +0,0 @@
from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
import warnings
from collections import defaultdict
from pathlib import Path
import srsly

from .pipe import Pipe
from ..training import Example
from ..language import Language
from ..errors import Errors, Warnings
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher
from ..matcher.levenshtein import levenshtein_compare
from ..scorer import get_ner_prf


DEFAULT_ENT_ID_SEP = "||"
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]


@Language.factory(
    "entity_ruler",
    assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
    default_config={
        "phrase_matcher_attr": None,
        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
        "validate": False,
        "overwrite_ents": False,
        "ent_id_sep": DEFAULT_ENT_ID_SEP,
        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
    },
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_entity_ruler(
    nlp: Language,
    name: str,
    phrase_matcher_attr: Optional[Union[int, str]],
    matcher_fuzzy_compare: Callable,
    validate: bool,
    overwrite_ents: bool,
    ent_id_sep: str,
    scorer: Optional[Callable],
):
    return EntityRuler(
        nlp,
        name,
        phrase_matcher_attr=phrase_matcher_attr,
        matcher_fuzzy_compare=matcher_fuzzy_compare,
        validate=validate,
        overwrite_ents=overwrite_ents,
        ent_id_sep=ent_id_sep,
        scorer=scorer,
    )


def entity_ruler_score(examples, **kwargs):
    return get_ner_prf(examples)


@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer():
    return entity_ruler_score


class EntityRuler(Pipe):
    """The EntityRuler lets you add spans to the `Doc.ents` using token-based
    rules or exact phrase matches. It can be combined with the statistical
    `EntityRecognizer` to boost accuracy, or used on its own to implement a
    purely rule-based entity recognition system. After initialization, the
    component is typically added to the pipeline using `nlp.add_pipe`.

    DOCS: https://spacy.io/api/entityruler
    USAGE: https://spacy.io/usage/rule-based-matching#entityruler
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "entity_ruler",
        *,
        phrase_matcher_attr: Optional[Union[int, str]] = None,
        matcher_fuzzy_compare: Callable = levenshtein_compare,
        validate: bool = False,
        overwrite_ents: bool = False,
        ent_id_sep: str = DEFAULT_ENT_ID_SEP,
        patterns: Optional[List[PatternType]] = None,
        scorer: Optional[Callable] = entity_ruler_score,
    ) -> None:
        """Initialize the entity ruler. If patterns are supplied here, they
        need to be a list of dictionaries with a `"label"` and `"pattern"`
        key. A pattern can either be a token pattern (list) or a phrase pattern
        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.

        nlp (Language): The shared nlp object to pass the vocab to the matchers
            and process phrase patterns.
        name (str): Instance name of the current pipeline component. Typically
            passed in automatically from the factory when the component is
            added. Used to disable the current entity ruler while creating
            phrase patterns with the nlp object.
        phrase_matcher_attr (int / str): Token attribute to match on, passed
            to the internal PhraseMatcher as `attr`.
        matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
            internal Matcher. Defaults to
            spacy.matcher.levenshtein.levenshtein_compare.
        validate (bool): Whether patterns should be validated, passed to
            Matcher and PhraseMatcher as `validate`.
        patterns (iterable): Optional patterns to load in.
        overwrite_ents (bool): If existing entities are present, e.g. entities
            added by the model, overwrite them by matches if necessary.
        ent_id_sep (str): Separator used internally for entity IDs.
        scorer (Optional[Callable]): The scoring method. Defaults to
            spacy.scorer.get_ner_prf.

        DOCS: https://spacy.io/api/entityruler#init
        """
        self.nlp = nlp
        self.name = name
        self.overwrite = overwrite_ents
        self.token_patterns = defaultdict(list)  # type: ignore
        self.phrase_patterns = defaultdict(list)  # type: ignore
        self._validate = validate
        self.matcher_fuzzy_compare = matcher_fuzzy_compare
        self.matcher = Matcher(
            nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare
        )
        self.phrase_matcher_attr = phrase_matcher_attr
        self.phrase_matcher = PhraseMatcher(
            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
        )
        self.ent_id_sep = ent_id_sep
        self._ent_ids = defaultdict(tuple)  # type: ignore
        if patterns is not None:
            self.add_patterns(patterns)
        self.scorer = scorer

    def __len__(self) -> int:
        """The number of all patterns added to the entity ruler."""
        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
        return n_token_patterns + n_phrase_patterns

    def __contains__(self, label: str) -> bool:
        """Whether a label is present in the patterns."""
        return label in self.token_patterns or label in self.phrase_patterns

    def __call__(self, doc: Doc) -> Doc:
        """Find matches in document and add them as entities.

        doc (Doc): The Doc object in the pipeline.
        RETURNS (Doc): The Doc with added entities, if available.

        DOCS: https://spacy.io/api/entityruler#call
        """
        error_handler = self.get_error_handler()
        try:
            matches = self.match(doc)
            self.set_annotations(doc, matches)
            return doc
        except Exception as e:
            return error_handler(self.name, self, [doc], e)

    def match(self, doc: Doc):
        self._require_patterns()
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="\\[W036")
            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))

        final_matches = set(
            [(m_id, start, end) for m_id, start, end in matches if start != end]
        )
        get_sort_key = lambda m: (m[2] - m[1], -m[1])
        final_matches = sorted(final_matches, key=get_sort_key, reverse=True)
        return final_matches

    def set_annotations(self, doc, matches):
        """Modify the document in place"""
        entities = list(doc.ents)
        new_entities = []
        seen_tokens = set()
        for match_id, start, end in matches:
            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
                continue
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                if match_id in self._ent_ids:
                    label, ent_id = self._ent_ids[match_id]
                    span = Span(doc, start, end, label=label, span_id=ent_id)
                else:
                    span = Span(doc, start, end, label=match_id)
                new_entities.append(span)
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))
        doc.ents = entities + new_entities

    @property
    def labels(self) -> Tuple[str, ...]:
        """All labels present in the match patterns.

        RETURNS (tuple): The string labels.

        DOCS: https://spacy.io/api/entityruler#labels
        """
        keys = set(self.token_patterns.keys())
        keys.update(self.phrase_patterns.keys())
        all_labels = set()

        for l in keys:
            if self.ent_id_sep in l:
                label, _ = self._split_label(l)
                all_labels.add(label)
            else:
                all_labels.add(l)
        return tuple(sorted(all_labels))

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        patterns: Optional[Sequence[PatternType]] = None,
    ):
        """Initialize the pipe for training.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        patterns (Optional[Sequence[PatternType]]): The list of patterns.

        DOCS: https://spacy.io/api/entityruler#initialize
        """
        self.clear()
        if patterns:
            self.add_patterns(patterns)  # type: ignore[arg-type]

    @property
    def ent_ids(self) -> Tuple[Optional[str], ...]:
        """All entity ids present in the match patterns' `id` properties.

        RETURNS (tuple): The string entity ids.

        DOCS: https://spacy.io/api/entityruler#ent_ids
        """
        keys = set(self.token_patterns.keys())
        keys.update(self.phrase_patterns.keys())
        all_ent_ids = set()

        for l in keys:
            if self.ent_id_sep in l:
                _, ent_id = self._split_label(l)
                all_ent_ids.add(ent_id)
        return tuple(all_ent_ids)

    @property
    def patterns(self) -> List[PatternType]:
        """Get all patterns that were added to the entity ruler.

        RETURNS (list): The original patterns, one dictionary per pattern.

        DOCS: https://spacy.io/api/entityruler#patterns
        """
        all_patterns = []
        for label, patterns in self.token_patterns.items():
            for pattern in patterns:
                ent_label, ent_id = self._split_label(label)
                p = {"label": ent_label, "pattern": pattern}
                if ent_id:
                    p["id"] = ent_id
                all_patterns.append(p)
        for label, patterns in self.phrase_patterns.items():
            for pattern in patterns:
                ent_label, ent_id = self._split_label(label)
                p = {"label": ent_label, "pattern": pattern.text}
                if ent_id:
                    p["id"] = ent_id
                all_patterns.append(p)
        return all_patterns

    def add_patterns(self, patterns: List[PatternType]) -> None:
        """Add patterns to the entity ruler. A pattern can either be a token
        pattern (list of dicts) or a phrase pattern (string). For example:
        {'label': 'ORG', 'pattern': 'Apple'}
        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}

        patterns (list): The patterns to add.

        DOCS: https://spacy.io/api/entityruler#add_patterns
        """

        # disable the nlp components after this one in case they haven't been
        # initialized / deserialized yet
        try:
            current_index = -1
            for i, (name, pipe) in enumerate(self.nlp.pipeline):
                if self == pipe:
                    current_index = i
                    break
            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
        except ValueError:
            subsequent_pipes = []
        with self.nlp.select_pipes(disable=subsequent_pipes):
            token_patterns = []
            phrase_pattern_labels = []
            phrase_pattern_texts = []
            phrase_pattern_ids = []
            for entry in patterns:
                if isinstance(entry["pattern"], str):
                    phrase_pattern_labels.append(entry["label"])
                    phrase_pattern_texts.append(entry["pattern"])
                    phrase_pattern_ids.append(entry.get("id"))
                elif isinstance(entry["pattern"], list):
                    token_patterns.append(entry)
            phrase_patterns = []
            for label, pattern, ent_id in zip(
                phrase_pattern_labels,
                self.nlp.pipe(phrase_pattern_texts),
                phrase_pattern_ids,
            ):
                phrase_pattern = {"label": label, "pattern": pattern}
                if ent_id:
                    phrase_pattern["id"] = ent_id
                phrase_patterns.append(phrase_pattern)
            for entry in token_patterns + phrase_patterns:  # type: ignore[operator]
                label = entry["label"]  # type: ignore
                if "id" in entry:
                    ent_label = label
                    label = self._create_label(label, entry["id"])
                    key = self.matcher._normalize_key(label)
                    self._ent_ids[key] = (ent_label, entry["id"])
                pattern = entry["pattern"]  # type: ignore
                if isinstance(pattern, Doc):
                    self.phrase_patterns[label].append(pattern)
                    self.phrase_matcher.add(label, [pattern])  # type: ignore
                elif isinstance(pattern, list):
                    self.token_patterns[label].append(pattern)
                    self.matcher.add(label, [pattern])
                else:
                    raise ValueError(Errors.E097.format(pattern=pattern))

    def clear(self) -> None:
        """Reset all patterns."""
        self.token_patterns = defaultdict(list)
        self.phrase_patterns = defaultdict(list)
        self._ent_ids = defaultdict(tuple)
        self.matcher = Matcher(
            self.nlp.vocab,
            validate=self._validate,
            fuzzy_compare=self.matcher_fuzzy_compare,
        )
        self.phrase_matcher = PhraseMatcher(
            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
        )

    def remove(self, ent_id: str) -> None:
        """Remove a pattern by its ent_id if a pattern with this ent_id was added before.

        ent_id (str): id of the pattern to be removed
        RETURNS: None
        DOCS: https://spacy.io/api/entityruler#remove
        """
        label_id_pairs = [
            (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id
        ]
        if not label_id_pairs:
            raise ValueError(
                Errors.E1024.format(attr_type="ID", label=ent_id, component=self.name)
            )
        created_labels = [
            self._create_label(label, eid) for (label, eid) in label_id_pairs
        ]
        # remove the patterns from self.phrase_patterns
        self.phrase_patterns = defaultdict(
            list,
            {
                label: val
                for (label, val) in self.phrase_patterns.items()
                if label not in created_labels
            },
        )
        # remove the patterns from self.token_patterns
        self.token_patterns = defaultdict(
            list,
            {
                label: val
                for (label, val) in self.token_patterns.items()
                if label not in created_labels
            },
        )
        # remove the patterns from the matchers
        for label in created_labels:
            if label in self.phrase_matcher:
                self.phrase_matcher.remove(label)
            else:
                self.matcher.remove(label)

    def _require_patterns(self) -> None:
        """Raise a warning if this component has no patterns defined."""
        if len(self) == 0:
            warnings.warn(Warnings.W036.format(name=self.name))

    def _split_label(self, label: str) -> Tuple[str, Optional[str]]:
        """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep.

        label (str): The value of label in a pattern entry
        RETURNS (tuple): ent_label, ent_id
        """
        if self.ent_id_sep in label:
            ent_label, ent_id = label.rsplit(self.ent_id_sep, 1)
        else:
            ent_label = label
            ent_id = None  # type: ignore
        return ent_label, ent_id

    def _create_label(self, label: Any, ent_id: Any) -> str:
        """Join Entity label with ent_id if the pattern has an `id` attribute.
        If ent_id is not a string, the label is returned as is.

        label (str): The label to set for ent.label_
        ent_id (str): The entity id
        RETURNS (str): The ent_label joined with configured `ent_id_sep`
        """
        if isinstance(ent_id, str):
            label = f"{label}{self.ent_id_sep}{ent_id}"
        return label

    def from_bytes(
        self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> "EntityRuler":
        """Load the entity ruler from a bytestring.

        patterns_bytes (bytes): The bytestring to load.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_bytes
        """
        cfg = srsly.msgpack_loads(patterns_bytes)
        self.clear()
        if isinstance(cfg, dict):
            self.add_patterns(cfg.get("patterns", cfg))
            self.overwrite = cfg.get("overwrite", False)
            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
            self.phrase_matcher = PhraseMatcher(
                self.nlp.vocab,
                attr=self.phrase_matcher_attr,
            )
            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
        else:
            self.add_patterns(cfg)
        return self

    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
        """Serialize the entity ruler patterns to a bytestring.

        RETURNS (bytes): The serialized patterns.

        DOCS: https://spacy.io/api/entityruler#to_bytes
        """
        serial = {
            "overwrite": self.overwrite,
            "ent_id_sep": self.ent_id_sep,
            "phrase_matcher_attr": self.phrase_matcher_attr,
            "patterns": self.patterns,
        }
        return srsly.msgpack_dumps(serial)

    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> "EntityRuler":
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (str / Path): The JSONL file to load.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_disk
        """
        path = ensure_path(path)
        self.clear()
        depr_patterns_path = path.with_suffix(".jsonl")
        if path.suffix == ".jsonl":  # user provides a jsonl
            if path.is_file():
                patterns = srsly.read_jsonl(path)
                self.add_patterns(patterns)
            else:
                raise ValueError(Errors.E1023.format(path=path))
        elif depr_patterns_path.is_file():
            patterns = srsly.read_jsonl(depr_patterns_path)
            self.add_patterns(patterns)
        elif path.is_dir():  # path is a valid directory
            cfg = {}
            deserializers_patterns = {
                "patterns": lambda p: self.add_patterns(
                    srsly.read_jsonl(p.with_suffix(".jsonl"))
                )
            }
            deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
            from_disk(path, deserializers_cfg, {})
            self.overwrite = cfg.get("overwrite", False)
            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)

            self.phrase_matcher = PhraseMatcher(
                self.nlp.vocab, attr=self.phrase_matcher_attr
            )
            from_disk(path, deserializers_patterns, {})
        else:  # path is not a valid directory or file
            raise ValueError(Errors.E146.format(path=path))
        return self

    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (str / Path): The JSONL file to save.

        DOCS: https://spacy.io/api/entityruler#to_disk
        """
        path = ensure_path(path)
        cfg = {
            "overwrite": self.overwrite,
            "phrase_matcher_attr": self.phrase_matcher_attr,
            "ent_id_sep": self.ent_id_sep,
        }
        serializers = {
            "patterns": lambda p: srsly.write_jsonl(
                p.with_suffix(".jsonl"), self.patterns
            ),
            "cfg": lambda p: srsly.write_json(p, cfg),
        }
        if path.suffix == ".jsonl":  # user wants to save only JSONL
            srsly.write_jsonl(path, self.patterns)
        else:
            to_disk(path, serializers, {})
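The whole file above is removed: the `entity_ruler` factory name is taken over by the SpanRuler-backed implementation in span_ruler.py further down, and the pattern format is unchanged. A minimal sketch of the unchanged user-facing API, assuming a blank English pipeline:

    import spacy

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("entity_ruler")  # now constructed from the span_ruler module
    ruler.add_patterns([
        {"label": "ORG", "pattern": "Apple"},
        {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},
    ])
    doc = nlp("Apple opened an office in San Francisco.")
    print([(ent.text, ent.label_) for ent in doc.ents])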
@@ -1,7 +1,9 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Optional, Union, Dict, Callable
from typing import Callable, Dict, Iterable, List, Optional, Union
import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from thinc.api import Model, Config
from thinc.legacy import LegacySequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d
from itertools import islice

from ..tokens.doc cimport Doc

@@ -13,7 +15,7 @@ from ..symbols import POS
from ..language import Language
from ..errors import Errors
from .pipe import deserialize_config
from .tagger import Tagger
from .tagger import ActivationsT, Tagger
from .. import util
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples

@@ -52,7 +54,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
    "morphologizer",
    assigns=["token.morph", "token.pos"],
    default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
    default_config={
        "model": DEFAULT_MORPH_MODEL,
        "overwrite": True,
        "extend": False,
        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
        "save_activations": False,
    },
    default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)
def make_morphologizer(

@@ -62,8 +70,10 @@ def make_morphologizer(
    overwrite: bool,
    extend: bool,
    scorer: Optional[Callable],
    save_activations: bool,
):
    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
                         save_activations=save_activations)


def morphologizer_score(examples, **kwargs):

@@ -95,6 +105,7 @@ class Morphologizer(Tagger):
        overwrite: bool = BACKWARD_OVERWRITE,
        extend: bool = BACKWARD_EXTEND,
        scorer: Optional[Callable] = morphologizer_score,
        save_activations: bool = False,
    ):
        """Initialize a morphologizer.

@@ -105,6 +116,7 @@ class Morphologizer(Tagger):
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_token_attr for the attributes "pos" and "morph" and
            Scorer.score_token_attr_per_feat for the attribute "morph".
        save_activations (bool): save model activations in Doc when annotating.

        DOCS: https://spacy.io/api/morphologizer#init
        """

@@ -124,11 +136,12 @@ class Morphologizer(Tagger):
        }
        self.cfg = dict(sorted(cfg.items()))
        self.scorer = scorer
        self.save_activations = save_activations

    @property
    def labels(self):
        """RETURNS (Tuple[str]): The labels currently added to the component."""
        return tuple(self.cfg["labels_morph"].keys())
        """RETURNS (Iterable[str]): The labels currently added to the component."""
        return self.cfg["labels_morph"].keys()

    @property
    def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:

@@ -151,7 +164,7 @@ class Morphologizer(Tagger):
        # normalize label
        norm_label = self.vocab.morphology.normalize_features(label)
        # extract separate POS and morph tags
        label_dict = Morphology.feats_to_dict(label)
        label_dict = Morphology.feats_to_dict(label, sort_values=False)
        pos = label_dict.get(self.POS_FEAT, "")
        if self.POS_FEAT in label_dict:
            label_dict.pop(self.POS_FEAT)

@@ -189,7 +202,7 @@ class Morphologizer(Tagger):
                continue
            morph = str(token.morph)
            # create and add the combined morph+POS label
            morph_dict = Morphology.feats_to_dict(morph)
            morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
            if pos:
                morph_dict[self.POS_FEAT] = pos
            norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]

@@ -206,7 +219,7 @@ class Morphologizer(Tagger):
            for i, token in enumerate(example.reference):
                pos = token.pos_
                morph = str(token.morph)
                morph_dict = Morphology.feats_to_dict(morph)
                morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
                if pos:
                    morph_dict[self.POS_FEAT] = pos
                norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]

@@ -217,40 +230,48 @@ class Morphologizer(Tagger):
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)

    def set_annotations(self, docs, batch_tag_ids):
    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
        """Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        batch_tag_ids: The IDs to set, produced by Morphologizer.predict.
        activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.

        DOCS: https://spacy.io/api/morphologizer#set_annotations
        """
        batch_tag_ids = activations["label_ids"]
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc
        cdef Vocab vocab = self.vocab
        cdef bint overwrite = self.cfg["overwrite"]
        cdef bint extend = self.cfg["extend"]
        labels = self.labels
        # We require random access for the upcoming ops, so we need
        # to allocate a compatible container out of the iterable.
        labels = tuple(self.labels)
        for i, doc in enumerate(docs):
            if self.save_activations:
                doc.activations[self.name] = {}
                for act_name, acts in activations.items():
                    doc.activations[self.name][act_name] = acts[i]
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, "get"):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                morph = labels[tag_id]
                morph = labels[int(tag_id)]
                # set morph
                if doc.c[j].morph == 0 or overwrite or extend:
                    if overwrite and extend:
                        # morphologizer morph overwrites any existing features
                        # while extending
                        extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
                        extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
                        extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False)
                        extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False))
                        doc.c[j].morph = self.vocab.morphology.add(extended_morph)
                    elif extend:
                        # existing features are preserved and any new features
                        # are added
                        extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
                        extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
                        extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False)
                        extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False))
                        doc.c[j].morph = self.vocab.morphology.add(extended_morph)
                    else:
                        # clobber

@@ -270,7 +291,7 @@ class Morphologizer(Tagger):
        DOCS: https://spacy.io/api/morphologizer#get_loss
        """
        validate_examples(examples, "Morphologizer.get_loss")
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
        truths = []
        for eg in examples:
            eg_truths = []

@@ -291,7 +312,7 @@ class Morphologizer(Tagger):
                    label = None
                # Otherwise, generate the combined label
                else:
                    label_dict = Morphology.feats_to_dict(morph)
                    label_dict = Morphology.feats_to_dict(morph, sort_values=False)
                    if pos:
                        label_dict[self.POS_FEAT] = pos
                    label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
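Since Morphologizer inherits `predict` from Tagger, the stored activations use the same "probabilities"/"label_ids" keys (see the tagger.pyx hunks below). A minimal sketch of enabling the new flag from the factory config, assuming a blank pipeline:

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("morphologizer", config={"save_activations": True})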
@@ -1,221 +0,0 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Optional
import numpy
from thinc.api import CosineDistance, to_categorical, Model, Config
from thinc.api import set_dropout_rate

from ..tokens.doc cimport Doc

from .trainable_pipe import TrainablePipe
from .tagger import Tagger
from ..training import validate_examples
from ..language import Language
from ._parser_internals import nonproj
from ..attrs import POS, ID
from ..errors import Errors


default_model_config = """
[model]
@architectures = "spacy.MultiTask.v1"
maxout_pieces = 3
token_vector_width = 96

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
"""
DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]


@Language.factory(
    "nn_labeller",
    default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
)
def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
    return MultitaskObjective(nlp.vocab, model, name)


class MultitaskObjective(Tagger):
    """Experimental: Assist training of a parser or tagger, by training a
    side-objective.
    """

    def __init__(self, vocab, model, name="nn_labeller", *, target):
        self.vocab = vocab
        self.model = model
        self.name = name
        if target == "dep":
            self.make_label = self.make_dep
        elif target == "tag":
            self.make_label = self.make_tag
        elif target == "ent":
            self.make_label = self.make_ent
        elif target == "dep_tag_offset":
            self.make_label = self.make_dep_tag_offset
        elif target == "ent_tag":
            self.make_label = self.make_ent_tag
        elif target == "sent_start":
            self.make_label = self.make_sent_start
        elif hasattr(target, "__call__"):
            self.make_label = target
        else:
            raise ValueError(Errors.E016)
        cfg = {"labels": {}, "target": target}
        self.cfg = dict(cfg)

    @property
    def labels(self):
        return self.cfg.setdefault("labels", {})

    @labels.setter
    def labels(self, value):
        self.cfg["labels"] = value

    def set_annotations(self, docs, dep_ids):
        pass

    def initialize(self, get_examples, nlp=None, labels=None):
        if not hasattr(get_examples, "__call__"):
            err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
            raise ValueError(err)
        if labels is not None:
            self.labels = labels
        else:
            for example in get_examples():
                for token in example.y:
                    label = self.make_label(token)
                    if label is not None and label not in self.labels:
                        self.labels[label] = len(self.labels)
        self.model.initialize()  # TODO: fix initialization by defining X and Y

    def predict(self, docs):
        tokvecs = self.model.get_ref("tok2vec")(docs)
        scores = self.model.get_ref("softmax")(tokvecs)
        return tokvecs, scores

    def get_loss(self, examples, scores):
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype="i")
        guesses = scores.argmax(axis=1)
        docs = [eg.predicted for eg in examples]
        for i, eg in enumerate(examples):
            # Handles alignment for tokenization differences
            doc_annots = eg.get_aligned()  # TODO
            for j in range(len(eg.predicted)):
                tok_annots = {key: values[j] for key, values in tok_annots.items()}
                label = self.make_label(j, tok_annots)
                if label is None or label not in self.labels:
                    correct[idx] = guesses[idx]
                else:
                    correct[idx] = self.labels[label]
                idx += 1
        correct = self.model.ops.xp.array(correct, dtype="i")
        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
        loss = (d_scores**2).sum()
        return float(loss), d_scores

    @staticmethod
    def make_dep(token):
        return token.dep_

    @staticmethod
    def make_tag(token):
        return token.tag_

    @staticmethod
    def make_ent(token):
        if token.ent_iob_ == "O":
            return "O"
        else:
            return token.ent_iob_ + "-" + token.ent_type_

    @staticmethod
    def make_dep_tag_offset(token):
        dep = token.dep_
        tag = token.tag_
        offset = token.head.i - token.i
        offset = min(offset, 2)
        offset = max(offset, -2)
        return f"{dep}-{tag}:{offset}"

    @staticmethod
    def make_ent_tag(token):
        if token.ent_iob_ == "O":
            ent = "O"
        else:
            ent = token.ent_iob_ + "-" + token.ent_type_
        tag = token.tag_
        return f"{tag}-{ent}"

    @staticmethod
    def make_sent_start(token):
        """A multi-task objective for representing sentence boundaries,
        using BILU scheme. (O is impossible)
        """
        if token.is_sent_start and token.is_sent_end:
            return "U-SENT"
        elif token.is_sent_start:
            return "B-SENT"
        else:
            return "I-SENT"


class ClozeMultitask(TrainablePipe):
    def __init__(self, vocab, model, **cfg):
        self.vocab = vocab
        self.model = model
        self.cfg = cfg
        self.distance = CosineDistance(ignore_zeros=True, normalize=False)  # TODO: in config

    def set_annotations(self, docs, dep_ids):
        pass

    def initialize(self, get_examples, nlp=None):
        self.model.initialize()  # TODO: fix initialization by defining X and Y
        X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
        self.model.output_layer.initialize(X)

    def predict(self, docs):
        tokvecs = self.model.get_ref("tok2vec")(docs)
        vectors = self.model.get_ref("output_layer")(tokvecs)
        return tokvecs, vectors

    def get_loss(self, examples, vectors, prediction):
        validate_examples(examples, "ClozeMultitask.get_loss")
        # The simplest way to implement this would be to vstack the
        # token.vector values, but that's a bit inefficient, especially on GPU.
        # Instead we fetch the index into the vectors table for each of our tokens,
        # and look them up all at once. This prevents data copying.
        ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
        target = vectors[ids]
        gradient = self.distance.get_grad(prediction, target)
        loss = self.distance.get_loss(prediction, target)
        return float(loss), gradient

    def update(self, examples, *, drop=0., sgd=None, losses=None):
        pass

    def rehearse(self, examples, drop=0., sgd=None, losses=None):
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.
        set_dropout_rate(self.model, drop)
        validate_examples(examples, "ClozeMultitask.rehearse")
        docs = [eg.predicted for eg in examples]
        predictions, bp_predictions = self.model.begin_update()
        loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
        bp_predictions(d_predictions)
        if sgd is not None:
            self.finish_update(sgd)
        if losses is not None:
            losses[self.name] += loss
        return losses

    def add_label(self, label):
        raise NotImplementedError
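The lookup trick described in the removed `ClozeMultitask.get_loss` above, gathering rows from the vectors table by ID instead of stacking per-token vectors, is plain fancy indexing. A small self-contained demonstration with illustrative values:

    import numpy

    vectors = numpy.random.rand(10, 4)   # stand-in for the vocab's vectors table
    ids = numpy.array([2, 2, 7, 0])      # per-token rows to fetch; repeats are fine
    target = vectors[ids]                # one gather, no per-token copying
    assert target.shape == (4, 4)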
@@ -4,22 +4,22 @@ from typing import Optional, Iterable, Callable
from thinc.api import Model, Config

from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser
from ._parser_internals.ner cimport BiluoPushDown
from .transition_parser import Parser
from ._parser_internals.ner import BiluoPushDown
from ..language import Language
from ..scorer import get_ner_prf, PRFScore
from ..training import validate_examples
from ..util import registry
from ..training import remove_bilu_prefix


default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"

@@ -44,8 +44,12 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
        "incorrect_spans_key": None,
        "scorer": {"@scorers": "spacy.ner_scorer.v1"},
    },
    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_ner(
    nlp: Language,

@@ -98,6 +102,7 @@ def make_ner(
        scorer=scorer,
    )


@Language.factory(
    "beam_ner",
    assigns=["doc.ents", "token.ent_iob", "token.ent_type"],

@@ -111,7 +116,12 @@ def make_ner(
        "incorrect_spans_key": None,
        "scorer": None,
    },
    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_beam_ner(
    nlp: Language,

@@ -185,11 +195,12 @@ def make_ner_scorer():
    return ner_score


cdef class EntityRecognizer(Parser):
class EntityRecognizer(Parser):
    """Pipeline component for named entity recognition.

    DOCS: https://spacy.io/api/entityrecognizer
    """

    TransitionSystem = BiluoPushDown

    def __init__(

@@ -207,15 +218,14 @@ cdef class EntityRecognizer(Parser):
        incorrect_spans_key=None,
        scorer=ner_score,
    ):
        """Create an EntityRecognizer.
        """
        """Create an EntityRecognizer."""
        super().__init__(
            vocab,
            model,
            name,
            moves,
            update_with_oracle_cut_size=update_with_oracle_cut_size,
            min_action_freq=1,  # not relevant for NER
            min_action_freq=1,   # not relevant for NER
            learn_tokens=False,  # not relevant for NER
            beam_width=beam_width,
            beam_density=beam_density,
@@ -19,13 +19,6 @@ cdef class Pipe:
    DOCS: https://spacy.io/api/pipe
    """

    @classmethod
    def __init_subclass__(cls, **kwargs):
        """Raise a warning if an inheriting class implements 'begin_training'
        (from v2) instead of the new 'initialize' method (from v3)"""
        if hasattr(cls, "begin_training"):
            warnings.warn(Warnings.W088.format(name=cls.__name__))

    def __call__(self, Doc doc) -> Doc:
        """Apply the pipe to one document. The document is modified in place,
        and returned. This usually happens under the hood when the nlp object

@@ -94,6 +87,10 @@ cdef class Pipe:
            return self.scorer(examples, **scorer_kwargs)
        return {}

    @property
    def is_distillable(self) -> bool:
        return False

    @property
    def is_trainable(self) -> bool:
        return False
@@ -1,13 +1,16 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Optional, Callable
from typing import Dict, Iterable, Optional, Callable, List, Union
from itertools import islice

import srsly
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
from thinc.api import Model, Config
from thinc.legacy import LegacySequenceCategoricalCrossentropy

from thinc.types import Floats2d, Ints1d

from ..tokens.doc cimport Doc

from .tagger import Tagger
from .tagger import ActivationsT, Tagger
from ..language import Language
from ..errors import Errors
from ..scorer import Scorer

@@ -38,11 +41,21 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
    "senter",
    assigns=["token.is_sent_start"],
    default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
    default_config={
        "model": DEFAULT_SENTER_MODEL,
        "overwrite": False,
        "scorer": {"@scorers": "spacy.senter_scorer.v1"},
        "save_activations": False,
    },
    default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
    return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
def make_senter(nlp: Language,
                name: str,
                model: Model,
                overwrite: bool,
                scorer: Optional[Callable],
                save_activations: bool):
    return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)


def senter_score(examples, **kwargs):

@@ -72,6 +85,7 @@ class SentenceRecognizer(Tagger):
        *,
        overwrite=BACKWARD_OVERWRITE,
        scorer=senter_score,
        save_activations: bool = False,
    ):
        """Initialize a sentence recognizer.

@@ -81,6 +95,7 @@ class SentenceRecognizer(Tagger):
            losses during training.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_spans for the attribute "sents".
        save_activations (bool): save model activations in Doc when annotating.

        DOCS: https://spacy.io/api/sentencerecognizer#init
        """

@@ -90,6 +105,7 @@ class SentenceRecognizer(Tagger):
        self._rehearsal_model = None
        self.cfg = {"overwrite": overwrite}
        self.scorer = scorer
        self.save_activations = save_activations

    @property
    def labels(self):

@@ -107,19 +123,24 @@ class SentenceRecognizer(Tagger):
    def label_data(self):
        return None

    def set_annotations(self, docs, batch_tag_ids):
    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
        """Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.
        activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.

        DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
        """
        batch_tag_ids = activations["label_ids"]
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc
        cdef bint overwrite = self.cfg["overwrite"]
        for i, doc in enumerate(docs):
            if self.save_activations:
                doc.activations[self.name] = {}
                for act_name, acts in activations.items():
                    doc.activations[self.name][act_name] = acts[i]
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, "get"):
                doc_tag_ids = doc_tag_ids.get()

@@ -142,7 +163,7 @@ class SentenceRecognizer(Tagger):
        """
        validate_examples(examples, "SentenceRecognizer.get_loss")
        labels = self.labels
        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
        loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False)
        truths = []
        for eg in examples:
            eg_truth = []
@@ -11,7 +11,7 @@ from ..language import Language
from ..errors import Errors, Warnings
from ..util import ensure_path, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..scorer import Scorer
from ..scorer import Scorer, get_ner_prf
from ..matcher import Matcher, PhraseMatcher
from ..matcher.levenshtein import levenshtein_compare
from .. import util

@@ -21,7 +21,7 @@ DEFAULT_SPANS_KEY = "ruler"


@Language.factory(
    "future_entity_ruler",
    "entity_ruler",
    assigns=["doc.ents"],
    default_config={
        "phrase_matcher_attr": None,

@@ -67,6 +67,15 @@ def make_entity_ruler(
    )


def entity_ruler_score(examples, **kwargs):
    return get_ner_prf(examples)


@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer():
    return entity_ruler_score


@Language.factory(
    "span_ruler",
    assigns=["doc.spans"],

@@ -124,7 +133,7 @@ def prioritize_new_ents_filter(
) -> List[Span]:
    """Merge entities and spans into one list without overlaps by allowing
    spans to overwrite any entities that they overlap with. Intended to
    replicate the overwrite_ents=True behavior from the EntityRuler.
    replicate the overwrite_ents=True behavior from the v3 EntityRuler.

    entities (Iterable[Span]): The entities, already filtered for overlaps.
    spans (Iterable[Span]): The spans to merge, may contain overlaps.

@@ -155,7 +164,7 @@ def prioritize_existing_ents_filter(
) -> List[Span]:
    """Merge entities and spans into one list without overlaps by prioritizing
    existing entities. Intended to replicate the overwrite_ents=False behavior
    from the EntityRuler.
    from the v3 EntityRuler.

    entities (Iterable[Span]): The entities, already filtered for overlaps.
    spans (Iterable[Span]): The spans to merge, may contain overlaps.
@@ -1,4 +1,5 @@
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
from typing import Union
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
from thinc.api import Optimizer
from thinc.types import Ragged, Ints2d, Floats2d

@@ -16,6 +17,9 @@ from ..errors import Errors
from ..util import registry


ActivationsT = Dict[str, Union[Floats2d, Ragged]]


spancat_default_config = """
[model]
@architectures = "spacy.SpanCategorizer.v1"

@@ -106,6 +110,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
        "model": DEFAULT_SPANCAT_MODEL,
        "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
        "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
        "save_activations": False,
    },
    default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)

@@ -118,6 +123,7 @@ def make_spancat(
    scorer: Optional[Callable],
    threshold: float,
    max_positive: Optional[int],
    save_activations: bool,
) -> "SpanCategorizer":
    """Create a SpanCategorizer component. The span categorizer consists of two
    parts: a suggester function that proposes candidate spans, and a labeller

@@ -141,6 +147,7 @@ def make_spancat(
        0.5.
    max_positive (Optional[int]): Maximum number of labels to consider positive
        per span. Defaults to None, indicating no limit.
    save_activations (bool): save model activations in Doc when annotating.
    """
    return SpanCategorizer(
        nlp.vocab,

@@ -151,6 +158,7 @@ def make_spancat(
        max_positive=max_positive,
        name=name,
        scorer=scorer,
        save_activations=save_activations,
    )


@@ -189,6 +197,7 @@ class SpanCategorizer(TrainablePipe):
        threshold: float = 0.5,
        max_positive: Optional[int] = None,
        scorer: Optional[Callable] = spancat_score,
        save_activations: bool = False,
    ) -> None:
        """Initialize the span categorizer.
        vocab (Vocab): The shared vocabulary.

@@ -221,6 +230,7 @@ class SpanCategorizer(TrainablePipe):
        self.model = model
        self.name = name
        self.scorer = scorer
        self.save_activations = save_activations

    @property
    def key(self) -> str:

@@ -263,7 +273,7 @@ class SpanCategorizer(TrainablePipe):
        """
        return list(self.labels)

    def predict(self, docs: Iterable[Doc]):
    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
        """Apply the pipeline's model to a batch of docs, without modifying them.

        docs (Iterable[Doc]): The documents to predict.

@@ -276,7 +286,7 @@ class SpanCategorizer(TrainablePipe):
            scores = self.model.ops.alloc2f(0, 0)
        else:
            scores = self.model.predict((docs, indices))  # type: ignore
        return indices, scores
        return {"indices": indices, "scores": scores}

    def set_candidates(
        self, docs: Iterable[Doc], *, candidates_key: str = "candidates"

@@ -296,19 +306,29 @@ class SpanCategorizer(TrainablePipe):
            for index in candidates.dataXd:
                doc.spans[candidates_key].append(doc[index[0] : index[1]])

    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
        """Modify a batch of Doc objects, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        scores: The scores to set, produced by SpanCategorizer.predict.
        activations (ActivationsT): The activations, produced by SpanCategorizer.predict.

        DOCS: https://spacy.io/api/spancategorizer#set_annotations
        """
        labels = self.labels
        indices, scores = indices_scores

        indices = activations["indices"]
        assert isinstance(indices, Ragged)
        scores = cast(Floats2d, activations["scores"])

        offset = 0
        for i, doc in enumerate(docs):
            indices_i = indices[i].dataXd
            if self.save_activations:
                doc.activations[self.name] = {}
                doc.activations[self.name]["indices"] = indices_i
                doc.activations[self.name]["scores"] = scores[
                    offset : offset + indices.lengths[i]
                ]
            doc.spans[self.key] = self._make_span_group(
                doc, indices_i, scores[offset : offset + indices.lengths[i]], labels  # type: ignore[arg-type]
            )
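For spancat, the stored activations mirror the dict returned by `predict` above. A minimal sketch of inspecting them, assuming a pipeline that ships a trained spancat component; the pipeline name is hypothetical:

    import spacy

    nlp = spacy.load("my_spancat_pipeline")  # hypothetical
    nlp.get_pipe("spancat").save_activations = True
    doc = nlp("A short example sentence.")
    acts = doc.activations["spancat"]
    print(acts["indices"])  # token start/end offsets of the suggested spans
    print(acts["scores"])   # one row of label scores per suggested span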
@ -1,9 +1,11 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from typing import Callable, Optional
|
||||
from typing import Callable, Dict, Iterable, List, Optional, Union
|
||||
from typing import Tuple
|
||||
import numpy
|
||||
import srsly
|
||||
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import Model, set_dropout_rate, Config
|
||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints1d
|
||||
import warnings
|
||||
from itertools import islice
|
||||
|
||||
|
@ -22,6 +24,9 @@ from ..training import validate_examples, validate_get_examples
|
|||
from ..util import registry
|
||||
from .. import util
|
||||
|
||||
|
||||
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
|
@ -45,7 +50,13 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"tagger",
|
||||
assigns=["token.tag"],
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!"},
|
||||
default_config={
|
||||
"model": DEFAULT_TAGGER_MODEL,
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.tagger_scorer.v1"},
|
||||
"neg_prefix": "!",
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={"tag_acc": 1.0},
|
||||
)
|
||||
def make_tagger(
|
||||
|
@ -55,6 +66,7 @@ def make_tagger(
|
|||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
neg_prefix: str,
|
||||
save_activations: bool,
|
||||
):
|
||||
"""Construct a part-of-speech tagger component.
|
||||
|
||||
|
@ -63,7 +75,8 @@ def make_tagger(
|
|||
in size, and be normalized as probabilities (all scores between 0 and 1,
|
||||
with the rows summing to 1).
|
||||
"""
|
||||
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix)
|
||||
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
|
||||
save_activations=save_activations)
|
||||
|
||||
|
||||
def tagger_score(examples, **kwargs):
|
||||
|
@ -89,6 +102,7 @@ class Tagger(TrainablePipe):
|
|||
overwrite=BACKWARD_OVERWRITE,
|
||||
scorer=tagger_score,
|
||||
neg_prefix="!",
|
||||
save_activations: bool = False,
|
||||
):
|
||||
"""Initialize a part-of-speech tagger.
|
||||
|
||||
|
@ -98,6 +112,7 @@ class Tagger(TrainablePipe):
|
|||
losses during training.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_token_attr for the attribute "tag".
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#init
|
||||
"""
|
||||
|
@ -108,6 +123,7 @@ class Tagger(TrainablePipe):
|
|||
cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -126,7 +142,7 @@ class Tagger(TrainablePipe):
|
|||
"""Data about the labels currently added to the component."""
|
||||
return tuple(self.cfg["labels"])
|
||||
|
||||
def predict(self, docs):
|
||||
def predict(self, docs) -> ActivationsT:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
|
||||
docs (Iterable[Doc]): The documents to predict.
|
||||
|
@ -139,12 +155,12 @@ class Tagger(TrainablePipe):
|
|||
n_labels = len(self.labels)
|
||||
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
|
||||
assert len(guesses) == len(docs)
|
||||
return guesses
|
||||
return {"probabilities": guesses, "label_ids": guesses}
|
||||
scores = self.model.predict(docs)
|
||||
assert len(scores) == len(docs), (len(scores), len(docs))
|
||||
guesses = self._scores2guesses(scores)
|
||||
assert len(guesses) == len(docs)
|
||||
return guesses
|
||||
return {"probabilities": scores, "label_ids": guesses}
|
||||
|
||||
def _scores2guesses(self, scores):
|
||||
guesses = []
|
||||
|
@ -155,14 +171,15 @@ class Tagger(TrainablePipe):
|
|||
guesses.append(doc_guesses)
|
||||
return guesses
|
||||
|
||||
def set_annotations(self, docs, batch_tag_ids):
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
batch_tag_ids: The IDs to set, produced by Tagger.predict.
|
||||
activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#set_annotations
|
||||
"""
|
||||
batch_tag_ids = activations["label_ids"]
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
cdef Doc doc
|
||||
|
@ -170,6 +187,10 @@ class Tagger(TrainablePipe):
|
|||
cdef bint overwrite = self.cfg["overwrite"]
|
||||
labels = self.labels
|
||||
for i, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
for act_name, acts in activations.items():
|
||||
doc.activations[self.name][act_name] = acts[i]
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
if hasattr(doc_tag_ids, "get"):
|
||||
doc_tag_ids = doc_tag_ids.get()
|
||||
|
@ -225,7 +246,6 @@ class Tagger(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/tagger#rehearse
|
||||
"""
|
||||
loss_func = SequenceCategoricalCrossentropy()
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
@ -239,12 +259,32 @@ class Tagger(TrainablePipe):
|
|||
set_dropout_rate(self.model, drop)
|
||||
tag_scores, bp_tag_scores = self.model.begin_update(docs)
|
||||
tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
|
||||
grads, loss = loss_func(tag_scores, tutor_tag_scores)
|
||||
loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
|
||||
bp_tag_scores(grads)
|
||||
self.finish_update(sgd)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += loss
|
||||
return losses
|
||||
|
||||
    def get_teacher_student_loss(
        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
    ) -> Tuple[float, List[Floats2d]]:
        """Calculate the loss and its gradient for a batch of student
        scores, relative to teacher scores.

        teacher_scores: Scores representing the teacher model's predictions.
        student_scores: Scores representing the student model's predictions.

        RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.

        DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
        """
        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
        d_scores, loss = loss_func(student_scores, teacher_scores)
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))
        return float(loss), d_scores
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
"""Find the loss and gradient of loss for the batch of documents and
|
||||
their predicted scores.
|
||||
|
@ -256,7 +296,7 @@ class Tagger(TrainablePipe):
|
|||
DOCS: https://spacy.io/api/tagger#get_loss
|
||||
"""
|
||||
validate_examples(examples, "Tagger.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
|
||||
loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
|
||||
# Convert empty tag "" to missing value None so that both misaligned
|
||||
# tokens and tokens with missing annotation have the default missing
|
||||
# value None.
|
||||
|
|
|
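Note: the `save_activations` flag threaded through the tagger factory above stores the raw model outputs on the Doc when annotating. A minimal, hedged sketch of how that could be read back — the pipeline name is an assumption, and the "probabilities"/"label_ids" keys mirror the dict built in `set_annotations` above:

import spacy

# Hypothetical trained pipeline; any pipeline with a trainable tagger would do.
nlp = spacy.load("en_core_web_sm")
nlp.get_pipe("tagger").save_activations = True

doc = nlp("This is a test.")
# Doc.activations maps component name -> activation name -> array.
acts = doc.activations["tagger"]
print(acts["probabilities"].shape)  # (n_tokens, n_tags)
print(acts["label_ids"].shape)      # one predicted tag id per token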
@ -1,4 +1,4 @@
|
|||
from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any
|
||||
from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
|
||||
from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
|
||||
from thinc.types import Floats2d
|
||||
import numpy
|
||||
|
@ -14,6 +14,9 @@ from ..util import registry
|
|||
from ..vocab import Vocab
ActivationsT = Dict[str, Floats2d]
|
||||
|
||||
|
||||
single_label_default_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TextCatEnsemble.v2"
|
||||
|
@ -75,6 +78,7 @@ subword_features = true
|
|||
"threshold": 0.0,
|
||||
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
|
||||
"scorer": {"@scorers": "spacy.textcat_scorer.v2"},
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={
|
||||
"cats_score": 1.0,
|
||||
|
@ -95,6 +99,7 @@ def make_textcat(
|
|||
model: Model[List[Doc], List[Floats2d]],
|
||||
threshold: float,
|
||||
scorer: Optional[Callable],
|
||||
save_activations: bool,
|
||||
) -> "TextCategorizer":
|
||||
"""Create a TextCategorizer component. The text categorizer predicts categories
|
||||
over a whole document. It can learn one or more labels, and the labels are considered
|
||||
|
@ -104,8 +109,16 @@ def make_textcat(
|
|||
scores for each category.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
|
||||
return TextCategorizer(
|
||||
nlp.vocab,
|
||||
model,
|
||||
name,
|
||||
threshold=threshold,
|
||||
scorer=scorer,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
|
@ -136,6 +149,7 @@ class TextCategorizer(TrainablePipe):
|
|||
*,
|
||||
threshold: float,
|
||||
scorer: Optional[Callable] = textcat_score,
|
||||
save_activations: bool = False,
|
||||
) -> None:
|
||||
"""Initialize a text categorizer for single-label classification.
|
||||
|
||||
|
@ -161,6 +175,7 @@ class TextCategorizer(TrainablePipe):
|
|||
}
|
||||
self.cfg = dict(cfg)
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
||||
@property
|
||||
def support_missing_values(self):
|
||||
|
@ -185,7 +200,7 @@ class TextCategorizer(TrainablePipe):
|
|||
"""
|
||||
return self.labels # type: ignore[return-value]
|
||||
|
||||
def predict(self, docs: Iterable[Doc]):
|
||||
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
|
||||
docs (Iterable[Doc]): The documents to predict.
|
||||
|
@ -198,12 +213,12 @@ class TextCategorizer(TrainablePipe):
|
|||
tensors = [doc.tensor for doc in docs]
|
||||
xp = self.model.ops.xp
|
||||
scores = xp.zeros((len(list(docs)), len(self.labels)))
|
||||
return scores
|
||||
return {"probabilities": scores}
|
||||
scores = self.model.predict(docs)
|
||||
scores = self.model.ops.asarray(scores)
|
||||
return scores
|
||||
return {"probabilities": scores}
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], scores) -> None:
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
|
||||
"""Modify a batch of Doc objects, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
|
@ -211,9 +226,13 @@ class TextCategorizer(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/textcategorizer#set_annotations
|
||||
"""
|
||||
probs = activations["probabilities"]
|
||||
for i, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
doc.activations[self.name]["probabilities"] = probs[i]
|
||||
for j, label in enumerate(self.labels):
|
||||
doc.cats[label] = float(scores[i, j])
|
||||
doc.cats[label] = float(probs[i, j])
|
||||
|
||||
def update(
|
||||
self,
|
||||
|
|
|
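As with the tagger, the textcat factory now forwards `save_activations` to the component, and `set_annotations` stores the per-label probabilities next to the usual `doc.cats`. A hedged usage sketch — the pipeline name is made up, and the component is assumed to be already trained:

import spacy

# Hypothetical pipeline containing a trained "textcat" component.
nlp = spacy.load("my_textcat_pipeline")
nlp.get_pipe("textcat").save_activations = True

doc = nlp("quite good")
print(doc.cats)                                      # label -> score, as before
print(doc.activations["textcat"]["probabilities"])  # raw per-label probabilities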
@ -1,4 +1,4 @@
|
|||
from typing import Iterable, Optional, Dict, List, Callable, Any
|
||||
from typing import Iterable, Optional, Dict, List, Callable, Any, Union
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import Model, Config
|
||||
|
||||
|
@ -75,6 +75,7 @@ subword_features = true
|
|||
"threshold": 0.5,
|
||||
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
|
||||
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={
|
||||
"cats_score": 1.0,
|
||||
|
@ -95,8 +96,9 @@ def make_multilabel_textcat(
|
|||
model: Model[List[Doc], List[Floats2d]],
|
||||
threshold: float,
|
||||
scorer: Optional[Callable],
|
||||
save_activations: bool,
|
||||
) -> "MultiLabel_TextCategorizer":
|
||||
"""Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
|
||||
"""Create a TextCategorizer component. The text categorizer predicts categories
|
||||
over a whole document. It can learn one or more labels, and the labels are considered
|
||||
to be non-mutually exclusive, which means that there can be zero or more labels
|
||||
per doc).
|
||||
|
@ -107,7 +109,12 @@ def make_multilabel_textcat(
|
|||
scorer (Optional[Callable]): The scoring method.
|
||||
"""
|
||||
return MultiLabel_TextCategorizer(
|
||||
nlp.vocab, model, name, threshold=threshold, scorer=scorer
|
||||
nlp.vocab,
|
||||
model,
|
||||
name,
|
||||
threshold=threshold,
|
||||
scorer=scorer,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
|
@ -139,6 +146,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
*,
|
||||
threshold: float,
|
||||
scorer: Optional[Callable] = textcat_multilabel_score,
|
||||
save_activations: bool = False,
|
||||
) -> None:
|
||||
"""Initialize a text categorizer for multi-label classification.
|
||||
|
||||
|
@ -148,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
losses during training.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#init
|
||||
"""
|
||||
|
@ -158,6 +167,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
cfg = {"labels": [], "threshold": threshold}
|
||||
self.cfg = dict(cfg)
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
||||
@property
|
||||
def support_missing_values(self):
|
||||
|
|
|
@ -6,3 +6,4 @@ cdef class TrainablePipe(Pipe):
    cdef public object model
    cdef public object cfg
    cdef public object scorer
    cdef bint _save_activations
|
||||
|
|
|
@ -2,11 +2,12 @@
|
|||
from typing import Iterable, Iterator, Optional, Dict, Tuple, Callable
|
||||
import srsly
|
||||
from thinc.api import set_dropout_rate, Model, Optimizer
|
||||
import warnings
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
from ..training import validate_examples
|
||||
from ..errors import Errors
|
||||
from ..training import validate_examples, validate_distillation_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from .pipe import Pipe, deserialize_config
|
||||
from .. import util
|
||||
from ..vocab import Vocab
|
||||
|
@ -55,6 +56,53 @@ cdef class TrainablePipe(Pipe):
|
|||
except Exception as e:
|
||||
error_handler(self.name, self, [doc], e)
|
||||
|
||||
|
||||
    def distill(self,
                teacher_pipe: Optional["TrainablePipe"],
                examples: Iterable["Example"],
                *,
                drop: float=0.0,
                sgd: Optional[Optimizer]=None,
                losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
        """Train a pipe (the student) on the predictions of another pipe
        (the teacher). The student is typically trained on the probability
        distribution of the teacher, but details may differ per pipe.

        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
            from.
        examples (Iterable[Example]): Distillation examples. The reference
            and predicted docs must have the same number of tokens and the
            same orthography.
        drop (float): dropout rate.
        sgd (Optional[Optimizer]): An optimizer. Will be created via
            create_optimizer if not set.
        losses (Optional[Dict[str, float]]): Optional record of loss during
            distillation.
        RETURNS: The updated losses dictionary.

        DOCS: https://spacy.io/api/pipe#distill
        """
        # By default we require a teacher pipe, but there are downstream
        # implementations that don't require a pipe.
        if teacher_pipe is None:
            raise ValueError(Errors.E4002.format(name=self.name))
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        validate_distillation_examples(examples, "TrainablePipe.distill")
        set_dropout_rate(self.model, drop)
        for node in teacher_pipe.model.walk():
            if node.name == "softmax":
                node.attrs["softmax_normalize"] = True
        teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
        student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
        bp_student_scores(d_scores)
        if sgd is not None:
            self.finish_update(sgd)
        losses[self.name] += loss
        return losses
|
||||
|
||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
|
||||
"""Apply the pipe to a stream of documents. This usually happens under
|
||||
the hood when the nlp object is called on a text and all components are
|
||||
|
@ -168,6 +216,19 @@ cdef class TrainablePipe(Pipe):
|
|||
"""
|
||||
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
|
||||
|
||||
def get_teacher_student_loss(self, teacher_scores, student_scores):
|
||||
"""Calculate the loss and its gradient for a batch of student
|
||||
scores, relative to teacher scores.
|
||||
|
||||
teacher_scores: Scores representing the teacher model's predictions.
|
||||
student_scores: Scores representing the student model's predictions.
|
||||
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
|
||||
"""
|
||||
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
|
||||
|
||||
def create_optimizer(self) -> Optimizer:
|
||||
"""Create an optimizer for the pipeline component.
|
||||
|
||||
|
@ -204,6 +265,14 @@ cdef class TrainablePipe(Pipe):
|
|||
"""
|
||||
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
|
||||
|
||||
    @property
    def is_distillable(self) -> bool:
        # Normally a pipe overrides `get_teacher_student_loss` to implement
        # distillation. In more exceptional cases, a pipe can provide its
        # own `distill` implementation. If neither of these methods is
        # overridden, the pipe does not implement distillation.
        return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)
|
||||
|
||||
@property
|
||||
def is_trainable(self) -> bool:
|
||||
return True
|
||||
|
@ -342,3 +411,11 @@ cdef class TrainablePipe(Pipe):
|
|||
deserialize["model"] = load_model
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
||||
    @property
    def save_activations(self):
        return self._save_activations

    @save_activations.setter
    def save_activations(self, save_activations: bool):
        self._save_activations = save_activations
|
||||
|
|
|
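For context, `distill` is driven from outside with a teacher pipe and a student pipe plus Example objects whose reference/predicted docs share the same tokens. A rough sketch of a single distillation step, assuming a trained teacher pipeline and an already-initialized student with the same tokenizer (names and the batching are illustrative, not the actual `spacy train` plumbing):

from spacy.training import Example

# teacher_nlp: an assumed trained pipeline; student_nlp: a fresh pipeline
# with an initialized "tagger" and the same tokenizer (both hypothetical).
teacher_tagger = teacher_nlp.get_pipe("tagger")
student_tagger = student_nlp.get_pipe("tagger")

texts = ["Distillation needs raw text only.", "No gold annotations required."]
examples = [
    # predicted doc comes from the student, reference doc from the teacher;
    # both must have the same tokens, as the docstring above requires.
    Example(student_nlp.make_doc(t), teacher_nlp.make_doc(t))
    for t in texts
]

optimizer = student_nlp.create_optimizer()
losses = {}
student_tagger.distill(teacher_tagger, examples, sgd=optimizer, losses=losses)
print(losses)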
@ -1,20 +0,0 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from thinc.backends.cblas cimport CBlas
|
||||
|
||||
from ..vocab cimport Vocab
|
||||
from .trainable_pipe cimport TrainablePipe
|
||||
from ._parser_internals.transition_system cimport Transition, TransitionSystem
|
||||
from ._parser_internals._state cimport StateC
|
||||
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
|
||||
|
||||
|
||||
cdef class Parser(TrainablePipe):
|
||||
cdef public object _rehearsal_model
|
||||
cdef readonly TransitionSystem moves
|
||||
cdef public object _multitasks
|
||||
|
||||
cdef void _parseC(self, CBlas cblas, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil
|
||||
|
||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil
|
|
@ -1,5 +1,6 @@
|
|||
# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
|
||||
from __future__ import print_function
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
from cymem.cymem cimport Pool
|
||||
cimport numpy as np
|
||||
from itertools import islice
|
||||
|
@ -7,25 +8,30 @@ from libcpp.vector cimport vector
|
|||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport calloc, free
|
||||
import random
|
||||
import contextlib
|
||||
|
||||
import srsly
|
||||
from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
|
||||
from thinc.extra.search cimport Beam
|
||||
from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
|
||||
from thinc.api import chain, softmax_activation, use_ops, get_array_module
|
||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints1d
|
||||
import numpy.random
|
||||
import numpy
|
||||
import warnings
|
||||
|
||||
from ._parser_internals.stateclass cimport StateClass
|
||||
from ..ml.parser_model cimport alloc_activations, free_activations
|
||||
from ..ml.parser_model cimport predict_states, arg_max_if_valid
|
||||
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
||||
from ..ml.parser_model cimport get_c_weights, get_c_sizes
|
||||
from ..ml.tb_framework import TransitionModelInputs
|
||||
from ._parser_internals.stateclass cimport StateC, StateClass
|
||||
from ._parser_internals.search cimport Beam
|
||||
from ..tokens.doc cimport Doc
|
||||
from .trainable_pipe import TrainablePipe
|
||||
from .trainable_pipe cimport TrainablePipe
|
||||
from ._parser_internals cimport _beam_utils
|
||||
from ._parser_internals import _beam_utils
|
||||
from ..vocab cimport Vocab
|
||||
from ._parser_internals.transition_system cimport Transition, TransitionSystem
|
||||
from ..typedefs cimport weight_t
|
||||
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..training import validate_distillation_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from .. import util
|
||||
|
||||
|
@ -33,7 +39,7 @@ from .. import util
|
|||
NUMPY_OPS = NumpyOps()
|
||||
|
||||
|
||||
cdef class Parser(TrainablePipe):
|
||||
class Parser(TrainablePipe):
|
||||
"""
|
||||
Base class of the DependencyParser and EntityRecognizer.
|
||||
"""
|
||||
|
@ -123,6 +129,7 @@ cdef class Parser(TrainablePipe):
|
|||
|
||||
self._rehearsal_model = None
|
||||
self.scorer = scorer
|
||||
self._cpu_ops = get_ops("cpu") if isinstance(self.model.ops, CupyOps) else self.model.ops
|
||||
|
||||
def __getnewargs_ex__(self):
|
||||
"""This allows pickling the Parser and its keyword-only init arguments"""
|
||||
|
@ -132,8 +139,9 @@ cdef class Parser(TrainablePipe):
|
|||
@property
|
||||
def move_names(self):
|
||||
names = []
|
||||
cdef TransitionSystem moves = self.moves
|
||||
for i in range(self.moves.n_moves):
|
||||
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
|
||||
name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
|
||||
# Explicitly removing the internal "U-" token used for blocking entities
|
||||
if name != "U-":
|
||||
names.append(name)
|
||||
|
@ -202,6 +210,118 @@ cdef class Parser(TrainablePipe):
|
|||
# Defined in subclasses, to avoid circular import
|
||||
raise NotImplementedError
|
||||
|
||||
def distill(self,
|
||||
teacher_pipe: Optional[TrainablePipe],
|
||||
examples: Iterable["Example"],
|
||||
*,
|
||||
drop: float=0.0,
|
||||
sgd: Optional[Optimizer]=None,
|
||||
losses: Optional[Dict[str, float]]=None):
|
||||
"""Train a pipe (the student) on the predictions of another pipe
|
||||
(the teacher). The student is trained on the transition probabilities
|
||||
of the teacher.
|
||||
|
||||
teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
|
||||
from.
|
||||
examples (Iterable[Example]): Distillation examples. The reference
|
||||
and predicted docs must have the same number of tokens and the
|
||||
same orthography.
|
||||
drop (float): dropout rate.
|
||||
sgd (Optional[Optimizer]): An optimizer. Will be created via
|
||||
create_optimizer if not set.
|
||||
losses (Optional[Dict[str, float]]): Optional record of loss during
|
||||
distillation.
|
||||
RETURNS: The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser#distill
|
||||
"""
|
||||
if teacher_pipe is None:
|
||||
raise ValueError(Errors.E4002.format(name=self.name))
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
||||
validate_distillation_examples(examples, "TransitionParser.distill")
|
||||
|
||||
set_dropout_rate(self.model, drop)
|
||||
|
||||
student_docs = [eg.predicted for eg in examples]
|
||||
|
||||
max_moves = self.cfg["update_with_oracle_cut_size"]
|
||||
if max_moves >= 1:
|
||||
# Chop sequences into lengths of this many words, to make the
|
||||
# batch uniform length. Since we do not have a gold standard
|
||||
# sequence, we use the teacher's predictions as the gold
|
||||
# standard.
|
||||
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
|
||||
states = self._init_batch(teacher_pipe, student_docs, max_moves)
|
||||
else:
|
||||
states = self.moves.init_batch(student_docs)
|
||||
|
||||
# We distill as follows: 1. we first let the student predict transition
|
||||
# sequences (and the corresponding transition probabilities); (2) we
|
||||
# let the teacher follow the student's predicted transition sequences
|
||||
# to obtain the teacher's transition probabilities; (3) we compute the
|
||||
# gradients of the student's transition distributions relative to the
|
||||
# teacher's distributions.
|
||||
|
||||
student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
|
||||
max_moves=max_moves)
|
||||
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
|
||||
actions = states2actions(student_states)
|
||||
teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
|
||||
moves=self.moves, actions=actions)
|
||||
(_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
|
||||
|
||||
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
|
||||
backprop_scores((student_states, d_scores))
|
||||
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
|
||||
losses[self.name] += loss
|
||||
|
||||
return losses
|
||||
|
||||
|
||||
    def get_teacher_student_loss(
        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
        normalize: bool=False,
    ) -> Tuple[float, List[Floats2d]]:
        """Calculate the loss and its gradient for a batch of student
        scores, relative to teacher scores.

        teacher_scores: Scores representing the teacher model's predictions.
        student_scores: Scores representing the student model's predictions.

        RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.

        DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
        """

        # We can't easily hook up a softmax layer in the parsing model, since
        # the get_loss does additional masking. So, we could apply softmax
        # manually here and use Thinc's cross-entropy loss. But it's a bit
        # suboptimal, since we can have a lot of states that would result in
        # many kernel launches. Furthermore the parsing model's backprop expects
        # an XP array, so we'd have to concat the softmaxes anyway. So, like
        # the get_loss implementation, we'll compute the loss and gradients
        # ourselves.

        teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
                                                axis=-1, inplace=True)
        student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
                                                axis=-1, inplace=True)

        assert teacher_scores.shape == student_scores.shape

        d_scores = student_scores - teacher_scores
        if normalize:
            d_scores /= d_scores.shape[0]
        loss = (d_scores**2).sum() / d_scores.size

        return float(loss), d_scores
|
||||
|
||||
def init_multitask_objectives(self, get_examples, pipeline, **cfg):
|
||||
"""Setup models for secondary objectives, to benefit from multi-task
|
||||
learning. This method is intended to be overridden by subclasses.
|
||||
|
@ -222,9 +342,6 @@ cdef class Parser(TrainablePipe):
|
|||
|
||||
stream: The sequence of documents to process.
|
||||
batch_size (int): Number of documents to accumulate into a working set.
|
||||
error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
|
||||
deals with a failing batch of documents. The default function just reraises
|
||||
the exception.
|
||||
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
|
@ -246,83 +363,29 @@ cdef class Parser(TrainablePipe):
|
|||
def predict(self, docs):
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
self._ensure_labels_are_added(docs)
|
||||
if not any(len(doc) for doc in docs):
|
||||
result = self.moves.init_batch(docs)
|
||||
return result
|
||||
if self.cfg["beam_width"] == 1:
|
||||
return self.greedy_parse(docs, drop=0.0)
|
||||
else:
|
||||
return self.beam_parse(
|
||||
docs,
|
||||
drop=0.0,
|
||||
beam_width=self.cfg["beam_width"],
|
||||
beam_density=self.cfg["beam_density"]
|
||||
)
|
||||
with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
states_or_beams, _ = self.model.predict(inputs)
|
||||
return states_or_beams
|
||||
|
||||
def greedy_parse(self, docs, drop=0.):
|
||||
cdef vector[StateC*] states
|
||||
cdef StateClass state
|
||||
ops = self.model.ops
|
||||
cdef CBlas cblas
|
||||
if isinstance(ops, CupyOps):
|
||||
cblas = NUMPY_OPS.cblas()
|
||||
else:
|
||||
cblas = ops.cblas()
|
||||
self._resize()
|
||||
self._ensure_labels_are_added(docs)
|
||||
set_dropout_rate(self.model, drop)
|
||||
batch = self.moves.init_batch(docs)
|
||||
model = self.model.predict(docs)
|
||||
weights = get_c_weights(model)
|
||||
for state in batch:
|
||||
if not state.is_final():
|
||||
states.push_back(state.c)
|
||||
sizes = get_c_sizes(model, states.size())
|
||||
with nogil:
|
||||
self._parseC(cblas, &states[0], weights, sizes)
|
||||
model.clear_memory()
|
||||
del model
|
||||
return batch
|
||||
with _change_attrs(self.model, beam_width=1):
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
states, _ = self.model.predict(inputs)
|
||||
return states
|
||||
|
||||
def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
|
||||
cdef Beam beam
|
||||
cdef Doc doc
|
||||
self._ensure_labels_are_added(docs)
|
||||
batch = _beam_utils.BeamBatch(
|
||||
self.moves,
|
||||
self.moves.init_batch(docs),
|
||||
None,
|
||||
beam_width,
|
||||
density=beam_density
|
||||
)
|
||||
model = self.model.predict(docs)
|
||||
while not batch.is_done:
|
||||
states = batch.get_unfinished_states()
|
||||
if not states:
|
||||
break
|
||||
scores = model.predict(states)
|
||||
batch.advance(scores)
|
||||
model.clear_memory()
|
||||
del model
|
||||
return list(batch)
|
||||
|
||||
cdef void _parseC(self, CBlas cblas, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil:
|
||||
cdef int i, j
|
||||
cdef vector[StateC*] unfinished
|
||||
cdef ActivationsC activations = alloc_activations(sizes)
|
||||
while sizes.states >= 1:
|
||||
predict_states(cblas, &activations, states, &weights, sizes)
|
||||
# Validate actions, argmax, take action.
|
||||
self.c_transition_batch(states,
|
||||
activations.scores, sizes.classes, sizes.states)
|
||||
for i in range(sizes.states):
|
||||
if not states[i].is_final():
|
||||
unfinished.push_back(states[i])
|
||||
for i in range(unfinished.size()):
|
||||
states[i] = unfinished[i]
|
||||
sizes.states = unfinished.size()
|
||||
unfinished.clear()
|
||||
free_activations(&activations)
|
||||
with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
beams, _ = self.model.predict(inputs)
|
||||
return beams
|
||||
|
||||
def set_annotations(self, docs, states_or_beams):
|
||||
cdef StateClass state
|
||||
|
@ -334,35 +397,6 @@ cdef class Parser(TrainablePipe):
|
|||
for hook in self.postprocesses:
|
||||
hook(doc)
|
||||
|
||||
def transition_states(self, states, float[:, ::1] scores):
|
||||
cdef StateClass state
|
||||
cdef float* c_scores = &scores[0, 0]
|
||||
cdef vector[StateC*] c_states
|
||||
for state in states:
|
||||
c_states.push_back(state.c)
|
||||
self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
|
||||
return [state for state in states if not state.c.is_final()]
|
||||
|
||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil:
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
with gil:
|
||||
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
|
||||
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
|
||||
cdef int i, guess
|
||||
cdef Transition action
|
||||
for i in range(batch_size):
|
||||
self.moves.set_valid(is_valid, states[i])
|
||||
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
|
||||
if guess == -1:
|
||||
# This shouldn't happen, but it's hard to raise an error here,
|
||||
# and we don't want to infinite loop. So, force to end state.
|
||||
states[i].force_final()
|
||||
else:
|
||||
action = self.moves.c[guess]
|
||||
action.do(states[i], action.label)
|
||||
free(is_valid)
|
||||
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||
cdef StateClass state
|
||||
if losses is None:
|
||||
|
@ -374,67 +408,99 @@ cdef class Parser(TrainablePipe):
|
|||
)
|
||||
for multitask in self._multitasks:
|
||||
multitask.update(examples, drop=drop, sgd=sgd)
|
||||
# We need to take care to act on the whole batch, because we might be
|
||||
# getting vectors via a listener.
|
||||
n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
|
||||
if n_examples == 0:
|
||||
return losses
|
||||
set_dropout_rate(self.model, drop)
|
||||
# The probability we use beam update, instead of falling back to
|
||||
# a greedy update
|
||||
beam_update_prob = self.cfg["beam_update_prob"]
|
||||
if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
|
||||
return self.update_beam(
|
||||
examples,
|
||||
beam_width=self.cfg["beam_width"],
|
||||
sgd=sgd,
|
||||
losses=losses,
|
||||
beam_density=self.cfg["beam_density"]
|
||||
)
|
||||
docs = [eg.x for eg in examples if len(eg.x)]
|
||||
|
||||
max_moves = self.cfg["update_with_oracle_cut_size"]
|
||||
if max_moves >= 1:
|
||||
# Chop sequences into lengths of this many words, to make the
|
||||
# batch uniform length.
|
||||
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
|
||||
states, golds, _ = self._init_gold_batch(
|
||||
max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
|
||||
init_states, gold_states, _ = self._init_gold_batch(
|
||||
examples,
|
||||
max_length=max_moves
|
||||
)
|
||||
else:
|
||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||
if not states:
|
||||
return losses
|
||||
model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
|
||||
|
||||
all_states = list(states)
|
||||
states_golds = list(zip(states, golds))
|
||||
n_moves = 0
|
||||
while states_golds:
|
||||
states, golds = zip(*states_golds)
|
||||
scores, backprop = model.begin_update(states)
|
||||
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
||||
# Note that the gradient isn't normalized by the batch size
|
||||
# here, because our "samples" are really the states...But we
|
||||
# can't normalize by the number of states either, as then we'd
|
||||
# be getting smaller gradients for states in long sequences.
|
||||
backprop(d_scores)
|
||||
# Follow the predicted action
|
||||
self.transition_states(states, scores)
|
||||
states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
|
||||
if max_moves >= 1 and n_moves >= max_moves:
|
||||
break
|
||||
n_moves += 1
|
||||
init_states, gold_states, _ = self.moves.init_gold_batch(examples)
|
||||
|
||||
backprop_tok2vec(golds)
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves,
|
||||
max_moves=max_moves, states=[state.copy() for state in init_states])
|
||||
(pred_states, scores), backprop_scores = self.model.begin_update(inputs)
|
||||
if sum(s.shape[0] for s in scores) == 0:
|
||||
return losses
|
||||
d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
|
||||
examples, max_moves)
|
||||
backprop_scores((pred_states, d_scores))
|
||||
if sgd not in (None, False):
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += float((d_scores**2).sum())
|
||||
# Ugh, this is annoying. If we're working on GPU, we want to free the
|
||||
# memory ASAP. It seems that Python doesn't necessarily get around to
|
||||
# removing these in time if we don't explicitly delete? It's confusing.
|
||||
del backprop
|
||||
del backprop_tok2vec
|
||||
model.clear_memory()
|
||||
del model
|
||||
del backprop_scores
|
||||
return losses
|
||||
|
||||
def get_loss(self, states_scores, examples, max_moves):
|
||||
gold_states, init_states, pred_states, scores = states_scores
|
||||
scores = self.model.ops.xp.vstack(scores)
|
||||
costs = self._get_costs_from_histories(
|
||||
examples,
|
||||
gold_states,
|
||||
init_states,
|
||||
[list(state.history) for state in pred_states],
|
||||
max_moves
|
||||
)
|
||||
xp = get_array_module(scores)
|
||||
best_costs = costs.min(axis=1, keepdims=True)
|
||||
gscores = scores.copy()
|
||||
min_score = scores.min() - 1000
|
||||
assert costs.shape == scores.shape, (costs.shape, scores.shape)
|
||||
gscores[costs > best_costs] = min_score
|
||||
max_ = scores.max(axis=1, keepdims=True)
|
||||
gmax = gscores.max(axis=1, keepdims=True)
|
||||
exp_scores = xp.exp(scores - max_)
|
||||
exp_gscores = xp.exp(gscores - gmax)
|
||||
Z = exp_scores.sum(axis=1, keepdims=True)
|
||||
gZ = exp_gscores.sum(axis=1, keepdims=True)
|
||||
d_scores = exp_scores / Z
|
||||
d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
|
||||
return d_scores
|
||||
|
||||
def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
|
||||
cdef TransitionSystem moves = self.moves
|
||||
cdef StateClass state
|
||||
cdef int clas
|
||||
cdef int nF = self.model.get_dim("nF")
|
||||
cdef int nO = moves.n_moves
|
||||
cdef int nS = sum([len(history) for history in histories])
|
||||
cdef Pool mem = Pool()
|
||||
cdef np.ndarray costs_i
|
||||
is_valid = <int*>mem.alloc(nO, sizeof(int))
|
||||
batch = list(zip(init_states, histories, gold_states))
|
||||
n_moves = 0
|
||||
output = []
|
||||
while batch:
|
||||
costs = numpy.zeros((len(batch), nO), dtype="f")
|
||||
for i, (state, history, gold) in enumerate(batch):
|
||||
costs_i = costs[i]
|
||||
clas = history.pop(0)
|
||||
moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
|
||||
action = moves.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(clas)
|
||||
output.append(costs)
|
||||
batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
|
||||
if n_moves >= max_moves >= 1:
|
||||
break
|
||||
n_moves += 1
|
||||
|
||||
return self.model.ops.xp.vstack(output)
|
||||
|
||||
def rehearse(self, examples, sgd=None, losses=None, **cfg):
|
||||
"""Perform a "rehearsal" update, to prevent catastrophic forgetting."""
|
||||
if losses is None:
|
||||
|
@ -444,10 +510,9 @@ cdef class Parser(TrainablePipe):
|
|||
multitask.rehearse(examples, losses=losses, sgd=sgd)
|
||||
if self._rehearsal_model is None:
|
||||
return None
|
||||
losses.setdefault(self.name, 0.)
|
||||
losses.setdefault(self.name, 0.0)
|
||||
validate_examples(examples, "Parser.rehearse")
|
||||
docs = [eg.predicted for eg in examples]
|
||||
states = self.moves.init_batch(docs)
|
||||
# This is pretty dirty, but the NER can resize itself in init_batch,
|
||||
# if labels are missing. We therefore have to check whether we need to
|
||||
# expand our model output.
|
||||
|
@ -455,85 +520,33 @@ cdef class Parser(TrainablePipe):
|
|||
# Prepare the stepwise model, and get the callback for finishing the batch
|
||||
set_dropout_rate(self._rehearsal_model, 0.0)
|
||||
set_dropout_rate(self.model, 0.0)
|
||||
tutor, _ = self._rehearsal_model.begin_update(docs)
|
||||
model, backprop_tok2vec = self.model.begin_update(docs)
|
||||
n_scores = 0.
|
||||
loss = 0.
|
||||
while states:
|
||||
targets, _ = tutor.begin_update(states)
|
||||
guesses, backprop = model.begin_update(states)
|
||||
d_scores = (guesses - targets) / targets.shape[0]
|
||||
# If all weights for an output are 0 in the original model, don't
|
||||
# supervise that output. This allows us to add classes.
|
||||
loss += (d_scores**2).sum()
|
||||
backprop(d_scores)
|
||||
# Follow the predicted action
|
||||
self.transition_states(states, guesses)
|
||||
states = [state for state in states if not state.is_final()]
|
||||
n_scores += d_scores.size
|
||||
# Do the backprop
|
||||
backprop_tok2vec(docs)
|
||||
student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
|
||||
actions = states2actions(student_states)
|
||||
teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
|
||||
_, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
|
||||
|
||||
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
|
||||
|
||||
teacher_scores = self.model.ops.xp.vstack(teacher_scores)
|
||||
student_scores = self.model.ops.xp.vstack(student_scores)
|
||||
assert teacher_scores.shape == student_scores.shape
|
||||
|
||||
d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
|
||||
# If all weights for an output are 0 in the original model, don't
|
||||
# supervise that output. This allows us to add classes.
|
||||
loss = (d_scores**2).sum() / d_scores.size
|
||||
backprop_scores((student_states, d_scores))
|
||||
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += loss / n_scores
|
||||
del backprop
|
||||
del backprop_tok2vec
|
||||
model.clear_memory()
|
||||
tutor.clear_memory()
|
||||
del model
|
||||
del tutor
|
||||
losses[self.name] += loss
|
||||
|
||||
return losses
|
||||
|
||||
def update_beam(self, examples, *, beam_width,
|
||||
drop=0., sgd=None, losses=None, beam_density=0.0):
|
||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||
if not states:
|
||||
return losses
|
||||
# Prepare the stepwise model, and get the callback for finishing the batch
|
||||
model, backprop_tok2vec = self.model.begin_update(
|
||||
[eg.predicted for eg in examples])
|
||||
loss = _beam_utils.update_beam(
|
||||
self.moves,
|
||||
states,
|
||||
golds,
|
||||
model,
|
||||
beam_width,
|
||||
beam_density=beam_density,
|
||||
)
|
||||
losses[self.name] += loss
|
||||
backprop_tok2vec(golds)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
|
||||
def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
|
||||
cdef StateClass state
|
||||
cdef Pool mem = Pool()
|
||||
cdef int i
|
||||
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
|
||||
|
||||
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
|
||||
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
|
||||
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
|
||||
dtype='f', order='C')
|
||||
c_d_scores = <float*>d_scores.data
|
||||
unseen_classes = self.model.attrs["unseen_classes"]
|
||||
for i, (state, gold) in enumerate(zip(states, golds)):
|
||||
memset(is_valid, 0, self.moves.n_moves * sizeof(int))
|
||||
memset(costs, 0, self.moves.n_moves * sizeof(float))
|
||||
self.moves.set_costs(is_valid, costs, state.c, gold)
|
||||
for j in range(self.moves.n_moves):
|
||||
if costs[j] <= 0.0 and j in unseen_classes:
|
||||
unseen_classes.remove(j)
|
||||
cpu_log_loss(c_d_scores,
|
||||
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||
c_d_scores += d_scores.shape[1]
|
||||
# Note that we don't normalize this. See comment in update() for why.
|
||||
if losses is not None:
|
||||
losses.setdefault(self.name, 0.)
|
||||
losses[self.name] += (d_scores**2).sum()
|
||||
return d_scores
|
||||
raise NotImplementedError
|
||||
|
||||
def set_output(self, nO):
|
||||
self.model.attrs["resize_output"](self.model, nO)
|
||||
|
@ -572,7 +585,7 @@ cdef class Parser(TrainablePipe):
|
|||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.predicted)
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(doc_sample)
|
||||
self.model.initialize((doc_sample, self.moves))
|
||||
if nlp is not None:
|
||||
self.init_multitask_objectives(get_examples, nlp.pipeline)
|
||||
|
||||
|
@ -629,28 +642,63 @@ cdef class Parser(TrainablePipe):
|
|||
raise ValueError(Errors.E149) from None
|
||||
return self
|
||||
|
||||
def _init_gold_batch(self, examples, max_length):
|
||||
"""Make a square batch, of length equal to the shortest transition
|
||||
def _init_batch(self, teacher_step_model, docs, max_length):
|
||||
"""Make a square batch of length equal to the shortest transition
|
||||
sequence or a cap. A long
|
||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||
where N is the shortest doc. We'll make two states, one representing
|
||||
long_doc[:N], and another representing long_doc[N:]."""
|
||||
long_doc[:N], and another representing long_doc[N:]. In contrast to
|
||||
_init_gold_batch, this version uses a teacher model to generate the
|
||||
cut sequences."""
|
||||
cdef:
|
||||
StateClass start_state
|
||||
StateClass state
|
||||
Transition action
|
||||
all_states = self.moves.init_batch([eg.predicted for eg in examples])
|
||||
all_states = self.moves.init_batch(docs)
|
||||
states = []
|
||||
to_cut = []
|
||||
for state, doc in zip(all_states, docs):
|
||||
if not state.is_final():
|
||||
if len(doc) < max_length:
|
||||
states.append(state)
|
||||
else:
|
||||
to_cut.append(state)
|
||||
while to_cut:
|
||||
states.extend(state.copy() for state in to_cut)
|
||||
# Move states forward max_length actions.
|
||||
length = 0
|
||||
while to_cut and length < max_length:
|
||||
teacher_scores = teacher_step_model.predict(to_cut)
|
||||
self.transition_states(to_cut, teacher_scores)
|
||||
# States that are completed do not need further cutting.
|
||||
to_cut = [state for state in to_cut if not state.is_final()]
|
||||
length += 1
|
||||
return states
|
||||
|
||||
|
||||
def _init_gold_batch(self, examples, max_length):
|
||||
"""Make a square batch, of length equal to the shortest transition
|
||||
sequence or a cap. A long doc will get multiple states. Let's say we
|
||||
have a doc of length 2*N, where N is the shortest doc. We'll make
|
||||
two states, one representing long_doc[:N], and another representing
|
||||
long_doc[N:]."""
|
||||
cdef:
|
||||
StateClass start_state
|
||||
StateClass state
|
||||
Transition action
|
||||
TransitionSystem moves = self.moves
|
||||
all_states = moves.init_batch([eg.predicted for eg in examples])
|
||||
states = []
|
||||
golds = []
|
||||
to_cut = []
|
||||
for state, eg in zip(all_states, examples):
|
||||
if self.moves.has_gold(eg) and not state.is_final():
|
||||
gold = self.moves.init_gold(state, eg)
|
||||
if moves.has_gold(eg) and not state.is_final():
|
||||
gold = moves.init_gold(state, eg)
|
||||
if len(eg.x) < max_length:
|
||||
states.append(state)
|
||||
golds.append(gold)
|
||||
else:
|
||||
oracle_actions = self.moves.get_oracle_sequence_from_state(
|
||||
oracle_actions = moves.get_oracle_sequence_from_state(
|
||||
state.copy(), gold)
|
||||
to_cut.append((eg, state, gold, oracle_actions))
|
||||
if not to_cut:
|
||||
|
@ -660,13 +708,52 @@ cdef class Parser(TrainablePipe):
|
|||
for i in range(0, len(oracle_actions), max_length):
|
||||
start_state = state.copy()
|
||||
for clas in oracle_actions[i:i+max_length]:
|
||||
action = self.moves.c[clas]
|
||||
action = moves.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
if state.is_final():
|
||||
break
|
||||
if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
|
||||
if moves.has_gold(eg, start_state.B(0), state.B(0)):
|
||||
states.append(start_state)
|
||||
golds.append(gold)
|
||||
if state.is_final():
|
||||
break
|
||||
return states, golds, max_length
|
||||
|
||||
|
||||
@contextlib.contextmanager
def _change_attrs(model, **kwargs):
    """Temporarily modify a thinc model's attributes."""
    unset = object()
    old_attrs = {}
    for key, value in kwargs.items():
        old_attrs[key] = model.attrs.get(key, unset)
        model.attrs[key] = value
    yield model
    for key, value in old_attrs.items():
        if value is unset:
            model.attrs.pop(key)
        else:
            model.attrs[key] = value


def states2actions(states: List[StateClass]) -> List[Ints1d]:
    cdef int step
    cdef StateClass state
    cdef StateC* c_state
    actions = []
    while True:
        step = len(actions)

        step_actions = []
        for state in states:
            c_state = state.c
            if step < c_state.history.size():
                step_actions.append(c_state.history[step])

        # We are done if we have exhausted all histories.
        if len(step_actions) == 0:
            break

        actions.append(numpy.array(step_actions, dtype="i"))

    return actions
|
||||
|
|
|
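For reference, the gradient computed in the parser's get_teacher_student_loss above is just the difference of the two softmaxed score matrices, with a mean of squares reported as the loss. A small self-contained numpy sketch of the same arithmetic (the shapes are illustrative, not tied to any real model):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

rng = np.random.default_rng(0)
teacher_scores = rng.normal(size=(6, 5))   # 6 parser states, 5 transitions
student_scores = rng.normal(size=(6, 5))

t = softmax(teacher_scores)
s = softmax(student_scores)
d_scores = s - t                           # gradient w.r.t. the student's softmax output
loss = (d_scores ** 2).sum() / d_scores.size
print(loss, d_scores.shape)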
@ -144,7 +144,7 @@ def validate_init_settings(

def validate_token_pattern(obj: list) -> List[str]:
    # Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"})
    get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k
    get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k
    if isinstance(obj, list):
        converted = []
        for pattern in obj:
|
||||
|
|
|
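The one-line change above swaps a length check for a membership test, so the coercion keeps working if NAMES is a mapping from attribute IDs to names rather than a dense list. A hedged toy illustration of the behaviour (this NAMES dict is entirely made up; spaCy's real attribute IDs are internal constants):

# Toy stand-in for an ID -> name mapping such as spacy.attrs.NAMES.
NAMES = {401: "ORTH", 403: "LEMMA"}

get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k

print(get_key(401))     # "ORTH"
print(get_key(999))     # 999 -> unknown int keys pass through unchanged
print(get_key("ORTH"))  # "ORTH" -> non-int keys pass through unchanged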
@ -1,4 +1,4 @@
|
|||
from libc.stdint cimport int64_t
|
||||
from libc.stdint cimport int64_t, uint32_t
|
||||
from libcpp.vector cimport vector
|
||||
from libcpp.set cimport set
|
||||
from cymem.cymem cimport Pool
|
||||
|
@ -7,13 +7,6 @@ from murmurhash.mrmr cimport hash64
|
|||
|
||||
from .typedefs cimport attr_t, hash_t
|
||||
|
||||
|
||||
cpdef hash_t hash_string(str string) except 0
|
||||
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
|
||||
|
||||
cdef str decode_Utf8Str(const Utf8Str* string)
|
||||
|
||||
|
||||
ctypedef union Utf8Str:
|
||||
unsigned char[8] s
|
||||
unsigned char* p
|
||||
|
@ -21,9 +14,13 @@ ctypedef union Utf8Str:
|
|||
|
||||
cdef class StringStore:
|
||||
cdef Pool mem
|
||||
cdef vector[hash_t] _keys
|
||||
cdef PreshMap _map
|
||||
|
||||
cdef vector[hash_t] keys
|
||||
cdef public PreshMap _map
|
||||
cdef hash_t _intern_str(self, str string)
|
||||
cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *
|
||||
cdef str _decode_str_repr(self, const Utf8Str* string)
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, str py_string)
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
|
||||
|
||||
cpdef hash_t hash_string(object string) except -1
|
||||
cpdef hash_t get_string_id(object string_or_hash) except -1
|
||||
|
|
|
@ -1,21 +1,20 @@
|
|||
from typing import Optional, Iterable, Iterator, Union, Any, overload
|
||||
from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload
|
||||
from pathlib import Path
|
||||
|
||||
def get_string_id(key: Union[str, int]) -> int: ...
|
||||
|
||||
class StringStore:
|
||||
def __init__(
|
||||
self, strings: Optional[Iterable[str]] = ..., freeze: bool = ...
|
||||
) -> None: ...
|
||||
def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
|
||||
@overload
|
||||
def __getitem__(self, string_or_id: Union[bytes, str]) -> int: ...
|
||||
def __getitem__(self, string_or_hash: str) -> int: ...
|
||||
@overload
|
||||
def __getitem__(self, string_or_id: int) -> str: ...
|
||||
def as_int(self, key: Union[bytes, str, int]) -> int: ...
|
||||
def as_string(self, key: Union[bytes, str, int]) -> str: ...
|
||||
def __getitem__(self, string_or_hash: int) -> str: ...
|
||||
def as_int(self, string_or_hash: Union[str, int]) -> int: ...
|
||||
def as_string(self, string_or_hash: Union[str, int]) -> str: ...
|
||||
def add(self, string: str) -> int: ...
|
||||
def items(self) -> List[Tuple[str, int]]: ...
|
||||
def keys(self) -> List[str]: ...
|
||||
def values(self) -> List[int]: ...
|
||||
def __len__(self) -> int: ...
|
||||
def __contains__(self, string: str) -> bool: ...
|
||||
def __contains__(self, string_or_hash: Union[str, int]) -> bool: ...
|
||||
def __iter__(self) -> Iterator[str]: ...
|
||||
def __reduce__(self) -> Any: ...
|
||||
def to_disk(self, path: Union[str, Path]) -> None: ...
|
||||
|
@ -23,3 +22,5 @@ class StringStore:
|
|||
def to_bytes(self, **kwargs: Any) -> bytes: ...
|
||||
def from_bytes(self, bytes_data: bytes, **kwargs: Any) -> StringStore: ...
|
||||
def _reset_and_load(self, strings: Iterable[str]) -> None: ...
|
||||
|
||||
def get_string_id(string_or_hash: Union[str, int]) -> int: ...
|
||||
|
|
|
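The regenerated stub above reflects the narrowed v4 StringStore surface: bytes keys are gone, __getitem__ on a string adds it and returns its hash, and items/keys/values are exposed. A short usage sketch consistent with those signatures (the example strings are arbitrary):

from spacy.strings import StringStore

stringstore = StringStore(["apple", "orange"])
apple_hash = stringstore.add("apple")            # re-adding returns the existing hash
assert stringstore[apple_hash] == "apple"        # int -> str lookup
assert stringstore["apple"] == apple_hash        # str -> int (adds the string if missing)
assert "orange" in stringstore
assert stringstore.as_string(apple_hash) == "apple"
assert stringstore.as_int("orange") == stringstore["orange"]
print(stringstore.keys())                        # ["apple", "orange"], insertion order
print(len(stringstore))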
@ -1,9 +1,10 @@
|
|||
# cython: infer_types=True
|
||||
from typing import Optional, Union, Iterable, Tuple, Callable, Any, List, Iterator
|
||||
cimport cython
|
||||
from libc.string cimport memcpy
|
||||
from libcpp.set cimport set
|
||||
from libc.stdint cimport uint32_t
|
||||
from murmurhash.mrmr cimport hash64, hash32
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
import srsly
|
||||
|
||||
|
@ -14,105 +15,13 @@ from .symbols import NAMES as SYMBOLS_BY_INT
|
|||
from .errors import Errors
|
||||
from . import util
|
||||
|
||||
# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
|
||||
cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
|
||||
try:
|
||||
out_hash[0] = key
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
def get_string_id(key):
|
||||
"""Get a string ID, handling the reserved symbols correctly. If the key is
|
||||
already an ID, return it.
|
||||
|
||||
This function optimises for convenience over performance, so shouldn't be
|
||||
used in tight loops.
|
||||
"""
|
||||
cdef hash_t str_hash
|
||||
if isinstance(key, str):
|
||||
if len(key) == 0:
|
||||
return 0
|
||||
|
||||
symbol = SYMBOLS_BY_STR.get(key, None)
|
||||
if symbol is not None:
|
||||
return symbol
|
||||
else:
|
||||
chars = key.encode("utf8")
|
||||
return hash_utf8(chars, len(chars))
|
||||
elif _try_coerce_to_hash(key, &str_hash):
|
||||
# Coerce the integral key to the expected primitive hash type.
|
||||
# This ensures that custom/overloaded "primitive" data types
|
||||
# such as those implemented by numpy are not inadvertently used
|
||||
# downsteam (as these are internally implemented as custom PyObjects
|
||||
# whose comparison operators can incur a significant overhead).
|
||||
return str_hash
|
||||
else:
|
||||
# TODO: Raise an error instead
|
||||
return key
|
||||
|
||||
|
||||
cpdef hash_t hash_string(str string) except 0:
|
||||
chars = string.encode("utf8")
|
||||
return hash_utf8(chars, len(chars))
|
||||
|
||||
|
||||
cdef hash_t hash_utf8(char* utf8_string, int length) nogil:
|
||||
return hash64(utf8_string, length, 1)
|
||||
|
||||
|
||||
cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
|
||||
return hash32(utf8_string, length, 1)
|
||||
|
||||
|
||||
cdef str decode_Utf8Str(const Utf8Str* string):
|
||||
cdef int i, length
|
||||
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
||||
return string.s[1:string.s[0]+1].decode("utf8")
|
||||
elif string.p[0] < 255:
|
||||
return string.p[1:string.p[0]+1].decode("utf8")
|
||||
else:
|
||||
i = 0
|
||||
length = 0
|
||||
while string.p[i] == 255:
|
||||
i += 1
|
||||
length += 255
|
||||
length += string.p[i]
|
||||
i += 1
|
||||
return string.p[i:length + i].decode("utf8")
|
||||
|
||||
|
||||
cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
|
||||
cdef int n_length_bytes
|
||||
cdef int i
|
||||
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
|
||||
cdef uint32_t ulength = length
|
||||
if length < sizeof(string.s):
|
||||
string.s[0] = <unsigned char>length
|
||||
memcpy(&string.s[1], chars, length)
|
||||
return string
|
||||
elif length < 255:
|
||||
string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
|
||||
string.p[0] = length
|
||||
memcpy(&string.p[1], chars, length)
|
||||
return string
|
||||
else:
|
||||
i = 0
|
||||
n_length_bytes = (length // 255) + 1
|
||||
string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
|
||||
for i in range(n_length_bytes-1):
|
||||
string.p[i] = 255
|
||||
string.p[n_length_bytes-1] = length % 255
|
||||
memcpy(&string.p[n_length_bytes], chars, length)
|
||||
return string
|
||||
|
||||
|
||||
cdef class StringStore:
|
||||
"""Look up strings by 64-bit hashes.
|
||||
"""Look up strings by 64-bit hashes. Implicitly handles reserved symbols.
|
||||
|
||||
DOCS: https://spacy.io/api/stringstore
|
||||
"""
|
||||
def __init__(self, strings=None, freeze=False):
|
||||
def __init__(self, strings: Optional[Iterable[str]] = None):
|
||||
"""Create the StringStore.
|
||||
|
||||
strings (iterable): A sequence of unicode strings to add to the store.
|
||||
|
@ -123,127 +32,126 @@ cdef class StringStore:
|
|||
for string in strings:
|
||||
        self.add(string)

-    def __getitem__(self, object string_or_id):
-        """Retrieve a string from a given hash, or vice versa.
-
-        string_or_id (bytes, str or uint64): The value to encode.
-        Returns (str / uint64): The value to be retrieved.
-        """
-        cdef hash_t str_hash
-        cdef Utf8Str* utf8str = NULL
-
-        if isinstance(string_or_id, str):
-            if len(string_or_id) == 0:
-                return 0
-
-            # Return early if the string is found in the symbols LUT.
-            symbol = SYMBOLS_BY_STR.get(string_or_id, None)
-            if symbol is not None:
-                return symbol
-            else:
-                return hash_string(string_or_id)
-        elif isinstance(string_or_id, bytes):
-            return hash_utf8(string_or_id, len(string_or_id))
-        elif _try_coerce_to_hash(string_or_id, &str_hash):
-            if str_hash == 0:
-                return ""
-            elif str_hash < len(SYMBOLS_BY_INT):
-                return SYMBOLS_BY_INT[str_hash]
-            else:
-                utf8str = <Utf8Str*>self._map.get(str_hash)
-        else:
-            # TODO: Raise an error instead
-            utf8str = <Utf8Str*>self._map.get(string_or_id)
-
-        if utf8str is NULL:
-            raise KeyError(Errors.E018.format(hash_value=string_or_id))
-        else:
-            return decode_Utf8Str(utf8str)
-
-    def as_int(self, key):
-        """If key is an int, return it; otherwise, get the int value."""
-        if not isinstance(key, str):
-            return key
-        else:
-            return self[key]
-
-    def as_string(self, key):
-        """If key is a string, return it; otherwise, get the string value."""
-        if isinstance(key, str):
-            return key
-        else:
-            return self[key]
-
-    def add(self, string):
-        """Add a string to the StringStore.
-
-        string (str): The string to add.
-        RETURNS (uint64): The string's hash value.
-        """
-        cdef hash_t str_hash
-        if isinstance(string, str):
-            if string in SYMBOLS_BY_STR:
-                return SYMBOLS_BY_STR[string]
-
-            string = string.encode("utf8")
-            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
-        elif isinstance(string, bytes):
-            if string in SYMBOLS_BY_STR:
-                return SYMBOLS_BY_STR[string]
-            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
-        else:
-            raise TypeError(Errors.E017.format(value_type=type(string)))
-        return str_hash
-
-    def __len__(self):
-        """The number of strings in the store.
-
-        RETURNS (int): The number of strings in the store.
-        """
-        return self.keys.size()
-
-    def __contains__(self, string_or_id not None):
-        """Check whether a string or ID is in the store.
-
-        string_or_id (str or int): The string to check.
-        RETURNS (bool): Whether the store contains the string.
-        """
-        cdef hash_t str_hash
-        if isinstance(string_or_id, str):
-            if len(string_or_id) == 0:
-                return True
-            elif string_or_id in SYMBOLS_BY_STR:
-                return True
-            str_hash = hash_string(string_or_id)
-        elif _try_coerce_to_hash(string_or_id, &str_hash):
-            pass
-        else:
-            # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
-
-        if str_hash < len(SYMBOLS_BY_INT):
-            return True
-        else:
-            return self._map.get(str_hash) is not NULL
-
-    def __iter__(self):
-        """Iterate over the strings in the store, in order.
-
-        YIELDS (str): A string in the store.
-        """
-        cdef int i
-        cdef hash_t key
-        for i in range(self.keys.size()):
-            key = self.keys[i]
-            utf8str = <Utf8Str*>self._map.get(key)
-            yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
-
-    def __reduce__(self):
-        strings = list(self)
-        return (StringStore, (strings,), None, None, None)
+    def __getitem__(self, string_or_hash: Union[str, int]) -> Union[str, int]:
+        """Retrieve a string from a given hash. If a string
+        is passed as the input, add it to the store and return
+        its hash.
+
+        string_or_hash (int / str): The hash value to lookup or the string to store.
+        RETURNS (str / int): The stored string or the hash of the newly added string.
+        """
+        if isinstance(string_or_hash, str):
+            return self.add(string_or_hash)
+        else:
+            return self._get_interned_str(string_or_hash)
+
+    def __contains__(self, string_or_hash: Union[str, int]) -> bool:
+        """Check whether a string or a hash is in the store.
+
+        string_or_hash (str / int): The string/hash to check.
+        RETURNS (bool): Whether the store contains the string.
+        """
+        cdef hash_t str_hash = get_string_id(string_or_hash)
+        if str_hash in SYMBOLS_BY_INT:
+            return True
+        else:
+            return self._map.get(str_hash) is not NULL
+
+    def __iter__(self) -> Iterator[str]:
+        """Iterate over the strings in the store in insertion order.
+
+        RETURNS: An iterable collection of strings.
+        """
+        return iter(self.keys())
+
+    def __reduce__(self):
+        strings = list(self)
+        return (StringStore, (strings,), None, None, None)
+
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self._keys.size()
+
+    def add(self, string: str) -> int:
+        """Add a string to the StringStore.
+
+        string (str): The string to add.
+        RETURNS (uint64): The string's hash value.
+        """
+        if not isinstance(string, str):
+            raise TypeError(Errors.E017.format(value_type=type(string)))
+
+        if string in SYMBOLS_BY_STR:
+            return SYMBOLS_BY_STR[string]
+        else:
+            return self._intern_str(string)
+
+    def as_int(self, string_or_hash: Union[str, int]) -> int:
+        """If a hash value is passed as the input, return it as-is. If the input
+        is a string, return its corresponding hash.
+
+        string_or_hash (str / int): The string to hash or a hash value.
+        RETURNS (int): The hash of the string or the input hash value.
+        """
+        if isinstance(string_or_hash, int):
+            return string_or_hash
+        else:
+            return get_string_id(string_or_hash)
+
+    def as_string(self, string_or_hash: Union[str, int]) -> str:
+        """If a string is passed as the input, return it as-is. If the input
+        is a hash value, return its corresponding string.
+
+        string_or_hash (str / int): The hash value to lookup or a string.
+        RETURNS (str): The stored string or the input string.
+        """
+        if isinstance(string_or_hash, str):
+            return string_or_hash
+        else:
+            return self._get_interned_str(string_or_hash)
+
+    def items(self) -> List[Tuple[str, int]]:
+        """Iterate over the stored strings and their hashes in insertion order.
+
+        RETURNS: A list of string-hash pairs.
+        """
+        # Even though we internally store the hashes as keys and the strings as
+        # values, we invert the order in the public API to keep it consistent with
+        # the implementation of the `__iter__` method (where we wish to iterate over
+        # the strings in the store).
+        cdef int i
+        pairs = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            str_hash = self._keys[i]
+            utf8str = <Utf8Str*>self._map.get(str_hash)
+            pairs[i] = (self._decode_str_repr(utf8str), str_hash)
+        return pairs
+
+    def keys(self) -> List[str]:
+        """Iterate over the stored strings in insertion order.
+
+        RETURNS: A list of strings.
+        """
+        cdef int i
+        strings = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            utf8str = <Utf8Str*>self._map.get(self._keys[i])
+            strings[i] = self._decode_str_repr(utf8str)
+        return strings
+
+    def values(self) -> List[int]:
+        """Iterate over the stored string hashes in insertion order.
+
+        RETURNS: A list of string hashes.
+        """
+        cdef int i
+        hashes = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            hashes[i] = self._keys[i]
+        return hashes

    def to_disk(self, path):
        """Save the current state to a directory.
@@ -294,24 +202,122 @@ cdef class StringStore:
    def _reset_and_load(self, strings):
        self.mem = Pool()
        self._map = PreshMap()
-        self.keys.clear()
+        self._keys.clear()
        for string in strings:
            self.add(string)

-    cdef const Utf8Str* intern_unicode(self, str py_string):
-        # 0 means missing, but we don't bother offsetting the index.
-        cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
-
-    @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
-        # TODO: This function's API/behaviour is an unholy mess...
-        # 0 means missing, but we don't bother offsetting the index.
-        cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
-        cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
-        if value is not NULL:
-            return value
-        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
-        self._map.set(key, value)
-        self.keys.push_back(key)
-        return value
+    def _get_interned_str(self, hash_value: int) -> str:
+        cdef hash_t str_hash
+        if not _try_coerce_to_hash(hash_value, &str_hash):
+            raise TypeError(Errors.E4001.format(expected_types="'int'", received_type=type(hash_value)))
+
+        # Handle reserved symbols and empty strings correctly.
+        if str_hash == 0:
+            return ""
+
+        symbol = SYMBOLS_BY_INT.get(str_hash)
+        if symbol is not None:
+            return symbol
+
+        utf8str = <Utf8Str*>self._map.get(str_hash)
+        if utf8str is NULL:
+            raise KeyError(Errors.E018.format(hash_value=str_hash))
+        else:
+            return self._decode_str_repr(utf8str)
+
+    cdef hash_t _intern_str(self, str string):
+        # 0 means missing, but we don't bother offsetting the index.
+        chars = string.encode('utf-8')
+        cdef hash_t key = hash64(<unsigned char*>chars, len(chars), 1)
+        cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
+        if value is not NULL:
+            return key
+
+        value = self._allocate_str_repr(<unsigned char*>chars, len(chars))
+        self._map.set(key, value)
+        self._keys.push_back(key)
+        return key
+
+    cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *:
+        cdef int n_length_bytes
+        cdef int i
+        cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
+        cdef uint32_t ulength = length
+        if length < sizeof(string.s):
+            string.s[0] = <unsigned char>length
+            memcpy(&string.s[1], chars, length)
+            return string
+        elif length < 255:
+            string.p = <unsigned char*>self.mem.alloc(length + 1, sizeof(unsigned char))
+            string.p[0] = length
+            memcpy(&string.p[1], chars, length)
+            return string
+        else:
+            i = 0
+            n_length_bytes = (length // 255) + 1
+            string.p = <unsigned char*>self.mem.alloc(length + n_length_bytes, sizeof(unsigned char))
+            for i in range(n_length_bytes-1):
+                string.p[i] = 255
+            string.p[n_length_bytes-1] = length % 255
+            memcpy(&string.p[n_length_bytes], chars, length)
+            return string
+
+    cdef str _decode_str_repr(self, const Utf8Str* string):
+        cdef int i, length
+        if string.s[0] < sizeof(string.s) and string.s[0] != 0:
+            return string.s[1:string.s[0]+1].decode('utf-8')
+        elif string.p[0] < 255:
+            return string.p[1:string.p[0]+1].decode('utf-8')
+        else:
+            i = 0
+            length = 0
+            while string.p[i] == 255:
+                i += 1
+                length += 255
+            length += string.p[i]
+            i += 1
+            return string.p[i:length + i].decode('utf-8')
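The `Utf8Str` layout above keeps the byte length inline: very short strings live in the fixed `s` buffer with a one-byte length, strings under 255 bytes get a single length byte on the heap, and anything longer stores a run of `255` bytes followed by the remainder. A small Python sketch of that long-string length prefix (the helper name is illustrative, not part of the diff):

def encode_length_prefix(length):
    # Mirrors the else-branch of _allocate_str_repr: one 255 per full block,
    # then the remainder byte; _decode_str_repr sums 255s until a byte < 255.
    assert length >= 255
    return [255] * (length // 255) + [length % 255]

assert encode_length_prefix(300) == [255, 45]        # 255 + 45 = 300
assert encode_length_prefix(600) == [255, 255, 90]   # 255 + 255 + 90 = 600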
+cpdef hash_t hash_string(object string) except -1:
+    if not isinstance(string, str):
+        raise TypeError(Errors.E4001.format(expected_types="'str'", received_type=type(string)))
+
+    # Handle reserved symbols and empty strings correctly.
+    if len(string) == 0:
+        return 0
+
+    symbol = SYMBOLS_BY_STR.get(string)
+    if symbol is not None:
+        return symbol
+
+    chars = string.encode('utf-8')
+    return hash64(<unsigned char*>chars, len(chars), 1)
+
+
+cpdef hash_t get_string_id(object string_or_hash) except -1:
+    cdef hash_t str_hash
+
+    try:
+        return hash_string(string_or_hash)
+    except:
+        if _try_coerce_to_hash(string_or_hash, &str_hash):
+            # Coerce the integral key to the expected primitive hash type.
+            # This ensures that custom/overloaded "primitive" data types
+            # such as those implemented by numpy are not inadvertently used
+            # downstream (as these are internally implemented as custom PyObjects
+            # whose comparison operators can incur a significant overhead).
+            return str_hash
+        else:
+            raise TypeError(Errors.E4001.format(expected_types="'str','int'", received_type=type(string_or_hash)))


# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
    try:
        out_hash[0] = key
        return True
    except:
        return False
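Taken together, the new accessors give `StringStore` a dict-like surface: `__getitem__` now both interns strings and resolves hashes, while `as_int`/`as_string` are cheap one-way coercions. A minimal sketch of the intended round-trip, assuming a build of this v4 branch:

from spacy.strings import StringStore

store = StringStore()
h = store["coffee"]           # a str key is added to the store; its hash comes back
assert store[h] == "coffee"   # an int key is looked up as a hash
assert "coffee" in store and h in store
assert store.as_int("coffee") == h and store.as_string(h) == "coffee"
assert list(store) == store.keys()        # iteration follows insertion order
assert store.items() == [("coffee", h)]
assert store.values() == [h]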
@@ -58,14 +58,6 @@ cdef struct TokenC:
        hash_t ent_id


-cdef struct MorphAnalysisC:
-    hash_t key
-    int length
-
-    attr_t* fields
-    attr_t* features
-
-
# Internal struct, for storage and disambiguation of entities.
cdef struct KBEntryC:
@@ -1,5 +1,6 @@
+# DO NOT EDIT! The symbols are frozen as of spaCy v3.0.0.
cdef enum symbol_t:
-    NIL
+    NIL = 0
    IS_ALPHA
    IS_ASCII
    IS_DIGIT

@@ -65,7 +66,7 @@ cdef enum symbol_t:
    FLAG62
    FLAG63

-    ID
+    ID = 64
    ORTH
    LOWER
    NORM

@@ -385,7 +386,7 @@ cdef enum symbol_t:
    DEPRECATED275
    DEPRECATED276

-    PERSON
+    PERSON = 380
    NORP
    FACILITY
    ORG

@@ -405,7 +406,7 @@ cdef enum symbol_t:
    ORDINAL
    CARDINAL

-    acomp
+    acomp = 398
    advcl
    advmod
    agent

@@ -458,12 +459,12 @@ cdef enum symbol_t:
    rcmod
    root
    xcomp

    acl

-    ENT_KB_ID
+    ENT_KB_ID = 452
    MORPH
    ENT_ID

    IDX
-    _
+    _ = 456
+    # DO NOT ADD ANY NEW SYMBOLS!
@@ -469,11 +469,7 @@ IDS = {
}


-def sort_nums(x):
-    return x[1]
-
-
-NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
+NAMES = {v: k for k, v in IDS.items()}
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python
@@ -40,7 +40,7 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji #

To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions:

-- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
+- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email`.
- If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test.
- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version.
@@ -1,6 +1,10 @@
import pytest
from spacy.util import get_lang_class
+import functools
from hypothesis import settings
+import inspect
+import importlib
+import sys

# Functionally disable deadline settings for tests
# to prevent spurious test failures in CI builds.

@@ -47,6 +51,33 @@ def pytest_runtest_setup(item):
        pytest.skip("not referencing any issues")


+# Decorator for Cython-built tests
+# https://shwina.github.io/cython-testing/
+def cytest(func):
+    """
+    Wraps `func` in a plain Python function.
+    """
+
+    @functools.wraps(func)
+    def wrapped(*args, **kwargs):
+        bound = inspect.signature(func).bind(*args, **kwargs)
+        return func(*bound.args, **bound.kwargs)
+
+    return wrapped
+
+
+def register_cython_tests(cython_mod_name: str, test_mod_name: str):
+    """
+    Registers all callables with name `test_*` in Cython module `cython_mod_name`
+    as attributes in module `test_mod_name`, making them discoverable by pytest.
+    """
+    cython_mod = importlib.import_module(cython_mod_name)
+    for name in dir(cython_mod):
+        item = getattr(cython_mod, name)
+        if callable(item) and name.startswith("test_"):
+            setattr(sys.modules[test_mod_name], name, item)
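This pairing is exercised later in this compare view: `spacy/tests/parser/_search.pyx` holds the compiled tests and a three-line `test_search.py` shim registers them. As a rough sketch, any new Cython test module would follow the same shape (module names here are illustrative, not part of the diff):

# my_beam_tests.pyx -- compiled by Cython with binding=True
#
#     from ..conftest import cytest
#
#     @cytest
#     def test_something():
#         assert 1 + 1 == 2

# test_my_beam.py -- plain Python file that pytest collects
from ..conftest import register_cython_tests

register_cython_tests("spacy.tests.my_beam_tests", __name__)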
# Fixtures for language tokenizers (languages sorted alphabetically)
@@ -239,7 +270,7 @@ def hsb_tokenizer():

@pytest.fixture(scope="session")
def ko_tokenizer():
-    pytest.importorskip("natto")
+    pytest.importorskip("mecab_ko")
    return get_lang_class("ko")().tokenizer
@@ -261,6 +292,20 @@ def la_tokenizer():
    return get_lang_class("la")().tokenizer


+@pytest.fixture(scope="session")
+def ko_tokenizer_natto():
+    pytest.importorskip("natto")
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.KoreanNattoTokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
@pytest.fixture(scope="session")
def lb_tokenizer():
    return get_lang_class("lb")().tokenizer
@@ -45,6 +45,33 @@ def test_ents_reset(en_vocab):
    assert [t.ent_iob_ for t in doc] == orig_iobs


+def test_ents_clear(en_vocab):
+    """Ensure that removing entities clears token attributes"""
+    text = ["Louisiana", "Office", "of", "Conservation"]
+    doc = Doc(en_vocab, words=text)
+    entity = Span(doc, 0, 4, label=391, span_id="TEST")
+    doc.ents = [entity]
+    doc.ents = []
+    for token in doc:
+        assert token.ent_iob == 2
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+    doc.ents = [entity]
+    doc.set_ents([], default="missing")
+    for token in doc:
+        assert token.ent_iob == 0
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+    doc.set_ents([], default="blocked")
+    for token in doc:
+        assert token.ent_iob == 3
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+
+
def test_add_overlapping_entities(en_vocab):
    text = ["Louisiana", "Office", "of", "Conservation"]
    doc = Doc(en_vocab, words=text)
@@ -380,9 +380,7 @@ def test_doc_api_serialize(en_tokenizer, text):
    assert [t.text for t in tokens] == [t.text for t in new_tokens]
    assert [t.orth for t in tokens] == [t.orth for t in new_tokens]

-    new_tokens = Doc(tokens.vocab).from_bytes(
-        tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"]
-    )
+    new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
    assert tokens.text == new_tokens.text
    assert [t.text for t in tokens] == [t.text for t in new_tokens]
    assert [t.orth for t in tokens] == [t.orth for t in new_tokens]

@@ -990,3 +988,12 @@ def test_doc_spans_setdefault(en_tokenizer):
    assert len(doc.spans["key2"]) == 1
    doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]]))
    assert len(doc.spans["key3"]) == 2
+
+
+def test_doc_sentiment_from_bytes_v3_to_v4():
+    """Test if a doc with sentiment attribute created in v3.x works with '.from_bytes' in v4.x without throwing errors. The sentiment attribute was removed in v4"""
+    doc_bytes = b"\x89\xa4text\xa5happy\xaaarray_head\x9fGQACKOLMN\xcd\x01\xc4\xcd\x01\xc6I\xcd\x01\xc5JP\xaaarray_body\x85\xc4\x02nd\xc3\xc4\x04type\xa3<u8\xc4\x04kind\xc4\x00\xc4\x05shape\x92\x01\x0f\xc4\x04data\xc4x\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xa4\x9a\xd3\x17\xca\xf0b\x03\xa4\x9a\xd3\x17\xca\xf0b\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\xa9sentiment\xcb?\xf0\x00\x00\x00\x00\x00\x00\xa6tensor\x85\xc4\x02nd\xc3\xc4\x04type\xa3<f4\xc4\x04kind\xc4\x00\xc4\x05shape\x91\x00\xc4\x04data\xc4\x00\xa4cats\x80\xa5spans\xc4\x01\x90\xa7strings\x92\xa0\xa5happy\xb2has_unknown_spaces\xc2"
+    doc = Doc(Vocab()).from_bytes(doc_bytes)
+    assert doc.text == "happy"
+    with pytest.raises(AttributeError):
+        doc.sentiment == 1.0
@@ -4,7 +4,7 @@ from numpy.testing import assert_array_equal

from spacy.attrs import ORTH, LENGTH
from spacy.lang.en import English
-from spacy.tokens import Doc, Span, Token
+from spacy.tokens import Doc, Span, SpanGroup, Token
from spacy.vocab import Vocab
from spacy.util import filter_spans
from thinc.api import get_current_ops

@@ -163,6 +163,18 @@ def test_char_span(doc, i_sent, i, j, text):
    assert span.text == text


+@pytest.mark.issue(9556)
+def test_modify_span_group(doc):
+    group = SpanGroup(doc, spans=doc.ents)
+    for span in group:
+        span.start = 0
+        span.label = doc.vocab.strings["TEST"]
+
+    # Span changes must be reflected in the span group
+    assert group[0].start == 0
+    assert group[0].label == doc.vocab.strings["TEST"]
+
+
def test_spans_sent_spans(doc):
    sents = list(doc.sents)
    assert sents[0].start == 0

@@ -293,31 +305,6 @@ def test_span_similarity_match():
    assert span1[:1].similarity(doc.vocab["a"]) == 1.0


-def test_spans_default_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    assert doc[:2].sentiment == 3.0 / 2
-    assert doc[-2:].sentiment == -2.0 / 2
-    assert doc[:-1].sentiment == (3.0 + -2) / 3.0
-
-
-def test_spans_override_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    doc.user_span_hooks["sentiment"] = lambda span: 10.0
-    assert doc[:2].sentiment == 10.0
-    assert doc[-2:].sentiment == 10.0
-    assert doc[:-1].sentiment == 10.0
-
-
def test_spans_are_hashable(en_tokenizer):
    """Test spans can be hashed."""
    text = "good stuff bad stuff"

@@ -680,3 +667,23 @@ def test_span_group_copy(doc):
    assert len(doc.spans["test"]) == 3
    # check that the copy spans were not modified and this is an isolated doc
    assert len(doc_copy.spans["test"]) == 2
+
+
+@pytest.mark.issue(11113)
+def test_span_ent_id(en_tokenizer):
+    doc = en_tokenizer("a b c d")
+    doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")]
+    span = doc.ents[0]
+    assert doc[1].ent_id_ == "ID0"
+
+    # setting Span.id sets Token.ent_id
+    span.id_ = "ID1"
+    doc.ents = [span]
+    assert doc.ents[0].ent_id_ == "ID1"
+    assert doc[1].ent_id_ == "ID1"
+
+    # Span.ent_id is an alias of Span.id
+    span.ent_id_ = "ID2"
+    doc.ents = [span]
+    assert doc.ents[0].ent_id_ == "ID2"
+    assert doc[1].ent_id_ == "ID2"
@@ -102,8 +102,10 @@ def test_span_group_set_item(doc, other_doc):
    span.label_ = "NEW LABEL"
    span.kb_id = doc.vocab.strings["KB_ID"]

-    assert span_group[index].label != span.label
-    assert span_group[index].kb_id != span.kb_id
+    # Indexing a span group returns a span in which C
+    # data is shared.
+    assert span_group[index].label == span.label
+    assert span_group[index].kb_id == span.kb_id

    span_group[index] = span
    assert span_group[index].start == span.start
@@ -3,6 +3,10 @@ from mock import Mock
from spacy.tokens import Doc, Span, Token
from spacy.tokens.underscore import Underscore

+
+# Helper functions
+def _get_tuple(s: Span):
+    return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
+

@pytest.fixture(scope="function", autouse=True)
def clean_underscore():
@@ -171,3 +175,118 @@ def test_underscore_docstring(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    assert test_method.__doc__ == "I am a docstring"
    assert doc._.test_docstrings.__doc__.rsplit(". ")[-1] == "I am a docstring"
+
+
+def test_underscore_for_unique_span(en_tokenizer):
+    """Test that spans with the same boundaries but with different labels are uniquely identified (see #9706)."""
+    Doc.set_extension(name="doc_extension", default=None)
+    Span.set_extension(name="span_extension", default=None)
+    Token.set_extension(name="token_extension", default=None)
+
+    # Initialize doc
+    text = "Hello, world!"
+    doc = en_tokenizer(text)
+    span_1 = Span(doc, 0, 2, "SPAN_1")
+    span_2 = Span(doc, 0, 2, "SPAN_2")
+
+    # Set custom extensions
+    doc._.doc_extension = "doc extension"
+    doc[0]._.token_extension = "token extension"
+    span_1._.span_extension = "span_1 extension"
+    span_2._.span_extension = "span_2 extension"
+
+    # Assert extensions
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change label of span and assert extensions
+    span_1.label_ = "NEW_LABEL"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change KB_ID and assert extensions
+    span_1.kb_id_ = "KB_ID"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change extensions and assert
+    span_2._.span_extension = "updated span_2 extension"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
+
+    # Change span ID and assert extensions
+    span_2.id = 2
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
+
+    # Assert extensions with original key
+    assert doc.user_data[("._.", "doc_extension", None, None)] == "doc extension"
+    assert doc.user_data[("._.", "token_extension", 0, None)] == "token extension"
+
+
+def test_underscore_for_unique_span_from_docs(en_tokenizer):
+    """Test that spans in the user_data keep the same data structure when using Doc.from_docs"""
+    Span.set_extension(name="span_extension", default=None)
+    Token.set_extension(name="token_extension", default=None)
+
+    # Initialize doc
+    text_1 = "Hello, world!"
+    doc_1 = en_tokenizer(text_1)
+    span_1a = Span(doc_1, 0, 2, "SPAN_1a")
+    span_1b = Span(doc_1, 0, 2, "SPAN_1b")
+
+    text_2 = "This is a test."
+    doc_2 = en_tokenizer(text_2)
+    span_2a = Span(doc_2, 0, 3, "SPAN_2a")
+
+    # Set custom extensions
+    doc_1[0]._.token_extension = "token_1"
+    doc_2[1]._.token_extension = "token_2"
+    span_1a._.span_extension = "span_1a extension"
+    span_1b._.span_extension = "span_1b extension"
+    span_2a._.span_extension = "span_2a extension"
+
+    doc = Doc.from_docs([doc_1, doc_2])
+    # Assert extensions
+    assert doc_1.user_data[_get_tuple(span_1a)] == "span_1a extension"
+    assert doc_1.user_data[_get_tuple(span_1b)] == "span_1b extension"
+    assert doc_2.user_data[_get_tuple(span_2a)] == "span_2a extension"
+
+    # Check extensions on merged doc
+    assert doc.user_data[_get_tuple(span_1a)] == "span_1a extension"
+    assert doc.user_data[_get_tuple(span_1b)] == "span_1b extension"
+    assert (
+        doc.user_data[
+            (
+                "._.",
+                "span_extension",
+                span_2a.start_char + len(doc_1.text) + 1,
+                span_2a.end_char + len(doc_1.text) + 1,
+                span_2a.label,
+                span_2a.kb_id,
+                span_2a.id,
+            )
+        ]
+        == "span_2a extension"
+    )
+
+
+def test_underscore_for_unique_span_as_span(en_tokenizer):
+    """Test that spans in the user_data keep the same data structure when using Span.as_doc"""
+    Span.set_extension(name="span_extension", default=None)
+
+    # Initialize doc
+    text = "Hello, world!"
+    doc = en_tokenizer(text)
+    span_1 = Span(doc, 0, 2, "SPAN_1")
+    span_2 = Span(doc, 0, 2, "SPAN_2")
+
+    # Set custom extensions
+    span_1._.span_extension = "span_1 extension"
+    span_2._.span_extension = "span_2 extension"
+
+    span_doc = span_1.as_doc(copy_user_data=True)
+
+    # Assert extensions
+    assert span_doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert span_doc.user_data[_get_tuple(span_2)] == "span_2 extension"
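All three tests above rely on the same keying scheme: span-scoped extension values live in `doc.user_data` under a tuple of prefix, extension name, the span's character offsets, and then its label, KB ID and span ID, exactly as `_get_tuple` builds it. Because label and ID participate in the key, two spans with identical boundaries keep separate extension slots. Roughly (hash values invented for illustration):

# Hypothetical doc.user_data contents after assigning span._.span_extension
# on two same-boundary spans with different labels:
{
    ("._.", "span_extension", 0, 13, 11111, 0, 0): "span_1 extension",  # label SPAN_1
    ("._.", "span_extension", 0, 13, 22222, 0, 0): "span_2 extension",  # label SPAN_2
}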
@@ -7,3 +7,11 @@ import pytest
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
    test_lemma = ko_tokenizer(word)[0].lemma_
    assert test_lemma == lemma
+
+
+@pytest.mark.parametrize(
+    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+)
+def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
+    test_lemma = ko_tokenizer_natto(word)[0].lemma_
+    assert test_lemma == lemma
@@ -22,3 +22,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
    b = pickle.dumps(ko_tokenizer)
    ko_tokenizer_re = pickle.loads(b)
    assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
+
+
+def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
+    tokenizer_bytes = ko_tokenizer_natto.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer_natto.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
+    b = pickle.dumps(ko_tokenizer_natto)
+    ko_tokenizer_natto_re = pickle.loads(b)
+    assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()
@@ -19,6 +19,8 @@ POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
                "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
# fmt: on

+# tests for ko_tokenizer (default KoreanTokenizer)
+

@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):

@@ -44,7 +46,7 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
    assert pos == expected_pos.split()


-def test_ko_empty_doc(ko_tokenizer):
+def test_ko_tokenizer_empty_doc(ko_tokenizer):
    tokens = ko_tokenizer("")
    assert len(tokens) == 0

@@ -55,6 +57,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
    assert tokens[1].pos_ == "X"


+# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_natto(text)]
+    assert tokens == expected_tokens.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
+def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = [token.tag_ for token in ko_tokenizer_natto(text)]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
+def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = ko_tokenizer_natto(text).user_data["full_tags"]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
+def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
+    pos = [token.pos_ for token in ko_tokenizer_natto(text)]
+    assert pos == expected_pos.split()
+
+
+def test_ko_tokenizer_natto_empty_doc(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("")
+    assert len(tokens) == 0
+
+
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
# fmt: off
SPACY_TOKENIZER_TESTS = [
    ("있다.", "있다 ."),
@@ -26,14 +26,6 @@ def test_attrs_idempotence(text):
    assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}


-@pytest.mark.parametrize("text", ["dog"])
-def test_attrs_do_deprecated(text):
-    int_attrs = intify_attrs(
-        {"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
-    )
-    assert int_attrs == {ORTH: 10, IS_ALPHA: True}
-
-
def test_attrs_ent_iob_intify():
    int_attrs = intify_attrs({"ENT_IOB": ""})
    assert int_attrs == {ENT_IOB: 0}
@@ -50,8 +50,6 @@ def test_matcher_from_usage_docs(en_vocab):

    def label_sentiment(matcher, doc, i, matches):
        match_id, start, end = matches[i]
-        if doc.vocab.strings[match_id] == "HAPPY":
-            doc.sentiment += 0.1
        span = doc[start:end]
        with doc.retokenize() as retokenizer:
            retokenizer.merge(span)

@@ -61,7 +59,6 @@ def test_matcher_from_usage_docs(en_vocab):
    matcher = Matcher(en_vocab)
    matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
    matcher(doc)
-    assert doc.sentiment != 0
    assert doc[1].norm_ == "happy emoji"

@@ -793,9 +790,16 @@ def test_matcher_span(matcher):
    doc = Doc(matcher.vocab, words=text.split())
    span_js = doc[:3]
    span_java = doc[4:]
-    assert len(matcher(doc)) == 2
-    assert len(matcher(span_js)) == 1
-    assert len(matcher(span_java)) == 1
+    doc_matches = matcher(doc)
+    span_js_matches = matcher(span_js)
+    span_java_matches = matcher(span_java)
+    assert len(doc_matches) == 2
+    assert len(span_js_matches) == 1
+    assert len(span_java_matches) == 1
+
+    # match offsets always refer to the doc
+    assert doc_matches[0] == span_js_matches[0]
+    assert doc_matches[1] == span_java_matches[0]


def test_matcher_as_spans(matcher):
@@ -87,14 +87,15 @@ def test_issue4373():

@pytest.mark.issue(4651)
def test_issue4651_with_phrase_matcher_attr():
-    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
-    the method from_disk when the EntityRuler argument phrase_matcher_attr is
+    """Test that the entity_ruler PhraseMatcher is deserialized correctly using
+    the method from_disk when the entity_ruler argument phrase_matcher_attr is
    specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
-    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+    config = {"phrase_matcher_attr": "LOWER"}
+    ruler = nlp.add_pipe("entity_ruler", config=config)
    ruler.add_patterns(patterns)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]

@@ -102,7 +103,7 @@ def test_issue4651_with_phrase_matcher_attr():
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
-        nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path)
+        nlp_reloaded.add_pipe("entity_ruler", config=config).from_disk(file_path)
        doc_reloaded = nlp_reloaded(text)
        res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
        assert res == res_reloaded

@@ -198,28 +199,6 @@ def test_phrase_matcher_contains(en_vocab):
    assert "TEST2" not in matcher


-def test_phrase_matcher_add_new_api(en_vocab):
-    doc = Doc(en_vocab, words=["a", "b"])
-    patterns = [Doc(en_vocab, words=["a"]), Doc(en_vocab, words=["a", "b"])]
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("OLD_API", None, *patterns)
-    assert len(matcher(doc)) == 2
-    matcher = PhraseMatcher(en_vocab)
-    on_match = Mock()
-    matcher.add("OLD_API_CALLBACK", on_match, *patterns)
-    assert len(matcher(doc)) == 2
-    assert on_match.call_count == 2
-    # New API: add(key: str, patterns: List[List[dict]], on_match: Callable)
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("NEW_API", patterns)
-    assert len(matcher(doc)) == 2
-    matcher = PhraseMatcher(en_vocab)
-    on_match = Mock()
-    matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match)
-    assert len(matcher(doc)) == 2
-    assert on_match.call_count == 2
-
-
def test_phrase_matcher_repeated_add(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    # match ID only gets added once

@@ -468,6 +447,13 @@ def test_phrase_matcher_deprecated(en_vocab):
    assert "spaCy v3.0" in str(record.list[0].message)


+def test_phrase_matcher_non_doc(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    doc = Doc(en_vocab, words=["hello", "world"])
+    with pytest.raises(ValueError):
+        matcher.add("TEST", [doc, "junk"])
+
+
@pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"])
def test_phrase_matcher_sent_start(en_vocab, attr):
    _ = PhraseMatcher(en_vocab, attr=attr)  # noqa: F841
@@ -4,8 +4,8 @@ from pathlib import Path

def test_build_dependencies():
    # Check that library requirements are pinned exactly the same across different setup files.
    # TODO: correct checks for numpy rather than ignoring
    libs_ignore_requirements = [
        "cython",
        "pytest",
        "pytest-timeout",
        "mock",

@@ -22,7 +22,7 @@ def test_build_dependencies():
    # ignore language-specific packages that shouldn't be installed by all
    libs_ignore_setup = [
        "fugashi",
-        "natto-py",
+        "mecab-ko",
        "pythainlp",
        "sudachipy",
        "sudachidict_core",
spacy/tests/parser/_search.pyx (new file, 119 lines)
@@ -0,0 +1,119 @@
+# cython: infer_types=True, binding=True
+from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
+from spacy.typedefs cimport class_t, weight_t
+from cymem.cymem cimport Pool
+
+from ..conftest import cytest
+import pytest
+
+cdef struct TestState:
+    int length
+    int x
+    Py_UNICODE* string
+
+
+cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1:
+    dest_state = <TestState*>dest
+    src_state = <TestState*>src
+    dest_state.length = src_state.length
+    dest_state.x = src_state.x
+    dest_state.x += clas
+    if extra_args != NULL:
+        dest_state.string = <Py_UNICODE*>extra_args
+    else:
+        dest_state.string = src_state.string
+
+
+cdef void* initialize(Pool mem, int n, void* extra_args) except NULL:
+    state = <TestState*>mem.alloc(1, sizeof(TestState))
+    state.length = n
+    state.x = 1
+    if extra_args == NULL:
+        state.string = u'default'
+    else:
+        state.string = <Py_UNICODE*>extra_args
+    return state
+
+
+cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
+    state = <TestState*>state
+    mem.free(state)
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width",
+    [
+        (2, 3),
+        (3, 6),
+        (4, 20),
+    ]
+)
+def test_init(nr_class, beam_width):
+    b = Beam(nr_class, beam_width)
+    assert b.size == 1
+    assert b.width == beam_width
+    assert b.nr_class == nr_class
+
+
+@cytest
+def test_init_violn():
+    MaxViolation()
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length",
+    [
+        (2, 3, 3),
+        (3, 6, 15),
+        (4, 20, 32),
+    ]
+)
+def test_initialize(nr_class, beam_width, length):
+    b = Beam(nr_class, beam_width)
+    b.initialize(initialize, destroy, length, NULL)
+    for i in range(b.width):
+        s = <TestState*>b.at(i)
+        assert s.length == length, s.length
+        assert s.string == 'default'
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length,extra",
+    [
+        (2, 3, 4, None),
+        (3, 6, 15, u"test beam 1"),
+    ]
+)
+def test_initialize_extra(nr_class, beam_width, length, extra):
+    b = Beam(nr_class, beam_width)
+    if extra is None:
+        b.initialize(initialize, destroy, length, NULL)
+    else:
+        b.initialize(initialize, destroy, length, <void*><Py_UNICODE*>extra)
+    for i in range(b.width):
+        s = <TestState*>b.at(i)
+        assert s.length == length
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length",
+    [
+        (3, 6, 15),
+        (4, 20, 32),
+    ]
+)
+def test_transition(nr_class, beam_width, length):
+    b = Beam(nr_class, beam_width)
+    b.initialize(initialize, destroy, length, NULL)
+    b.set_cell(0, 2, 30, True, 0)
+    b.set_cell(0, 1, 42, False, 0)
+    b.advance(transition, NULL, NULL)
+    assert b.size == 1, b.size
+    assert b.score == 30, b.score
+    s = <TestState*>b.at(0)
+    assert s.x == 3
+    assert b._states[0].score == 30, b._states[0].score
+    b.set_cell(0, 1, 10, True, 0)
+    b.set_cell(0, 2, 20, True, 0)
+    b.advance(transition, NULL, NULL)
+    assert b._states[0].score == 50, b._states[0].score
+    assert b._states[1].score == 40
+    s = <TestState*>b.at(0)
+    assert s.x == 5
@@ -13,6 +13,7 @@ from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.training import Example, iob_to_biluo, split_bilu_label
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
+from thinc.api import fix_random_seed
import logging

from ..util import make_tempdir

@@ -412,7 +413,7 @@ def test_train_empty():
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("PERSON")
-    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
    for itn in range(2):
        losses = {}
        batches = util.minibatch(train_examples, size=8)

@@ -539,11 +540,11 @@ def test_block_ner():
    assert [token.ent_type_ for token in doc] == expected_types


-@pytest.mark.parametrize("use_upper", [True, False])
-def test_overfitting_IO(use_upper):
+def test_overfitting_IO():
    fix_random_seed(1)
    # Simple test to try and quickly overfit the NER component
    nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
+    ner = nlp.add_pipe("ner", config={"model": {}})
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))

@@ -575,7 +576,6 @@ def test_overfitting_IO(use_upper):
    assert ents2[0].label_ == "LOC"
    # Ensure that the predictions are still the same, even after adding a new label
    ner2 = nlp2.get_pipe("ner")
-    assert ner2.model.attrs["has_upper"] == use_upper
    ner2.add_label("RANDOM_NEW_LABEL")
    doc3 = nlp2(test_text)
    ents3 = doc3.ents

@@ -617,6 +617,52 @@ def test_overfitting_IO(use_upper):
    assert ents[1].kb_id == 0


+def test_is_distillable():
+    nlp = English()
+    ner = nlp.add_pipe("ner")
+    assert ner.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_ner = teacher.add_pipe("ner")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
+        for ent in annotations.get("entities"):
+            teacher_ner.add_label(ent[2])
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.00001
+
+    student = English()
+    student_ner = student.add_pipe("ner")
+    student_ner.initialize(
+        get_examples=lambda: train_examples, labels=teacher_ner.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(100):
+        losses = {}
+        student_ner.distill(teacher_ner, distill_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.0001
+
+    # test the trained model
+    test_text = "I like London."
+    doc = student(test_text)
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "London"
+    assert ents[0].label_ == "LOC"
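The same teacher-student pattern repeats below for the parser and the edit-tree lemmatizer, so it is worth seeing in isolation. A minimal sketch of the flow these tests exercise, assuming `teacher` is an English pipeline whose "ner" pipe is already trained (the helper name and loop count are illustrative):

from spacy.lang.en import English
from spacy.training import Example

def distill_ner_student(teacher, raw_texts, n_iter=100):
    teacher_ner = teacher.get_pipe("ner")
    student = English()
    student_ner = student.add_pipe("ner")
    # Examples with empty reference dicts: the student has no gold
    # annotations and learns from the teacher's predictions instead.
    examples = [Example.from_dict(teacher.make_doc(t), {}) for t in raw_texts]
    student_ner.initialize(
        get_examples=lambda: examples, labels=teacher_ner.label_data
    )
    optimizer = student.create_optimizer()
    for _ in range(n_iter):
        losses = {}
        student_ner.distill(teacher_ner, examples, sgd=optimizer, losses=losses)
    return student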


def test_beam_ner_scores():
    # Test that we can get confidence values out of the beam_ner pipe
    beam_width = 16
@@ -1,13 +1,17 @@
+import itertools
import pytest
+import numpy
from numpy.testing import assert_equal
+from thinc.api import Adam

+from spacy import registry, util
from spacy.attrs import DEP, NORM
from spacy.lang.en import English
+from spacy.tokens import Doc
from spacy.training import Example
-from spacy.tokens import Doc
from spacy.vocab import Vocab
-from spacy import util, registry
-from thinc.api import fix_random_seed

from ...pipeline import DependencyParser
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL

@@ -59,6 +63,8 @@ PARTIAL_DATA = [
    ),
]

+PARSERS = ["parser"]  # TODO: Test beam_parser when ready
+
eps = 0.1


@@ -171,6 +177,57 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
    assert doc[0].dep != 0


+def test_parser_apply_actions(en_vocab, en_parser):
+    words = ["I", "ate", "pizza"]
+    words2 = ["Eat", "more", "pizza", "!"]
+    doc1 = Doc(en_vocab, words=words)
+    doc2 = Doc(en_vocab, words=words2)
+    docs = [doc1, doc2]
+
+    moves = en_parser.moves
+    moves.add_action(0, "")
+    moves.add_action(1, "")
+    moves.add_action(2, "nsubj")
+    moves.add_action(3, "obj")
+    moves.add_action(2, "amod")
+
+    actions = [
+        numpy.array([0, 0], dtype="i"),
+        numpy.array([2, 0], dtype="i"),
+        numpy.array([0, 4], dtype="i"),
+        numpy.array([3, 3], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([0], dtype="i"),
+        numpy.array([1], dtype="i"),
+    ]
+
+    states = moves.init_batch(docs)
+    active_states = states
+
+    for step_actions in actions:
+        active_states = moves.apply_actions(active_states, step_actions)
+
+    assert len(active_states) == 0
+
+    for (state, doc) in zip(states, docs):
+        moves.set_annotations(state, doc)
+
+    assert docs[0][0].head.i == 1
+    assert docs[0][0].dep_ == "nsubj"
+    assert docs[0][1].head.i == 1
+    assert docs[0][1].dep_ == "ROOT"
+    assert docs[0][2].head.i == 1
+    assert docs[0][2].dep_ == "obj"
+
+    assert docs[1][0].head.i == 0
+    assert docs[1][0].dep_ == "ROOT"
+    assert docs[1][1].head.i == 2
+    assert docs[1][1].dep_ == "amod"
+    assert docs[1][2].head.i == 0
+    assert docs[1][2].dep_ == "obj"
+
+
@pytest.mark.skip(
    reason="The step_through API was removed (but should be brought back)"
)

@@ -319,7 +376,7 @@ def test_parser_constructor(en_vocab):
    DependencyParser(en_vocab, model)


-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+@pytest.mark.parametrize("pipe_name", PARSERS)
def test_incomplete_data(pipe_name):
    # Test that the parser works with incomplete information
    nlp = English()

@@ -345,11 +402,15 @@ def test_incomplete_data(pipe_name):
    assert doc[2].head.i == 1


-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
-def test_overfitting_IO(pipe_name):
+@pytest.mark.parametrize(
+    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
+)
+def test_overfitting_IO(pipe_name, max_moves):
    fix_random_seed(0)
    # Simple test to try and quickly overfit the dependency parser (normal or beam)
    nlp = English()
    parser = nlp.add_pipe(pipe_name)
+    parser.cfg["update_with_oracle_cut_size"] = max_moves
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))

@@ -396,16 +457,67 @@ def test_overfitting_IO(pipe_name):
    assert_equal(batch_deps_1, no_batch_deps)


+def test_is_distillable():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+    assert parser.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_parser = teacher.add_pipe("parser")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
+        for dep in annotations.get("deps", []):
+            teacher_parser.add_label(dep)
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(200):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["parser"] < 0.0001
+
+    student = English()
+    student_parser = student.add_pipe("parser")
+    student_parser.initialize(
+        get_examples=lambda: train_examples, labels=teacher_parser.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(200):
+        losses = {}
+        student_parser.distill(
+            teacher_parser, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["parser"] < 0.0001
+
+    test_text = "I like securities."
+    doc = student(test_text)
+    assert doc[0].dep_ == "nsubj"
+    assert doc[2].dep_ == "dobj"
+    assert doc[3].dep_ == "punct"
+    assert doc[0].head.i == 1
+    assert doc[2].head.i == 1
+    assert doc[3].head.i == 1
+
+
# fmt: off
@pytest.mark.slow
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
@pytest.mark.parametrize(
    "parser_config",
    [
-        # TransitionBasedParser V1
-        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        # TransitionBasedParser V2
+        # TODO: re-enable after we have a spacy-legacy release for v4. See
+        # https://github.com/explosion/spacy-legacy/pull/36
+        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
+        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
    ],
)
# fmt: on
spacy/tests/parser/test_search.py (new file, 3 lines)
@@ -0,0 +1,3 @@
+from ..conftest import register_cython_tests
+
+register_cython_tests("spacy.tests.parser._search", __name__)
@@ -1,3 +1,4 @@
+from typing import cast
import pickle
import pytest
from hypothesis import given

@@ -6,6 +7,7 @@ from spacy import util
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
+from spacy.pipeline.trainable_pipe import TrainablePipe
from spacy.training import Example
from spacy.strings import StringStore
from spacy.util import make_tempdir

@@ -193,6 +195,53 @@ def test_overfitting_IO():
    assert doc4[3].lemma_ == "egg"


+def test_is_distillable():
+    nlp = English()
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    assert lemmatizer.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_lemmatizer = teacher.add_pipe("trainable_lemmatizer")
+    teacher_lemmatizer.min_tree_freq = 1
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["trainable_lemmatizer"] < 0.00001
+
+    student = English()
+    student_lemmatizer = student.add_pipe("trainable_lemmatizer")
+    student_lemmatizer.min_tree_freq = 1
+    student_lemmatizer.initialize(
+        get_examples=lambda: train_examples, labels=teacher_lemmatizer.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(50):
+        losses = {}
+        student_lemmatizer.distill(
+            teacher_lemmatizer, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["trainable_lemmatizer"] < 0.00001
+
+    test_text = "She likes blue eggs"
+    doc = student(test_text)
+    assert doc[0].lemma_ == "she"
+    assert doc[1].lemma_ == "like"
+    assert doc[2].lemma_ == "blue"
+    assert doc[3].lemma_ == "egg"
+
+
def test_lemmatizer_requires_labels():
    nlp = English()
    nlp.add_pipe("trainable_lemmatizer")

@@ -313,3 +362,26 @@ def test_empty_strings():
    no_change = trees.add("xyz", "xyz")
    empty = trees.add("", "")
    assert no_change == empty
+
+
+def test_save_activations():
+    nlp = English()
+    lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer"))
+    lemmatizer.min_tree_freq = 1
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = lemmatizer.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "trainable_lemmatizer" not in doc.activations
+
+    lemmatizer.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["trainable_lemmatizer"].keys()) == [
+        "probabilities",
+        "tree_ids",
+    ]
+    assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
+    assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
@@ -1,7 +1,8 @@
-from typing import Callable, Iterable, Dict, Any
+from typing import Callable, Iterable, Dict, Any, cast

 import pytest
 from numpy.testing import assert_equal
+from thinc.types import Ragged

 from spacy import registry, util
 from spacy.attrs import ENT_KB_ID

@@ -10,7 +11,7 @@ from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
 from spacy.ml.models.entity_linker import build_span_maker
-from spacy.pipeline import EntityLinker
+from spacy.pipeline import EntityLinker, TrainablePipe
 from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer

@@ -1203,6 +1204,69 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
     assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL


+def test_save_activations():
+    nlp = English()
+    vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
+
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # create artificial KB - assign same prior weight to the two russ cochran's
+        # Q2146908 (Russ Cochran): American golfer
+        # Q7381115 (Russ Cochran): publisher
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="Russ Cochran",
+            entities=["Q2146908", "Q7381115"],
+            probabilities=[0.5, 0.5],
+        )
+        return mykb
+
+    # Create the Entity Linker component and add it to the pipeline
+    entity_linker = cast(TrainablePipe, nlp.add_pipe("entity_linker", last=True))
+    assert isinstance(entity_linker, EntityLinker)
+    entity_linker.set_kb(create_kb)
+    assert "Q2146908" in entity_linker.vocab.strings
+    assert "Q2146908" in entity_linker.kb.vocab.strings
+
+    # initialize the NEL pipe
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    nO = entity_linker.model.get_dim("nO")
+
+    nlp.add_pipe("sentencizer", first=True)
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
+        {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
+    doc = nlp("Russ Cochran was a publisher")
+    assert "entity_linker" not in doc.activations
+
+    entity_linker.save_activations = True
+    doc = nlp("Russ Cochran was a publisher")
+    assert set(doc.activations["entity_linker"].keys()) == {"ents", "scores"}
+    ents = doc.activations["entity_linker"]["ents"]
+    assert isinstance(ents, Ragged)
+    assert ents.data.shape == (2, 1)
+    assert ents.data.dtype == "uint64"
+    assert ents.lengths.shape == (1,)
+    scores = doc.activations["entity_linker"]["scores"]
+    assert isinstance(scores, Ragged)
+    assert scores.data.shape == (2, 1)
+    assert scores.data.dtype == "float32"
+    assert scores.lengths.shape == (1,)
+
+
 def test_span_maker_forward_with_empty():
     """The forward pass of the span maker may have a doc with no entities."""
     nlp = English()
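For the entity linker, the saved activations are `Ragged` arrays rather than plain tensors: each group of mentions contributes a variable number of rows, so `ents` and `scores` carry a flat `data` array plus `lengths`. A rough sketch of unpacking them, not part of the diff (it assumes a `doc` produced by a pipeline set up as in the test above, with `save_activations` enabled):

    # Sketch: walking the Ragged activations stored by the entity linker.
    from thinc.types import Ragged
    from spacy.tokens import Doc

    def unpack_linker_activations(doc: Doc):
        """Yield (entity-id hashes, scores) per group of candidate rows."""
        ents: Ragged = doc.activations["entity_linker"]["ents"]      # uint64 hashes
        scores: Ragged = doc.activations["entity_linker"]["scores"]  # float32
        start = 0
        for length in ents.lengths:
            yield (
                ents.data[start : start + length],
                scores.data[start : start + length],
            )
            start += length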
@@ -4,7 +4,7 @@ from spacy import registry
 from spacy.tokens import Doc, Span
 from spacy.language import Language
 from spacy.lang.en import English
-from spacy.pipeline import EntityRuler, EntityRecognizer, merge_entities
+from spacy.pipeline import EntityRecognizer, merge_entities
 from spacy.pipeline import SpanRuler
 from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError

@@ -12,8 +12,6 @@ from spacy.tests.util import make_tempdir

 from thinc.api import NumpyOps, get_current_ops

-ENTITY_RULERS = ["entity_ruler", "future_entity_ruler"]
-

 @pytest.fixture
 def nlp():

@@ -40,13 +38,12 @@ def add_ent_component(doc):


 @pytest.mark.issue(3345)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue3345(entity_ruler_factory):
+def test_issue3345():
     """Test case where preset entity crosses sentence boundary."""
     nlp = English()
     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
     doc[4].is_sent_start = True
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns([{"label": "GPE", "pattern": "New York"}])
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]

@@ -65,15 +62,14 @@ def test_issue3345(entity_ruler_factory):


 @pytest.mark.issue(4849)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue4849(entity_ruler_factory):
+def test_issue4849():
     nlp = English()
     patterns = [
         {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
         {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
     ]
     ruler = nlp.add_pipe(
-        entity_ruler_factory,
+        "entity_ruler",
         name="entity_ruler",
         config={"phrase_matcher_attr": "LOWER"},
     )

@@ -96,11 +92,10 @@ def test_issue4849(entity_ruler_factory):


 @pytest.mark.issue(5918)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue5918(entity_ruler_factory):
+def test_issue5918():
     # Test edge case when merging entities.
     nlp = English()
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "ORG", "pattern": "Digicon Inc"},
         {"label": "ORG", "pattern": "Rotan Mosle Inc's"},

@@ -125,10 +120,9 @@ def test_issue5918(entity_ruler_factory):


 @pytest.mark.issue(8168)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue8168(entity_ruler_factory):
+def test_issue8168():
     nlp = English()
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "ORG", "pattern": "Apple"},
         {

@@ -148,12 +142,9 @@ def test_issue8168(entity_ruler_factory):


 @pytest.mark.issue(8216)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_fix8216(nlp, patterns):
     """Test that patterns don't get added excessively."""
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"validate": True}
-    )
+    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
     ruler.add_patterns(patterns)
     pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
     assert pattern_count > 0

@@ -162,16 +153,15 @@ def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory):
     assert after_count == pattern_count


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_init(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     assert "HELLO" in ruler
     assert "BYE" in ruler
     nlp.remove_pipe("entity_ruler")
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     doc = nlp("hello world bye bye")
     assert len(doc.ents) == 2

@@ -179,23 +169,21 @@ def test_entity_ruler_init(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_no_patterns_warns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_no_patterns_warns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler) == 0
     assert len(ruler.labels) == 0
     nlp.remove_pipe("entity_ruler")
-    nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    nlp.add_pipe("entity_ruler")
     assert nlp.pipe_names == ["entity_ruler"]
     with pytest.warns(UserWarning):
         doc = nlp("hello world bye bye")
     assert len(doc.ents) == 0


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_init_patterns(nlp, patterns):
     # initialize with patterns
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler.labels) == 0
     ruler.initialize(lambda: [], patterns=patterns)
     assert len(ruler.labels) == 4

@@ -207,7 +195,7 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
     nlp.config["initialize"]["components"]["entity_ruler"] = {
         "patterns": {"@misc": "entity_ruler_patterns"}
     }
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler.labels) == 0
     nlp.initialize()
     assert len(ruler.labels) == 4

@@ -216,20 +204,18 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init_clear(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_init_clear(nlp, patterns):
     """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
     ruler.initialize(lambda: [])
     assert len(ruler.labels) == 0


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_clear(nlp, patterns):
     """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
     doc = nlp("hello world")

@@ -241,9 +227,8 @@ def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory):
     assert len(doc.ents) == 0


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_existing(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("OH HELLO WORLD bye bye")

@@ -252,11 +237,8 @@ def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_existing_overwrite(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("OH HELLO WORLD bye bye")

@@ -266,11 +248,8 @@ def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_existing_complex(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("foo foo bye bye")

@@ -281,11 +260,8 @@ def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory):
     assert len(doc.ents[1]) == 2


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_entity_id(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     doc = nlp("Apple is a technology company")
     assert len(doc.ents) == 1

@@ -293,26 +269,23 @@ def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory):
     assert doc.ents[0].ent_id_ == "a1"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_cfg_ent_id_sep(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_cfg_ent_id_sep(nlp, patterns):
     config = {"overwrite_ents": True, "ent_id_sep": "**"}
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler", config=config)
+    ruler = nlp.add_pipe("entity_ruler", config=config)
     ruler.add_patterns(patterns)
     doc = nlp("Apple is a technology company")
-    if isinstance(ruler, EntityRuler):
-        assert "TECH_ORG**a1" in ruler.phrase_patterns
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "TECH_ORG"
     assert doc.ents[0].ent_id_ == "a1"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory):
-    ruler = EntityRuler(nlp, patterns=patterns)
+def test_entity_ruler_serialize_bytes(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe("entity_ruler", name="new_ruler")
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
     new_ruler = new_ruler.from_bytes(ruler_bytes)

@@ -324,28 +297,27 @@ def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory):
     assert sorted(new_ruler.labels) == sorted(ruler.labels)


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_phrase_matcher_attr_bytes(
-    nlp, patterns, entity_ruler_factory
-):
-    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER", patterns=patterns)
+def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+    ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe(
+        "entity_ruler", name="new_ruler", config={"phrase_matcher_attr": "LOWER"}
+    )
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
-    assert new_ruler.phrase_matcher_attr is None
     new_ruler = new_ruler.from_bytes(ruler_bytes)
     assert len(new_ruler) == len(patterns)
     assert len(new_ruler.labels) == 4
     assert new_ruler.phrase_matcher_attr == "LOWER"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_validate(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
-    validated_ruler = EntityRuler(nlp, validate=True)
+def test_entity_ruler_validate(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
+    validated_ruler = nlp.add_pipe(
+        "entity_ruler", name="validated_ruler", config={"validate": True}
+    )

     valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
     invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}

@@ -362,16 +334,15 @@ def test_entity_ruler_validate(nlp, entity_ruler_factory):
         validated_ruler.add_patterns([invalid_pattern])


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_properties(nlp, patterns, entity_ruler_factory):
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
+def test_entity_ruler_properties(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
+    ruler.add_patterns(patterns)
     assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"])
-    assert sorted(ruler.ent_ids) == ["a1", "a2"]
+    assert sorted(ruler.ids) == ["a1", "a2"]


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_overlapping_spans(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "FOOBAR", "pattern": "foo bar"},
         {"label": "BARBAZ", "pattern": "bar baz"},

@@ -382,9 +353,8 @@ def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory):
     assert doc.ents[0].label_ == "FOOBAR"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_fuzzy_pipe(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
     ruler.add_patterns(patterns)
     doc = nlp("helloo")

@@ -392,9 +362,8 @@ def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
     assert doc.ents[0].label_ == "HELLO"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_fuzzy(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
     ruler.add_patterns(patterns)
     doc = nlp("helloo")

@@ -402,15 +371,13 @@ def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
     assert doc.ents[0].label_ == "HELLO"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory):
+def test_entity_ruler_fuzzy_disabled(nlp):
     @registry.misc("test_fuzzy_compare_disabled")
     def make_test_fuzzy_compare_disabled():
         return lambda x, y, z: False

     ruler = nlp.add_pipe(
-        entity_ruler_factory,
-        name="entity_ruler",
+        "entity_ruler",
         config={"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}},
     )
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]

@@ -420,14 +387,13 @@ def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory):


 @pytest.mark.parametrize("n_process", [1, 2])
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
+def test_entity_ruler_multiprocessing(nlp, n_process):
     if isinstance(get_current_ops, NumpyOps) or n_process < 2:
         texts = ["I enjoy eating Pizza Hut pizza."]

         patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}]

-        ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+        ruler = nlp.add_pipe("entity_ruler")
         ruler.add_patterns(patterns)

         for doc in nlp.pipe(texts, n_process=2):

@@ -435,9 +401,8 @@ def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
             assert ent.ent_id_ == "1234"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_serialize_jsonl(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
         ruler.to_disk(d / "test_ruler.jsonl")

@@ -446,9 +411,8 @@ def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
            ruler.from_disk(d / "non_existing.jsonl")  # read from a bad jsonl file


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_serialize_dir(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
         ruler.to_disk(d / "test_ruler")

@@ -457,9 +421,8 @@ def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory):
            ruler.from_disk(d / "non_existing_dir")  # read from a bad directory


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_basic(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_basic(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},

@@ -469,24 +432,16 @@ def test_entity_ruler_remove_basic(nlp, entity_ruler_factory):
     doc = nlp("Dina went to school")
     assert len(ruler.patterns) == 3
     assert len(doc.ents) == 1
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" in ruler.phrase_matcher
     assert doc.ents[0].label_ == "PERSON"
     assert doc.ents[0].text == "Dina"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina went to school")
     assert len(doc.ents) == 0
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" not in ruler.phrase_matcher
     assert len(ruler.patterns) == 2


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_same_id_multiple_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "DinaCorp", "id": "dina"},

@@ -495,25 +450,15 @@ def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory):
     ruler.add_patterns(patterns)
     doc = nlp("Dina founded DinaCorp and ACME.")
     assert len(ruler.patterns) == 3
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" in ruler.phrase_matcher
-        assert "ORG||dina" in ruler.phrase_matcher
     assert len(doc.ents) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina founded DinaCorp and ACME.")
     assert len(ruler.patterns) == 1
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" not in ruler.phrase_matcher
-        assert "ORG||dina" not in ruler.phrase_matcher
     assert len(doc.ents) == 1


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_nonexisting_pattern(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},

@@ -528,9 +473,8 @@ def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
         ruler.remove_by_id("nepattern")


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_several_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},

@@ -544,27 +488,20 @@ def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
     assert doc.ents[0].text == "Dina"
     assert doc.ents[1].label_ == "ORG"
     assert doc.ents[1].text == "ACME"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina founded her company ACME")
     assert len(ruler.patterns) == 2
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "ORG"
     assert doc.ents[0].text == "ACME"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("acme")
-    else:
-        ruler.remove_by_id("acme")
+    ruler.remove_by_id("acme")
     doc = nlp("Dina founded her company ACME")
     assert len(ruler.patterns) == 1
     assert len(doc.ents) == 0


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_patterns_in_a_row(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},

@@ -580,21 +517,15 @@ def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
     assert doc.ents[1].text == "ACME"
     assert doc.ents[2].label_ == "DATE"
     assert doc.ents[2].text == "her birthday"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-        ruler.remove("acme")
-        ruler.remove("bday")
-    else:
-        ruler.remove_by_id("dina")
-        ruler.remove_by_id("acme")
-        ruler.remove_by_id("bday")
+    ruler.remove_by_id("dina")
+    ruler.remove_by_id("acme")
+    ruler.remove_by_id("bday")
     doc = nlp("Dina went to school")
     assert len(doc.ents) == 0


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_all_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},

@@ -602,29 +533,19 @@ def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
     ]
     ruler.add_patterns(patterns)
     assert len(ruler.patterns) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     assert len(ruler.patterns) == 2
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("acme")
-    else:
-        ruler.remove_by_id("acme")
+    ruler.remove_by_id("acme")
     assert len(ruler.patterns) == 1
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("bday")
-    else:
-        ruler.remove_by_id("bday")
+    ruler.remove_by_id("bday")
     assert len(ruler.patterns) == 0
     with pytest.warns(UserWarning):
         doc = nlp("Dina founded her company ACME on her birthday")
     assert len(doc.ents) == 0


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_and_add(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "DATE", "pattern": "last time"}]
     ruler.add_patterns(patterns)
     doc = ruler(

@@ -645,10 +566,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
     assert doc.ents[0].text == "last time"
     assert doc.ents[1].label_ == "DATE"
     assert doc.ents[1].text == "this time"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("ttime")
-    else:
-        ruler.remove_by_id("ttime")
+    ruler.remove_by_id("ttime")
     doc = ruler(
         nlp.make_doc("I saw him last time we met, this time he brought some flowers")
     )

@@ -671,10 +589,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
     )
     assert len(ruler.patterns) == 3
     assert len(doc.ents) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("ttime")
-    else:
-        ruler.remove_by_id("ttime")
+    ruler.remove_by_id("ttime")
     doc = ruler(
         nlp.make_doc(
             "I saw him last time we met, this time he brought some flowers, another time some chocolate."
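Taken together, the entity ruler changes drop the dual-implementation test matrix: the `ENTITY_RULERS` parametrization ("entity_ruler" vs. "future_entity_ruler") disappears, every `isinstance(ruler, EntityRuler)` branch goes away, and the SpanRuler-style spellings win out: `remove_by_id` instead of `remove`, `ids` instead of `ent_ids`, and construction only through `nlp.add_pipe`. A small sketch of the removal API exercised above (illustrative, reusing a pattern id from these tests):

    # Sketch: pattern removal after this change, on this dev branch.
    from spacy.lang.en import English

    nlp = English()
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns([{"label": "PERSON", "pattern": "Dina", "id": "dina"}])

    # Old EntityRuler spelling (removed here): ruler.remove("dina")
    ruler.remove_by_id("dina")  # the single supported spelling
    assert len(ruler.patterns) == 0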
@@ -9,7 +9,7 @@ from thinc.types import Array2d, Ragged

 from spacy.lang.en import English
 from spacy.ml import FeatureExtractor, StaticVectors
-from spacy.ml._character_embed import CharacterEmbed
+from spacy.ml.character_embed import CharacterEmbed
 from spacy.tokens import Doc


@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_equal

@@ -7,6 +8,7 @@ from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tests.util import make_tempdir
 from spacy.morphology import Morphology
+from spacy.pipeline import TrainablePipe
 from spacy.attrs import MORPH
 from spacy.tokens import Doc

@@ -48,6 +50,12 @@ def test_implicit_label():
     nlp.initialize(get_examples=lambda: train_examples)


+def test_is_distillable():
+    nlp = English()
+    morphologizer = nlp.add_pipe("morphologizer")
+    assert morphologizer.is_distillable
+
+
 def test_no_resize():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")

@@ -197,3 +205,25 @@ def test_overfitting_IO():
     gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
     assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags
+
+
+def test_save_activations():
+    nlp = English()
+    morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
+    train_examples = []
+    for inst in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    doc = nlp("This is a test.")
+    assert "morphologizer" not in doc.activations
+
+    morphologizer.save_activations = True
+    doc = nlp("This is a test.")
+    assert "morphologizer" in doc.activations
+    assert set(doc.activations["morphologizer"].keys()) == {
+        "label_ids",
+        "probabilities",
+    }
+    assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
+    assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
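`test_is_distillable` marks another theme of this branch: pipes advertise whether they can act as a student in teacher-student distillation via an `is_distillable` attribute. Only the attribute access is shown in the diff; the filtering pattern below is an illustrative sketch of how one might use it:

    # Sketch: list which components could be distilled into a student
    # pipeline. Rule-based pipes like entity_ruler are expected to opt out.
    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe("morphologizer")
    nlp.add_pipe("entity_ruler")

    distillable = [
        name for name, pipe in nlp.pipeline if getattr(pipe, "is_distillable", False)
    ]
    print(distillable)  # expected: ["morphologizer"]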
@@ -529,17 +529,6 @@ def test_pipe_label_data_no_labels(pipe):
     assert "labels" not in get_arg_names(initialize)


-def test_warning_pipe_begin_training():
-    with pytest.warns(UserWarning, match="begin_training"):
-
-        class IncompatPipe(TrainablePipe):
-            def __init__(self):
-                ...
-
-            def begin_training(*args, **kwargs):
-                ...
-
-
 def test_pipe_methods_initialize():
     """Test that the [initialize] config reflects the components correctly."""
     nlp = Language()
@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_equal
 from spacy.attrs import SENT_START

@@ -6,9 +7,16 @@ from spacy import util
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.language import Language
+from spacy.pipeline import TrainablePipe
 from spacy.tests.util import make_tempdir


+def test_is_distillable():
+    nlp = English()
+    senter = nlp.add_pipe("senter")
+    assert senter.is_distillable
+
+
 def test_label_types():
     nlp = Language()
     senter = nlp.add_pipe("senter")

@@ -101,3 +109,26 @@ def test_overfitting_IO():
     # test internal pipe labels vs. Language.pipe_labels with hidden labels
     assert nlp.get_pipe("senter").labels == ("I", "S")
     assert "senter" not in nlp.pipe_labels
+
+
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    senter = cast(TrainablePipe, nlp.add_pipe("senter"))
+
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = senter.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "senter" not in doc.activations
+
+    senter.save_activations = True
+    doc = nlp("This is a test.")
+    assert "senter" in doc.activations
+    assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
+    assert doc.activations["senter"]["probabilities"].shape == (5, nO)
+    assert doc.activations["senter"]["label_ids"].shape == (5,)
@@ -1,15 +1,15 @@
 import pytest
 import numpy
 from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops, Ragged
+from thinc.api import get_current_ops, Ragged, fix_random_seed

 from spacy import util
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tokens import SpanGroup
-from spacy.tokens._dict_proxies import SpanGroups
+from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
-from spacy.util import fix_random_seed, registry, make_tempdir
+from spacy.util import registry, make_tempdir

 OPS = get_current_ops()

@@ -444,3 +444,23 @@ def test_set_candidates():
     assert len(docs[0].spans["candidates"]) == 9
     assert docs[0].spans["candidates"][0].text == "Just"
     assert docs[0].spans["candidates"][4].text == "Just a"
+
+
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    train_examples = make_examples(nlp)
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = spancat.model.get_dim("nO")
+    assert nO == 2
+    assert set(spancat.labels) == {"LOC", "PERSON"}
+
+    doc = nlp("This is a test.")
+    assert "spancat" not in doc.activations
+
+    spancat.save_activations = True
+    doc = nlp("This is a test.")
+    assert set(doc.activations["spancat"].keys()) == {"indices", "scores"}
+    assert doc.activations["spancat"]["indices"].shape == (12, 2)
+    assert doc.activations["spancat"]["scores"].shape == (12, nO)
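The hard-coded 12 in the spancat assertions falls out of the suggester: assuming the default n-gram suggester over sizes [1, 2, 3] (the suggester config is not shown in this hunk), a 5-token doc like "This is a test." yields 5 + 4 + 3 = 12 candidate spans, each stored as a (start, end) row in `indices` and scored against the two labels in `scores`. A tiny sketch of that arithmetic:

    # Sketch: why the test expects 12 candidate spans (assumes an ngram
    # suggester with sizes [1, 2, 3]).
    def n_ngram_spans(n_tokens: int, sizes=(1, 2, 3)) -> int:
        return sum(max(0, n_tokens - size + 1) for size in sizes)

    assert n_ngram_spans(5) == 5 + 4 + 3 == 12  # "This is a test." -> 5 tokens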
Some files were not shown because too many files have changed in this diff.