Merge branch 'explosion:master' into master

This commit is contained in:
Peter Baumgartner 2023-01-23 13:45:27 -05:00 committed by GitHub
commit f6e39a3072
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 78 additions and 24 deletions

View File

@ -5,8 +5,8 @@ from itertools import islice
import numpy as np import numpy as np
import srsly import srsly
from thinc.api import Config, Model, SequenceCategoricalCrossentropy from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
from thinc.types import Floats2d, Ints1d, Ints2d from thinc.types import Floats2d, Ints2d
from ._edit_tree_internals.edit_trees import EditTrees from ._edit_tree_internals.edit_trees import EditTrees
from ._edit_tree_internals.schemas import validate_edit_tree from ._edit_tree_internals.schemas import validate_edit_tree
@ -20,6 +20,10 @@ from ..vocab import Vocab
from .. import util from .. import util
# The cutoff value of *top_k* above which an alternative method is used to process guesses.
TOP_K_GUARDRAIL = 20
default_model_config = """ default_model_config = """
[model] [model]
@architectures = "spacy.Tagger.v2" @architectures = "spacy.Tagger.v2"
@ -115,6 +119,7 @@ class EditTreeLemmatizer(TrainablePipe):
self.cfg: Dict[str, Any] = {"labels": []} self.cfg: Dict[str, Any] = {"labels": []}
self.scorer = scorer self.scorer = scorer
self.numpy_ops = NumpyOps()
def get_loss( def get_loss(
self, examples: Iterable[Example], scores: List[Floats2d] self, examples: Iterable[Example], scores: List[Floats2d]
@ -144,6 +149,18 @@ class EditTreeLemmatizer(TrainablePipe):
return float(loss), d_scores return float(loss), d_scores
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]: def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
if self.top_k == 1:
scores2guesses = self._scores2guesses_top_k_equals_1
elif self.top_k <= TOP_K_GUARDRAIL:
scores2guesses = self._scores2guesses_top_k_greater_1
else:
scores2guesses = self._scores2guesses_top_k_guardrail
# The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
# of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
# for its principal purpose of lemmatizing tokens. However, the code could also
# be used for other purposes, and with very large values of *top_k* the method
# becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
# instead.
n_docs = len(list(docs)) n_docs = len(list(docs))
if not any(len(doc) for doc in docs): if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs. # Handle cases where there are no tokens in any docs.
@ -153,20 +170,52 @@ class EditTreeLemmatizer(TrainablePipe):
return guesses return guesses
scores = self.model.predict(docs) scores = self.model.predict(docs)
assert len(scores) == n_docs assert len(scores) == n_docs
guesses = self._scores2guesses(docs, scores) guesses = scores2guesses(docs, scores)
assert len(guesses) == n_docs assert len(guesses) == n_docs
return guesses return guesses
def _scores2guesses(self, docs, scores): def _scores2guesses_top_k_equals_1(self, docs, scores):
guesses = [] guesses = []
for doc, doc_scores in zip(docs, scores): for doc, doc_scores in zip(docs, scores):
if self.top_k == 1: doc_guesses = doc_scores.argmax(axis=1)
doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1) doc_guesses = self.numpy_ops.asarray(doc_guesses)
else:
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
if not isinstance(doc_guesses, np.ndarray): doc_compat_guesses = []
doc_guesses = doc_guesses.get() for i, token in enumerate(doc):
tree_id = self.cfg["labels"][doc_guesses[i]]
if self.trees.apply(tree_id, token.text) is not None:
doc_compat_guesses.append(tree_id)
else:
doc_compat_guesses.append(-1)
guesses.append(np.array(doc_compat_guesses))
return guesses
def _scores2guesses_top_k_greater_1(self, docs, scores):
guesses = []
top_k = min(self.top_k, len(self.labels))
for doc, doc_scores in zip(docs, scores):
doc_scores = self.numpy_ops.asarray(doc_scores)
doc_compat_guesses = []
for i, token in enumerate(doc):
for _ in range(top_k):
candidate = int(doc_scores[i].argmax())
candidate_tree_id = self.cfg["labels"][candidate]
if self.trees.apply(candidate_tree_id, token.text) is not None:
doc_compat_guesses.append(candidate_tree_id)
break
doc_scores[i, candidate] = np.finfo(np.float32).min
else:
doc_compat_guesses.append(-1)
guesses.append(np.array(doc_compat_guesses))
return guesses
def _scores2guesses_top_k_guardrail(self, docs, scores):
guesses = []
for doc, doc_scores in zip(docs, scores):
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
doc_guesses = self.numpy_ops.asarray(doc_guesses)
doc_compat_guesses = [] doc_compat_guesses = []
for token, candidates in zip(doc, doc_guesses): for token, candidates in zip(doc, doc_guesses):

View File

@ -101,14 +101,15 @@ def test_initialize_from_labels():
} }
def test_no_data(): @pytest.mark.parametrize("top_k", (1, 5, 30))
def test_no_data(top_k):
# Test that the lemmatizer provides a nice error when there's no tagging data / labels # Test that the lemmatizer provides a nice error when there's no tagging data / labels
TEXTCAT_DATA = [ TEXTCAT_DATA = [
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
] ]
nlp = English() nlp = English()
nlp.add_pipe("trainable_lemmatizer") nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
nlp.add_pipe("textcat") nlp.add_pipe("textcat")
train_examples = [] train_examples = []
@ -119,10 +120,11 @@ def test_no_data():
nlp.initialize(get_examples=lambda: train_examples) nlp.initialize(get_examples=lambda: train_examples)
def test_incomplete_data(): @pytest.mark.parametrize("top_k", (1, 5, 30))
def test_incomplete_data(top_k):
# Test that the lemmatizer works with incomplete information # Test that the lemmatizer works with incomplete information
nlp = English() nlp = English()
lemmatizer = nlp.add_pipe("trainable_lemmatizer") lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
lemmatizer.min_tree_freq = 1 lemmatizer.min_tree_freq = 1
train_examples = [] train_examples = []
for t in PARTIAL_DATA: for t in PARTIAL_DATA:
@ -154,9 +156,10 @@ def test_incomplete_data():
assert xp.count_nonzero(dX[1][1]) == 0 assert xp.count_nonzero(dX[1][1]) == 0
def test_overfitting_IO(): @pytest.mark.parametrize("top_k", (1, 5, 30))
def test_overfitting_IO(top_k):
nlp = English() nlp = English()
lemmatizer = nlp.add_pipe("trainable_lemmatizer") lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
lemmatizer.min_tree_freq = 1 lemmatizer.min_tree_freq = 1
train_examples = [] train_examples = []
for t in TRAIN_DATA: for t in TRAIN_DATA:
@ -189,7 +192,7 @@ def test_overfitting_IO():
# Check model after a {to,from}_bytes roundtrip # Check model after a {to,from}_bytes roundtrip
nlp_bytes = nlp.to_bytes() nlp_bytes = nlp.to_bytes()
nlp3 = English() nlp3 = English()
nlp3.add_pipe("trainable_lemmatizer") nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
nlp3.from_bytes(nlp_bytes) nlp3.from_bytes(nlp_bytes)
doc3 = nlp3(test_text) doc3 = nlp3(test_text)
assert doc3[0].lemma_ == "she" assert doc3[0].lemma_ == "she"

View File

@ -1074,7 +1074,7 @@ def test_cli_find_threshold(capsys):
) )
with make_tempdir() as nlp_dir: with make_tempdir() as nlp_dir:
nlp.to_disk(nlp_dir) nlp.to_disk(nlp_dir)
res = find_threshold( best_threshold, best_score, res = find_threshold(
model=nlp_dir, model=nlp_dir,
data_path=docs_dir / "docs.spacy", data_path=docs_dir / "docs.spacy",
pipe_name="tc_multi", pipe_name="tc_multi",
@ -1082,10 +1082,10 @@ def test_cli_find_threshold(capsys):
scores_key="cats_macro_f", scores_key="cats_macro_f",
silent=True, silent=True,
) )
assert res[0] != thresholds[0] assert best_threshold != thresholds[0]
assert thresholds[0] < res[0] < thresholds[9] assert thresholds[0] < best_threshold < thresholds[9]
assert res[1] == 1.0 assert best_score == max(res.values())
assert res[2][1.0] == 0.0 assert res[1.0] == 0.0
# Test with spancat. # Test with spancat.
nlp, _ = init_nlp((("spancat", {}),)) nlp, _ = init_nlp((("spancat", {}),))

View File

@ -9,6 +9,8 @@ import socialImageLegacy from '../images/social_legacy.jpg'
import siteMetadata from '../../meta/site.json' import siteMetadata from '../../meta/site.json'
import Head from 'next/head' import Head from 'next/head'
import { siteUrl } from '../../meta/dynamicMeta.mjs'
function getPageTitle(title, sitename, slogan, sectionTitle, nightly, legacy) { function getPageTitle(title, sitename, slogan, sectionTitle, nightly, legacy) {
if (sectionTitle && title) { if (sectionTitle && title) {
const suffix = nightly ? ' (nightly)' : legacy ? ' (legacy)' : '' const suffix = nightly ? ' (nightly)' : legacy ? ' (legacy)' : ''
@ -25,7 +27,7 @@ function getImage(section, nightly, legacy) {
if (legacy) return socialImageLegacy if (legacy) return socialImageLegacy
if (section === 'api') return socialImageApi if (section === 'api') return socialImageApi
if (section === 'universe') return socialImageUniverse if (section === 'universe') return socialImageUniverse
return socialImageDefault return `${siteUrl}${socialImageDefault.src}`
} }
export default function SEO({ export default function SEO({
@ -46,7 +48,7 @@ export default function SEO({
nightly, nightly,
legacy legacy
) )
const socialImage = getImage(section, nightly, legacy).src const socialImage = getImage(section, nightly, legacy)
const meta = [ const meta = [
{ {
name: 'description', name: 'description',