mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-15 02:32:37 +03:00
Merge branch 'explosion:master' into master
This commit is contained in:
commit
f6e39a3072
|
@ -5,8 +5,8 @@ from itertools import islice
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
|
||||||
from thinc.types import Floats2d, Ints1d, Ints2d
|
from thinc.types import Floats2d, Ints2d
|
||||||
|
|
||||||
from ._edit_tree_internals.edit_trees import EditTrees
|
from ._edit_tree_internals.edit_trees import EditTrees
|
||||||
from ._edit_tree_internals.schemas import validate_edit_tree
|
from ._edit_tree_internals.schemas import validate_edit_tree
|
||||||
|
@ -20,6 +20,10 @@ from ..vocab import Vocab
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
# The cutoff value of *top_k* above which an alternative method is used to process guesses.
|
||||||
|
TOP_K_GUARDRAIL = 20
|
||||||
|
|
||||||
|
|
||||||
default_model_config = """
|
default_model_config = """
|
||||||
[model]
|
[model]
|
||||||
@architectures = "spacy.Tagger.v2"
|
@architectures = "spacy.Tagger.v2"
|
||||||
|
@ -115,6 +119,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
||||||
|
|
||||||
self.cfg: Dict[str, Any] = {"labels": []}
|
self.cfg: Dict[str, Any] = {"labels": []}
|
||||||
self.scorer = scorer
|
self.scorer = scorer
|
||||||
|
self.numpy_ops = NumpyOps()
|
||||||
|
|
||||||
def get_loss(
|
def get_loss(
|
||||||
self, examples: Iterable[Example], scores: List[Floats2d]
|
self, examples: Iterable[Example], scores: List[Floats2d]
|
||||||
|
@ -144,6 +149,18 @@ class EditTreeLemmatizer(TrainablePipe):
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
|
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
|
||||||
|
if self.top_k == 1:
|
||||||
|
scores2guesses = self._scores2guesses_top_k_equals_1
|
||||||
|
elif self.top_k <= TOP_K_GUARDRAIL:
|
||||||
|
scores2guesses = self._scores2guesses_top_k_greater_1
|
||||||
|
else:
|
||||||
|
scores2guesses = self._scores2guesses_top_k_guardrail
|
||||||
|
# The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
|
||||||
|
# of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
|
||||||
|
# for its principal purpose of lemmatizing tokens. However, the code could also
|
||||||
|
# be used for other purposes, and with very large values of *top_k* the method
|
||||||
|
# becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
|
||||||
|
# instead.
|
||||||
n_docs = len(list(docs))
|
n_docs = len(list(docs))
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
|
@ -153,20 +170,52 @@ class EditTreeLemmatizer(TrainablePipe):
|
||||||
return guesses
|
return guesses
|
||||||
scores = self.model.predict(docs)
|
scores = self.model.predict(docs)
|
||||||
assert len(scores) == n_docs
|
assert len(scores) == n_docs
|
||||||
guesses = self._scores2guesses(docs, scores)
|
guesses = scores2guesses(docs, scores)
|
||||||
assert len(guesses) == n_docs
|
assert len(guesses) == n_docs
|
||||||
return guesses
|
return guesses
|
||||||
|
|
||||||
def _scores2guesses(self, docs, scores):
|
def _scores2guesses_top_k_equals_1(self, docs, scores):
|
||||||
guesses = []
|
guesses = []
|
||||||
for doc, doc_scores in zip(docs, scores):
|
for doc, doc_scores in zip(docs, scores):
|
||||||
if self.top_k == 1:
|
doc_guesses = doc_scores.argmax(axis=1)
|
||||||
doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
|
doc_guesses = self.numpy_ops.asarray(doc_guesses)
|
||||||
else:
|
|
||||||
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
|
|
||||||
|
|
||||||
if not isinstance(doc_guesses, np.ndarray):
|
doc_compat_guesses = []
|
||||||
doc_guesses = doc_guesses.get()
|
for i, token in enumerate(doc):
|
||||||
|
tree_id = self.cfg["labels"][doc_guesses[i]]
|
||||||
|
if self.trees.apply(tree_id, token.text) is not None:
|
||||||
|
doc_compat_guesses.append(tree_id)
|
||||||
|
else:
|
||||||
|
doc_compat_guesses.append(-1)
|
||||||
|
guesses.append(np.array(doc_compat_guesses))
|
||||||
|
|
||||||
|
return guesses
|
||||||
|
|
||||||
|
def _scores2guesses_top_k_greater_1(self, docs, scores):
|
||||||
|
guesses = []
|
||||||
|
top_k = min(self.top_k, len(self.labels))
|
||||||
|
for doc, doc_scores in zip(docs, scores):
|
||||||
|
doc_scores = self.numpy_ops.asarray(doc_scores)
|
||||||
|
doc_compat_guesses = []
|
||||||
|
for i, token in enumerate(doc):
|
||||||
|
for _ in range(top_k):
|
||||||
|
candidate = int(doc_scores[i].argmax())
|
||||||
|
candidate_tree_id = self.cfg["labels"][candidate]
|
||||||
|
if self.trees.apply(candidate_tree_id, token.text) is not None:
|
||||||
|
doc_compat_guesses.append(candidate_tree_id)
|
||||||
|
break
|
||||||
|
doc_scores[i, candidate] = np.finfo(np.float32).min
|
||||||
|
else:
|
||||||
|
doc_compat_guesses.append(-1)
|
||||||
|
guesses.append(np.array(doc_compat_guesses))
|
||||||
|
|
||||||
|
return guesses
|
||||||
|
|
||||||
|
def _scores2guesses_top_k_guardrail(self, docs, scores):
|
||||||
|
guesses = []
|
||||||
|
for doc, doc_scores in zip(docs, scores):
|
||||||
|
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
|
||||||
|
doc_guesses = self.numpy_ops.asarray(doc_guesses)
|
||||||
|
|
||||||
doc_compat_guesses = []
|
doc_compat_guesses = []
|
||||||
for token, candidates in zip(doc, doc_guesses):
|
for token, candidates in zip(doc, doc_guesses):
|
||||||
|
|
|
@ -101,14 +101,15 @@ def test_initialize_from_labels():
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def test_no_data():
|
@pytest.mark.parametrize("top_k", (1, 5, 30))
|
||||||
|
def test_no_data(top_k):
|
||||||
# Test that the lemmatizer provides a nice error when there's no tagging data / labels
|
# Test that the lemmatizer provides a nice error when there's no tagging data / labels
|
||||||
TEXTCAT_DATA = [
|
TEXTCAT_DATA = [
|
||||||
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
|
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
|
||||||
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
|
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
|
||||||
]
|
]
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.add_pipe("trainable_lemmatizer")
|
nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
|
||||||
nlp.add_pipe("textcat")
|
nlp.add_pipe("textcat")
|
||||||
|
|
||||||
train_examples = []
|
train_examples = []
|
||||||
|
@ -119,10 +120,11 @@ def test_no_data():
|
||||||
nlp.initialize(get_examples=lambda: train_examples)
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
|
||||||
|
|
||||||
def test_incomplete_data():
|
@pytest.mark.parametrize("top_k", (1, 5, 30))
|
||||||
|
def test_incomplete_data(top_k):
|
||||||
# Test that the lemmatizer works with incomplete information
|
# Test that the lemmatizer works with incomplete information
|
||||||
nlp = English()
|
nlp = English()
|
||||||
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
|
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
|
||||||
lemmatizer.min_tree_freq = 1
|
lemmatizer.min_tree_freq = 1
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for t in PARTIAL_DATA:
|
for t in PARTIAL_DATA:
|
||||||
|
@ -154,9 +156,10 @@ def test_incomplete_data():
|
||||||
assert xp.count_nonzero(dX[1][1]) == 0
|
assert xp.count_nonzero(dX[1][1]) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting_IO():
|
@pytest.mark.parametrize("top_k", (1, 5, 30))
|
||||||
|
def test_overfitting_IO(top_k):
|
||||||
nlp = English()
|
nlp = English()
|
||||||
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
|
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
|
||||||
lemmatizer.min_tree_freq = 1
|
lemmatizer.min_tree_freq = 1
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for t in TRAIN_DATA:
|
for t in TRAIN_DATA:
|
||||||
|
@ -189,7 +192,7 @@ def test_overfitting_IO():
|
||||||
# Check model after a {to,from}_bytes roundtrip
|
# Check model after a {to,from}_bytes roundtrip
|
||||||
nlp_bytes = nlp.to_bytes()
|
nlp_bytes = nlp.to_bytes()
|
||||||
nlp3 = English()
|
nlp3 = English()
|
||||||
nlp3.add_pipe("trainable_lemmatizer")
|
nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
|
||||||
nlp3.from_bytes(nlp_bytes)
|
nlp3.from_bytes(nlp_bytes)
|
||||||
doc3 = nlp3(test_text)
|
doc3 = nlp3(test_text)
|
||||||
assert doc3[0].lemma_ == "she"
|
assert doc3[0].lemma_ == "she"
|
||||||
|
|
|
@ -1074,7 +1074,7 @@ def test_cli_find_threshold(capsys):
|
||||||
)
|
)
|
||||||
with make_tempdir() as nlp_dir:
|
with make_tempdir() as nlp_dir:
|
||||||
nlp.to_disk(nlp_dir)
|
nlp.to_disk(nlp_dir)
|
||||||
res = find_threshold(
|
best_threshold, best_score, res = find_threshold(
|
||||||
model=nlp_dir,
|
model=nlp_dir,
|
||||||
data_path=docs_dir / "docs.spacy",
|
data_path=docs_dir / "docs.spacy",
|
||||||
pipe_name="tc_multi",
|
pipe_name="tc_multi",
|
||||||
|
@ -1082,10 +1082,10 @@ def test_cli_find_threshold(capsys):
|
||||||
scores_key="cats_macro_f",
|
scores_key="cats_macro_f",
|
||||||
silent=True,
|
silent=True,
|
||||||
)
|
)
|
||||||
assert res[0] != thresholds[0]
|
assert best_threshold != thresholds[0]
|
||||||
assert thresholds[0] < res[0] < thresholds[9]
|
assert thresholds[0] < best_threshold < thresholds[9]
|
||||||
assert res[1] == 1.0
|
assert best_score == max(res.values())
|
||||||
assert res[2][1.0] == 0.0
|
assert res[1.0] == 0.0
|
||||||
|
|
||||||
# Test with spancat.
|
# Test with spancat.
|
||||||
nlp, _ = init_nlp((("spancat", {}),))
|
nlp, _ = init_nlp((("spancat", {}),))
|
||||||
|
|
|
@ -9,6 +9,8 @@ import socialImageLegacy from '../images/social_legacy.jpg'
|
||||||
import siteMetadata from '../../meta/site.json'
|
import siteMetadata from '../../meta/site.json'
|
||||||
import Head from 'next/head'
|
import Head from 'next/head'
|
||||||
|
|
||||||
|
import { siteUrl } from '../../meta/dynamicMeta.mjs'
|
||||||
|
|
||||||
function getPageTitle(title, sitename, slogan, sectionTitle, nightly, legacy) {
|
function getPageTitle(title, sitename, slogan, sectionTitle, nightly, legacy) {
|
||||||
if (sectionTitle && title) {
|
if (sectionTitle && title) {
|
||||||
const suffix = nightly ? ' (nightly)' : legacy ? ' (legacy)' : ''
|
const suffix = nightly ? ' (nightly)' : legacy ? ' (legacy)' : ''
|
||||||
|
@ -25,7 +27,7 @@ function getImage(section, nightly, legacy) {
|
||||||
if (legacy) return socialImageLegacy
|
if (legacy) return socialImageLegacy
|
||||||
if (section === 'api') return socialImageApi
|
if (section === 'api') return socialImageApi
|
||||||
if (section === 'universe') return socialImageUniverse
|
if (section === 'universe') return socialImageUniverse
|
||||||
return socialImageDefault
|
return `${siteUrl}${socialImageDefault.src}`
|
||||||
}
|
}
|
||||||
|
|
||||||
export default function SEO({
|
export default function SEO({
|
||||||
|
@ -46,7 +48,7 @@ export default function SEO({
|
||||||
nightly,
|
nightly,
|
||||||
legacy
|
legacy
|
||||||
)
|
)
|
||||||
const socialImage = getImage(section, nightly, legacy).src
|
const socialImage = getImage(section, nightly, legacy)
|
||||||
const meta = [
|
const meta = [
|
||||||
{
|
{
|
||||||
name: 'description',
|
name: 'description',
|
||||||
|
|
Loading…
Reference in New Issue
Block a user