mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Merge branch 'develop' into feature/spacy-legacy
This commit is contained in:
commit
1090d3d675
|
@ -463,12 +463,14 @@ class Errors:
|
||||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
|
||||||
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
|
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
|
||||||
"If you're using a custom function, make sure the code is available. "
|
"If you're using a custom function, make sure the code is available. "
|
||||||
"If the function is provided by a third-party package, e.g. "
|
"If the function is provided by a third-party package, e.g. "
|
||||||
"spacy-transformers, make sure the package is installed in your "
|
"spacy-transformers, make sure the package is installed in your "
|
||||||
"environment.\n\nAvailable names: {available}")
|
"environment.\n\nAvailable names: {available}")
|
||||||
E894 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
|
E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
|
||||||
|
"'{lang}'.")
|
||||||
E895 = ("The 'textcat' component received gold-standard annotations with "
|
E895 = ("The 'textcat' component received gold-standard annotations with "
|
||||||
"multiple labels per document. In spaCy 3 you should use the "
|
"multiple labels per document. In spaCy 3 you should use the "
|
||||||
"'textcat_multilabel' component for this instead. "
|
"'textcat_multilabel' component for this instead. "
|
||||||
|
|
|
@ -86,7 +86,7 @@ def like_num(text):
|
||||||
if text in _num_words:
|
if text in _num_words:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# CHeck ordinal number
|
# Check ordinal number
|
||||||
if text in _ordinal_words:
|
if text in _ordinal_words:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -18,8 +18,6 @@ class MacedonianLemmatizer(Lemmatizer):
|
||||||
string = string[:-3]
|
string = string[:-3]
|
||||||
univ_pos = "verb"
|
univ_pos = "verb"
|
||||||
|
|
||||||
if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology):
|
|
||||||
return [string.lower()]
|
|
||||||
index_table = self.lookups.get_table("lemma_index", {})
|
index_table = self.lookups.get_table("lemma_index", {})
|
||||||
exc_table = self.lookups.get_table("lemma_exc", {})
|
exc_table = self.lookups.get_table("lemma_exc", {})
|
||||||
rules_table = self.lookups.get_table("lemma_rules", {})
|
rules_table = self.lookups.get_table("lemma_rules", {})
|
||||||
|
|
|
@ -697,6 +697,8 @@ class Language:
|
||||||
source_config = source.config.interpolate()
|
source_config = source.config.interpolate()
|
||||||
pipe_config = util.copy_config(source_config["components"][source_name])
|
pipe_config = util.copy_config(source_config["components"][source_name])
|
||||||
self._pipe_configs[name] = pipe_config
|
self._pipe_configs[name] = pipe_config
|
||||||
|
for s in source.vocab.strings:
|
||||||
|
self.vocab.strings.add(s)
|
||||||
return pipe, pipe_config["factory"]
|
return pipe, pipe_config["factory"]
|
||||||
|
|
||||||
def add_pipe(
|
def add_pipe(
|
||||||
|
@ -1619,9 +1621,7 @@ class Language:
|
||||||
if model not in source_nlps:
|
if model not in source_nlps:
|
||||||
# We only need the components here and we need to init
|
# We only need the components here and we need to init
|
||||||
# model with the same vocab as the current nlp object
|
# model with the same vocab as the current nlp object
|
||||||
source_nlps[model] = util.load_model(
|
source_nlps[model] = util.load_model(model, vocab=nlp.vocab)
|
||||||
model, vocab=nlp.vocab, disable=["vocab", "tokenizer"]
|
|
||||||
)
|
|
||||||
source_name = pipe_cfg.get("component", pipe_name)
|
source_name = pipe_cfg.get("component", pipe_name)
|
||||||
nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
|
nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
|
||||||
disabled_pipes = [*config["nlp"]["disabled"], *disable]
|
disabled_pipes = [*config["nlp"]["disabled"], *disable]
|
||||||
|
|
|
@ -197,13 +197,39 @@ cdef class ArcEagerGold:
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
heads, labels = example.get_aligned_parse(projectivize=True)
|
heads, labels = example.get_aligned_parse(projectivize=True)
|
||||||
labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
|
labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
|
||||||
sent_starts = example.get_aligned_sent_starts()
|
sent_starts = _get_aligned_sent_starts(example)
|
||||||
assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
|
assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
|
||||||
self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
|
self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
|
||||||
|
|
||||||
def update(self, StateClass stcls):
|
def update(self, StateClass stcls):
|
||||||
update_gold_state(&self.c, stcls.c)
|
update_gold_state(&self.c, stcls.c)
|
||||||
|
|
||||||
|
def _get_aligned_sent_starts(example):
|
||||||
|
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
||||||
|
If the reference has not sentence starts, return a list of None values.
|
||||||
|
|
||||||
|
This function is slightly different from the one on Example, because we also
|
||||||
|
check whether the reference sentences align across multiple sentences,
|
||||||
|
and return missing values if they do. This prevents a problem where you have
|
||||||
|
the start of a sentence merged onto a token that belongs to two sentences.
|
||||||
|
"""
|
||||||
|
if example.y.has_annotation("SENT_START"):
|
||||||
|
align = example.alignment.y2x
|
||||||
|
sent_starts = [False] * len(example.x)
|
||||||
|
seen_words = set()
|
||||||
|
for y_sent in example.y.sents:
|
||||||
|
x_indices = list(align[y_sent.start : y_sent.end].dataXd)
|
||||||
|
if any(x_idx in seen_words for x_idx in x_indices):
|
||||||
|
# If there are any tokens in X that align across two sentences,
|
||||||
|
# regard the sentence annotations as missing, as we can't
|
||||||
|
# reliably use them.
|
||||||
|
return [None] * len(example.x)
|
||||||
|
seen_words.update(x_indices)
|
||||||
|
sent_starts[x_indices[0]] = True
|
||||||
|
return sent_starts
|
||||||
|
else:
|
||||||
|
return [None] * len(example.x)
|
||||||
|
|
||||||
|
|
||||||
cdef int check_state_gold(char state_bits, char flag) nogil:
|
cdef int check_state_gold(char state_bits, char flag) nogil:
|
||||||
cdef char one = 1
|
cdef char one = 1
|
||||||
|
@ -820,7 +846,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
else:
|
else:
|
||||||
failed = False
|
failed = False
|
||||||
break
|
break
|
||||||
if failed:
|
if failed and _debug not in (False, None):
|
||||||
example = _debug
|
example = _debug
|
||||||
print("Actions")
|
print("Actions")
|
||||||
for i in range(self.n_moves):
|
for i in range(self.n_moves):
|
||||||
|
|
|
@ -1,7 +1,11 @@
|
||||||
|
import srsly
|
||||||
|
from thinc.api import Config
|
||||||
|
from typing import Dict, Any
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..matcher import Matcher
|
from ..matcher import Matcher
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..util import filter_spans
|
from ..util import filter_spans
|
||||||
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
@Language.component(
|
@Language.component(
|
||||||
|
@ -65,3 +69,77 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
|
||||||
for span in spans:
|
for span in spans:
|
||||||
retokenizer.merge(span)
|
retokenizer.merge(span)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
@Language.factory(
|
||||||
|
"token_splitter",
|
||||||
|
default_config={"min_length": 25, "split_length": 10},
|
||||||
|
retokenizes=True,
|
||||||
|
)
|
||||||
|
def make_token_splitter(
|
||||||
|
nlp: Language,
|
||||||
|
name: str,
|
||||||
|
*,
|
||||||
|
min_length=0,
|
||||||
|
split_length=0,
|
||||||
|
):
|
||||||
|
return TokenSplitter(
|
||||||
|
min_length=min_length, split_length=split_length
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TokenSplitter:
|
||||||
|
def __init__(self, min_length: int = 0, split_length: int = 0):
|
||||||
|
self.min_length = min_length
|
||||||
|
self.split_length = split_length
|
||||||
|
|
||||||
|
def __call__(self, doc: Doc) -> Doc:
|
||||||
|
if self.min_length > 0 and self.split_length > 0:
|
||||||
|
with doc.retokenize() as retokenizer:
|
||||||
|
for t in doc:
|
||||||
|
if len(t.text) >= self.min_length:
|
||||||
|
orths = []
|
||||||
|
heads = []
|
||||||
|
attrs = {}
|
||||||
|
for i in range(0, len(t.text), self.split_length):
|
||||||
|
orths.append(t.text[i : i + self.split_length])
|
||||||
|
heads.append((t, i / self.split_length))
|
||||||
|
retokenizer.split(t, orths, heads, attrs)
|
||||||
|
return doc
|
||||||
|
|
||||||
|
def _get_config(self) -> Dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"min_length": self.min_length,
|
||||||
|
"split_length": self.split_length,
|
||||||
|
}
|
||||||
|
|
||||||
|
def _set_config(self, config: Dict[str, Any] = {}) -> None:
|
||||||
|
self.min_length = config.get("min_length", 0)
|
||||||
|
self.split_length = config.get("split_length", 0)
|
||||||
|
|
||||||
|
def to_bytes(self, **kwargs):
|
||||||
|
serializers = {
|
||||||
|
"cfg": lambda: srsly.json_dumps(self._get_config()),
|
||||||
|
}
|
||||||
|
return util.to_bytes(serializers, [])
|
||||||
|
|
||||||
|
def from_bytes(self, data, **kwargs):
|
||||||
|
deserializers = {
|
||||||
|
"cfg": lambda b: self._set_config(srsly.json_loads(b)),
|
||||||
|
}
|
||||||
|
util.from_bytes(data, deserializers, [])
|
||||||
|
return self
|
||||||
|
|
||||||
|
def to_disk(self, path, **kwargs):
|
||||||
|
path = util.ensure_path(path)
|
||||||
|
serializers = {
|
||||||
|
"cfg": lambda p: srsly.write_json(p, self._get_config()),
|
||||||
|
}
|
||||||
|
return util.to_disk(path, serializers, [])
|
||||||
|
|
||||||
|
def from_disk(self, path, **kwargs):
|
||||||
|
path = util.ensure_path(path)
|
||||||
|
serializers = {
|
||||||
|
"cfg": lambda p: self._set_config(srsly.read_json(p)),
|
||||||
|
}
|
||||||
|
util.from_disk(path, serializers, [])
|
||||||
|
|
|
@ -145,6 +145,10 @@ class Morphologizer(Tagger):
|
||||||
for example in get_examples():
|
for example in get_examples():
|
||||||
for i, token in enumerate(example.reference):
|
for i, token in enumerate(example.reference):
|
||||||
pos = token.pos_
|
pos = token.pos_
|
||||||
|
# if both are unset, annotation is missing, so do not add
|
||||||
|
# an empty label
|
||||||
|
if pos == "" and not token.has_morph():
|
||||||
|
continue
|
||||||
morph = str(token.morph)
|
morph = str(token.morph)
|
||||||
# create and add the combined morph+POS label
|
# create and add the combined morph+POS label
|
||||||
morph_dict = Morphology.feats_to_dict(morph)
|
morph_dict = Morphology.feats_to_dict(morph)
|
||||||
|
@ -155,7 +159,7 @@ class Morphologizer(Tagger):
|
||||||
if norm_label not in self.cfg["labels_morph"]:
|
if norm_label not in self.cfg["labels_morph"]:
|
||||||
self.cfg["labels_morph"][norm_label] = morph
|
self.cfg["labels_morph"][norm_label] = morph
|
||||||
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
||||||
if len(self.labels) <= 1:
|
if len(self.labels) < 1:
|
||||||
raise ValueError(Errors.E143.format(name=self.name))
|
raise ValueError(Errors.E143.format(name=self.name))
|
||||||
doc_sample = []
|
doc_sample = []
|
||||||
label_sample = []
|
label_sample = []
|
||||||
|
@ -217,15 +221,24 @@ class Morphologizer(Tagger):
|
||||||
pos = pos_tags[i]
|
pos = pos_tags[i]
|
||||||
morph = morphs[i]
|
morph = morphs[i]
|
||||||
# POS may align (same value for multiple tokens) when morph
|
# POS may align (same value for multiple tokens) when morph
|
||||||
# doesn't, so if either is None, treat both as None here so that
|
# doesn't, so if either is misaligned (None), treat the
|
||||||
# truths doesn't end up with an unknown morph+POS combination
|
# annotation as missing so that truths doesn't end up with an
|
||||||
|
# unknown morph+POS combination
|
||||||
if pos is None or morph is None:
|
if pos is None or morph is None:
|
||||||
label = None
|
label = None
|
||||||
|
# If both are unset, the annotation is missing (empty morph
|
||||||
|
# converted from int is "_" rather than "")
|
||||||
|
elif pos == "" and morph == "":
|
||||||
|
label = None
|
||||||
|
# Otherwise, generate the combined label
|
||||||
else:
|
else:
|
||||||
label_dict = Morphology.feats_to_dict(morph)
|
label_dict = Morphology.feats_to_dict(morph)
|
||||||
if pos:
|
if pos:
|
||||||
label_dict[self.POS_FEAT] = pos
|
label_dict[self.POS_FEAT] = pos
|
||||||
label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
|
label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
|
||||||
|
# As a fail-safe, skip any unrecognized labels
|
||||||
|
if label not in self.labels:
|
||||||
|
label = None
|
||||||
eg_truths.append(label)
|
eg_truths.append(label)
|
||||||
truths.append(eg_truths)
|
truths.append(eg_truths)
|
||||||
d_scores, loss = loss_func(scores, truths)
|
d_scores, loss = loss_func(scores, truths)
|
||||||
|
|
|
@ -2,6 +2,8 @@ import pytest
|
||||||
import numpy
|
import numpy
|
||||||
import logging
|
import logging
|
||||||
import mock
|
import mock
|
||||||
|
|
||||||
|
from spacy.lang.xx import MultiLanguage
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.lexeme import Lexeme
|
from spacy.lexeme import Lexeme
|
||||||
|
@ -633,6 +635,14 @@ def test_doc_set_ents_invalid_spans(en_tokenizer):
|
||||||
doc.ents = spans
|
doc.ents = spans
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_noun_chunks_not_implemented():
|
||||||
|
"""Test that a language without noun_chunk iterator, throws a NotImplementedError"""
|
||||||
|
text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat."
|
||||||
|
nlp = MultiLanguage()
|
||||||
|
doc = nlp(text)
|
||||||
|
with pytest.raises(NotImplementedError):
|
||||||
|
chunks = list(doc.noun_chunks)
|
||||||
|
|
||||||
def test_span_groups(en_tokenizer):
|
def test_span_groups(en_tokenizer):
|
||||||
doc = en_tokenizer("Some text about Colombia and the Czech Republic")
|
doc = en_tokenizer("Some text about Colombia and the Czech Republic")
|
||||||
doc.spans["hi"] = [Span(doc, 3, 4, label="bye")]
|
doc.spans["hi"] = [Span(doc, 3, 4, label="bye")]
|
||||||
|
|
|
@ -1,11 +1,16 @@
|
||||||
import numpy
|
|
||||||
from spacy.attrs import HEAD, DEP
|
|
||||||
from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
|
|
||||||
from spacy.lang.en.syntax_iterators import noun_chunks
|
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def doc(en_vocab):
|
||||||
|
words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
|
||||||
|
heads = [1, 1, 6, 6, 3, 3, 1]
|
||||||
|
deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
|
||||||
|
pos = ["PROPN", "VERB", "ADJ", "NOUN", "CCONJ", "NOUN", "NOUN"]
|
||||||
|
return Doc(en_vocab, words=words, heads=heads, deps=deps, pos=pos)
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed(en_tokenizer):
|
def test_noun_chunks_is_parsed(en_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
|
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
|
||||||
doc = en_tokenizer("This is a sentence")
|
doc = en_tokenizer("This is a sentence")
|
||||||
|
@ -13,31 +18,27 @@ def test_noun_chunks_is_parsed(en_tokenizer):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
||||||
|
|
||||||
def test_en_noun_chunks_not_nested(en_vocab):
|
def test_en_noun_chunks_not_nested(doc, en_vocab):
|
||||||
words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
|
"""Test that each token only appears in one noun chunk at most"""
|
||||||
heads = [1, 1, 6, 6, 3, 3, 1]
|
|
||||||
deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
|
|
||||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
|
||||||
doc.from_array(
|
|
||||||
[HEAD, DEP],
|
|
||||||
numpy.asarray(
|
|
||||||
[
|
|
||||||
[1, nsubj],
|
|
||||||
[0, root],
|
|
||||||
[4, amod],
|
|
||||||
[3, nmod],
|
|
||||||
[-1, cc],
|
|
||||||
[-2, conj],
|
|
||||||
[-5, dobj],
|
|
||||||
],
|
|
||||||
dtype="uint64",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
doc.noun_chunks_iterator = noun_chunks
|
|
||||||
word_occurred = {}
|
word_occurred = {}
|
||||||
for chunk in doc.noun_chunks:
|
chunks = list(doc.noun_chunks)
|
||||||
|
assert len(chunks) > 1
|
||||||
|
for chunk in chunks:
|
||||||
for word in chunk:
|
for word in chunk:
|
||||||
word_occurred.setdefault(word.text, 0)
|
word_occurred.setdefault(word.text, 0)
|
||||||
word_occurred[word.text] += 1
|
word_occurred[word.text] += 1
|
||||||
|
assert len(word_occurred) > 0
|
||||||
for word, freq in word_occurred.items():
|
for word, freq in word_occurred.items():
|
||||||
assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
|
assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
|
||||||
|
|
||||||
|
|
||||||
|
def test_noun_chunks_span(doc, en_tokenizer):
|
||||||
|
"""Test that the span.noun_chunks property works correctly"""
|
||||||
|
doc_chunks = list(doc.noun_chunks)
|
||||||
|
span = doc[0:3]
|
||||||
|
span_chunks = list(span.noun_chunks)
|
||||||
|
assert 0 < len(span_chunks) < len(doc_chunks)
|
||||||
|
for chunk in span_chunks:
|
||||||
|
assert chunk in doc_chunks
|
||||||
|
assert chunk.start >= 0
|
||||||
|
assert chunk.end <= 3
|
||||||
|
|
|
@ -53,3 +53,24 @@ def test_factories_merge_ents(doc2):
|
||||||
assert len(doc2) == 6
|
assert len(doc2) == 6
|
||||||
assert len(list(doc2.ents)) == 1
|
assert len(list(doc2.ents)) == 1
|
||||||
assert doc2[2].text == "New York"
|
assert doc2[2].text == "New York"
|
||||||
|
|
||||||
|
|
||||||
|
def test_token_splitter():
|
||||||
|
nlp = Language()
|
||||||
|
config = {"min_length": 20, "split_length": 5}
|
||||||
|
token_splitter = nlp.add_pipe("token_splitter", config=config)
|
||||||
|
doc = nlp("aaaaabbbbbcccccdddd e f g")
|
||||||
|
assert [t.text for t in doc] == ["aaaaabbbbbcccccdddd", "e", "f", "g"]
|
||||||
|
doc = nlp("aaaaabbbbbcccccdddddeeeeeff g h i")
|
||||||
|
assert [t.text for t in doc] == [
|
||||||
|
"aaaaa",
|
||||||
|
"bbbbb",
|
||||||
|
"ccccc",
|
||||||
|
"ddddd",
|
||||||
|
"eeeee",
|
||||||
|
"ff",
|
||||||
|
"g",
|
||||||
|
"h",
|
||||||
|
"i",
|
||||||
|
]
|
||||||
|
assert all(len(t.text) <= token_splitter.split_length for t in doc)
|
||||||
|
|
|
@ -136,3 +136,28 @@ def test_overfitting_IO():
|
||||||
gold_pos_tags = ["", "", "", ""]
|
gold_pos_tags = ["", "", "", ""]
|
||||||
assert [str(t.morph) for t in doc] == gold_morphs
|
assert [str(t.morph) for t in doc] == gold_morphs
|
||||||
assert [t.pos_ for t in doc] == gold_pos_tags
|
assert [t.pos_ for t in doc] == gold_pos_tags
|
||||||
|
|
||||||
|
# Test with unset morph and partial POS
|
||||||
|
nlp.remove_pipe("morphologizer")
|
||||||
|
nlp.add_pipe("morphologizer")
|
||||||
|
for example in train_examples:
|
||||||
|
for token in example.reference:
|
||||||
|
if token.text == "ham":
|
||||||
|
token.pos_ = "NOUN"
|
||||||
|
else:
|
||||||
|
token.pos_ = ""
|
||||||
|
token.set_morph(None)
|
||||||
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
print(nlp.get_pipe("morphologizer").labels)
|
||||||
|
for i in range(50):
|
||||||
|
losses = {}
|
||||||
|
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||||
|
assert losses["morphologizer"] < 0.00001
|
||||||
|
|
||||||
|
# Test the trained model
|
||||||
|
test_text = "I like blue ham"
|
||||||
|
doc = nlp(test_text)
|
||||||
|
gold_morphs = ["", "", "", ""]
|
||||||
|
gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
|
||||||
|
assert [str(t.morph) for t in doc] == gold_morphs
|
||||||
|
assert [t.pos_ for t in doc] == gold_pos_tags
|
||||||
|
|
|
@ -81,7 +81,8 @@ def test_issue3199():
|
||||||
"""
|
"""
|
||||||
words = ["This", "is", "a", "sentence"]
|
words = ["This", "is", "a", "sentence"]
|
||||||
doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
|
doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
|
||||||
assert list(doc[0:3].noun_chunks) == []
|
with pytest.raises(NotImplementedError):
|
||||||
|
list(doc[0:3].noun_chunks)
|
||||||
|
|
||||||
|
|
||||||
def test_issue3209():
|
def test_issue3209():
|
||||||
|
|
|
@ -816,8 +816,10 @@ cdef class Doc:
|
||||||
@property
|
@property
|
||||||
def noun_chunks(self):
|
def noun_chunks(self):
|
||||||
"""Iterate over the base noun phrases in the document. Yields base
|
"""Iterate over the base noun phrases in the document. Yields base
|
||||||
noun-phrase #[code Span] objects, if the document has been
|
noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
|
||||||
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
|
Raises a NotImplementedError otherwise.
|
||||||
|
|
||||||
|
A base noun phrase, or "NP chunk", is a noun
|
||||||
phrase that does not permit other NPs to be nested within it – so no
|
phrase that does not permit other NPs to be nested within it – so no
|
||||||
NP-level coordination, no prepositional phrases, and no relative
|
NP-level coordination, no prepositional phrases, and no relative
|
||||||
clauses.
|
clauses.
|
||||||
|
@ -826,16 +828,17 @@ cdef class Doc:
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/doc#noun_chunks
|
DOCS: https://nightly.spacy.io/api/doc#noun_chunks
|
||||||
"""
|
"""
|
||||||
|
if self.noun_chunks_iterator is None:
|
||||||
|
raise NotImplementedError(Errors.E894.format(lang=self.vocab.lang))
|
||||||
|
|
||||||
# Accumulate the result before beginning to iterate over it. This
|
# Accumulate the result before beginning to iterate over it. This
|
||||||
# prevents the tokenisation from being changed out from under us
|
# prevents the tokenization from being changed out from under us
|
||||||
# during the iteration. The tricky thing here is that Span accepts
|
# during the iteration. The tricky thing here is that Span accepts
|
||||||
# its tokenisation changing, so it's okay once we have the Span
|
# its tokenization changing, so it's okay once we have the Span
|
||||||
# objects. See Issue #375.
|
# objects. See Issue #375.
|
||||||
spans = []
|
spans = []
|
||||||
if self.noun_chunks_iterator is not None:
|
for start, end, label in self.noun_chunks_iterator(self):
|
||||||
for start, end, label in self.noun_chunks_iterator(self):
|
spans.append(Span(self, start, end, label=label))
|
||||||
spans.append(Span(self, start, end, label=label))
|
|
||||||
for span in spans:
|
for span in spans:
|
||||||
yield span
|
yield span
|
||||||
|
|
||||||
|
|
|
@ -487,30 +487,25 @@ cdef class Span:
|
||||||
"""
|
"""
|
||||||
return "".join([t.text_with_ws for t in self])
|
return "".join([t.text_with_ws for t in self])
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def noun_chunks(self):
|
def noun_chunks(self):
|
||||||
"""Yields base noun-phrase `Span` objects, if the document has been
|
"""Iterate over the base noun phrases in the span. Yields base
|
||||||
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
|
noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
|
||||||
|
Raises a NotImplementedError otherwise.
|
||||||
|
|
||||||
|
A base noun phrase, or "NP chunk", is a noun
|
||||||
phrase that does not permit other NPs to be nested within it – so no
|
phrase that does not permit other NPs to be nested within it – so no
|
||||||
NP-level coordination, no prepositional phrases, and no relative
|
NP-level coordination, no prepositional phrases, and no relative
|
||||||
clauses.
|
clauses.
|
||||||
|
|
||||||
YIELDS (Span): Base noun-phrase `Span` objects.
|
YIELDS (Span): Noun chunks in the span.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/span#noun_chunks
|
DOCS: https://nightly.spacy.io/api/span#noun_chunks
|
||||||
"""
|
"""
|
||||||
# Accumulate the result before beginning to iterate over it. This
|
for span in self.doc.noun_chunks:
|
||||||
# prevents the tokenisation from being changed out from under us
|
if span.start >= self.start and span.end <= self.end:
|
||||||
# during the iteration. The tricky thing here is that Span accepts
|
yield span
|
||||||
# its tokenisation changing, so it's okay once we have the Span
|
|
||||||
# objects. See Issue #375
|
|
||||||
spans = []
|
|
||||||
cdef attr_t label
|
|
||||||
if self.doc.noun_chunks_iterator is not None:
|
|
||||||
for start, end, label in self.doc.noun_chunks_iterator(self):
|
|
||||||
spans.append(Span(self.doc, start, end, label=label))
|
|
||||||
for span in spans:
|
|
||||||
yield span
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def root(self):
|
def root(self):
|
||||||
|
|
|
@ -211,6 +211,14 @@ cdef class Token:
|
||||||
xp = get_array_module(vector)
|
xp = get_array_module(vector)
|
||||||
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
|
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
|
||||||
|
|
||||||
|
def has_morph(self):
|
||||||
|
"""Check whether the token has annotated morph information.
|
||||||
|
Return False when the morph annotation is unset/missing.
|
||||||
|
|
||||||
|
RETURNS (bool): Whether the morph annotation is set.
|
||||||
|
"""
|
||||||
|
return not self.c.morph == 0
|
||||||
|
|
||||||
property morph:
|
property morph:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return MorphAnalysis.from_id(self.vocab, self.c.morph)
|
return MorphAnalysis.from_id(self.vocab, self.c.morph)
|
||||||
|
|
|
@ -200,10 +200,6 @@ cdef class Example:
|
||||||
def get_aligned_sent_starts(self):
|
def get_aligned_sent_starts(self):
|
||||||
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
||||||
If the reference has not sentence starts, return a list of None values.
|
If the reference has not sentence starts, return a list of None values.
|
||||||
|
|
||||||
The aligned sentence starts use the get_aligned_spans method, rather
|
|
||||||
than aligning the list of tags, so that it handles cases where a mistaken
|
|
||||||
tokenization starts the sentence.
|
|
||||||
"""
|
"""
|
||||||
if self.y.has_annotation("SENT_START"):
|
if self.y.has_annotation("SENT_START"):
|
||||||
align = self.alignment.y2x
|
align = self.alignment.y2x
|
||||||
|
|
|
@ -616,11 +616,15 @@ phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
|
||||||
nested within it – so no NP-level coordination, no prepositional phrases, and no
|
nested within it – so no NP-level coordination, no prepositional phrases, and no
|
||||||
relative clauses.
|
relative clauses.
|
||||||
|
|
||||||
|
If the `noun_chunks` [syntax iterator](/usage/adding-languages#language-data) has
|
||||||
|
not been implemented for the given language, a `NotImplementedError` is raised.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> doc = nlp("A phrase with another phrase occurs.")
|
> doc = nlp("A phrase with another phrase occurs.")
|
||||||
> chunks = list(doc.noun_chunks)
|
> chunks = list(doc.noun_chunks)
|
||||||
|
> assert len(chunks) == 2
|
||||||
> assert chunks[0].text == "A phrase"
|
> assert chunks[0].text == "A phrase"
|
||||||
> assert chunks[1].text == "another phrase"
|
> assert chunks[1].text == "another phrase"
|
||||||
> ```
|
> ```
|
||||||
|
|
|
@ -6,6 +6,7 @@ menu:
|
||||||
- ['merge_noun_chunks', 'merge_noun_chunks']
|
- ['merge_noun_chunks', 'merge_noun_chunks']
|
||||||
- ['merge_entities', 'merge_entities']
|
- ['merge_entities', 'merge_entities']
|
||||||
- ['merge_subtokens', 'merge_subtokens']
|
- ['merge_subtokens', 'merge_subtokens']
|
||||||
|
- ['token_splitter', 'token_splitter']
|
||||||
---
|
---
|
||||||
|
|
||||||
## merge_noun_chunks {#merge_noun_chunks tag="function"}
|
## merge_noun_chunks {#merge_noun_chunks tag="function"}
|
||||||
|
@ -107,3 +108,25 @@ end of the pipeline and after all other components.
|
||||||
| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
|
| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
|
||||||
| `label` | The subtoken dependency label. Defaults to `"subtok"`. ~~str~~ |
|
| `label` | The subtoken dependency label. Defaults to `"subtok"`. ~~str~~ |
|
||||||
| **RETURNS** | The modified `Doc` with merged subtokens. ~~Doc~~ |
|
| **RETURNS** | The modified `Doc` with merged subtokens. ~~Doc~~ |
|
||||||
|
|
||||||
|
## token_splitter {#token_splitter tag="function" new="3.0"}
|
||||||
|
|
||||||
|
Split tokens longer than a minimum length into shorter tokens. Intended for use
|
||||||
|
with transformer pipelines where long spaCy tokens lead to input text that
|
||||||
|
exceeds the transformer model max length. See
|
||||||
|
[managing transformer model max length limitations](/usage/embeddings-transformers#transformer-max-length).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> config={"min_length": 20, "split_length": 5}
|
||||||
|
> nlp.add_pipe("token_splitter", config=config, first=True)
|
||||||
|
> doc = nlp("aaaaabbbbbcccccdddddee")
|
||||||
|
> print([token.text for token in doc])
|
||||||
|
> # ['aaaaa', 'bbbbb', 'ccccc', 'ddddd', 'ee']
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Setting | Description |
|
||||||
|
| -------------- | --------------------------------------------------------------------- |
|
||||||
|
| `min_length` | The minimum length for a token to be split. Defaults to `25`. ~~int~~ |
|
||||||
|
| `split_length` | The length of the split tokens. Defaults to `10`. ~~int~~ |
|
||||||
|
|
|
@ -187,7 +187,7 @@ the character indices don't map to a valid span.
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------------------ | ----------------------------------------------------------------------------------------- |
|
| ------------------------------------ | ----------------------------------------------------------------------------------------- |
|
||||||
| `start` | The index of the first character of the span. ~~int~~ |
|
| `start` | The index of the first character of the span. ~~int~~ |
|
||||||
| `end` | The index of the first character after the span. ~~int~~ |
|
| `end` | The index of the first character after the span. ~~int~~ |
|
||||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||||
| `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
| `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||||
|
@ -274,6 +274,31 @@ if the entity recognizer has been applied.
|
||||||
| ----------- | ----------------------------------------------------------------- |
|
| ----------- | ----------------------------------------------------------------- |
|
||||||
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
|
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
|
||||||
|
|
||||||
|
## Span.noun_chunks {#noun_chunks tag="property" model="parser"}
|
||||||
|
|
||||||
|
Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
|
||||||
|
objects, if the document has been syntactically parsed. A base noun phrase, or
|
||||||
|
"NP chunk", is a noun phrase that does not permit other NPs to be nested within
|
||||||
|
it – so no NP-level coordination, no prepositional phrases, and no relative
|
||||||
|
clauses.
|
||||||
|
|
||||||
|
If the `noun_chunks` [syntax iterator](/usage/adding-languages#language-data) has
|
||||||
|
not been implemented for the given language, a `NotImplementedError` is raised.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("A phrase with another phrase occurs.")
|
||||||
|
> span = doc[3:5]
|
||||||
|
> chunks = list(span.noun_chunks)
|
||||||
|
> assert len(chunks) == 1
|
||||||
|
> assert chunks[0].text == "another phrase"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------- | --------------------------------- |
|
||||||
|
| **YIELDS** | Noun chunks in the span. ~~Span~~ |
|
||||||
|
|
||||||
## Span.as_doc {#as_doc tag="method"}
|
## Span.as_doc {#as_doc tag="method"}
|
||||||
|
|
||||||
Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
|
Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
|
||||||
|
|
|
@ -191,6 +191,15 @@ the morph to an unset state.
|
||||||
| -------- | --------------------------------------------------------------------------------- |
|
| -------- | --------------------------------------------------------------------------------- |
|
||||||
| features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ |
|
| features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ |
|
||||||
|
|
||||||
|
## Token.has_morph {#has_morph tag="method"}
|
||||||
|
|
||||||
|
Check whether the token has annotated morph information. Return `False` when the
|
||||||
|
morph annotation is unset/missing.
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | --------------------------------------------- |
|
||||||
|
| **RETURNS** | Whether the morph annotation is set. ~~bool~~ |
|
||||||
|
|
||||||
## Token.is_ancestor {#is_ancestor tag="method" model="parser"}
|
## Token.is_ancestor {#is_ancestor tag="method" model="parser"}
|
||||||
|
|
||||||
Check whether this token is a parent, grandparent, etc. of another in the
|
Check whether this token is a parent, grandparent, etc. of another in the
|
||||||
|
|
|
@ -481,6 +481,50 @@ custom learning rate for each component. Instead of a constant, you can also
|
||||||
provide a schedule, allowing you to freeze the shared parameters at the start of
|
provide a schedule, allowing you to freeze the shared parameters at the start of
|
||||||
training.
|
training.
|
||||||
|
|
||||||
|
### Managing transformer model max length limitations {#transformer-max-length}
|
||||||
|
|
||||||
|
Many transformer models have a limit on the maximum number of tokens that the
|
||||||
|
model can process, for example BERT models are limited to 512 tokens. This limit
|
||||||
|
refers to the number of transformer tokens (BPE, WordPiece, etc.), not the
|
||||||
|
number of spaCy tokens.
|
||||||
|
|
||||||
|
To be able to process longer texts, the spaCy [`transformer`](/api/transformer)
|
||||||
|
component uses [`span_getters`](/api/transformer#span_getters) to convert a
|
||||||
|
batch of [`Doc`](/api/doc) objects into lists of [`Span`](/api/span) objects. A
|
||||||
|
span may correspond to a doc (for `doc_spans`), a sentence (for `sent_spans`) or
|
||||||
|
a window of spaCy tokens (`strided_spans`). If a single span corresponds to more
|
||||||
|
transformer tokens than the transformer model supports, the spaCy pipeline can't
|
||||||
|
process the text because some spaCy tokens would be left without an analysis.
|
||||||
|
|
||||||
|
In general, it is up to the transformer pipeline user to manage the input texts
|
||||||
|
so that the model max length is not exceeded. If you're training a **new
|
||||||
|
pipeline**, you have a number of options to handle the max length limit:
|
||||||
|
|
||||||
|
- Use `doc_spans` with short texts only
|
||||||
|
- Use `sent_spans` with short sentences only
|
||||||
|
- For `strided_spans`, lower the `window` size to be short enough for your input
|
||||||
|
texts (and don't forget to lower the `stride` correspondingly)
|
||||||
|
- Implement a [custom span getter](#transformers-training-custom-settings)
|
||||||
|
|
||||||
|
You may still run into the max length limit if a single spaCy token is very
|
||||||
|
long, like a long URL or a noisy string, or if you're using a **pretrained
|
||||||
|
pipeline** like `en_core_web_trf` with a fixed `window` size for
|
||||||
|
`strided_spans`. In this case, you need to modify either your texts or your
|
||||||
|
pipeline so that you have shorter spaCy tokens. Some options:
|
||||||
|
|
||||||
|
- Preprocess your texts to clean up noise and split long tokens with whitespace
|
||||||
|
- Add a `token_splitter` to the beginning of your pipeline to break up
|
||||||
|
tokens that are longer than a specified length:
|
||||||
|
|
||||||
|
```python
|
||||||
|
config={"min_length": 20, "split_length": 5}
|
||||||
|
nlp.add_pipe("token_splitter", config=config, first=True)
|
||||||
|
```
|
||||||
|
|
||||||
|
In this example, tokens that are at least 20 characters long will be split up
|
||||||
|
into smaller tokens of 5 characters each, resulting in strided spans that
|
||||||
|
correspond to fewer transformer tokens.
|
||||||
|
|
||||||
## Static vectors {#static-vectors}
|
## Static vectors {#static-vectors}
|
||||||
|
|
||||||
If your pipeline includes a **word vectors table**, you'll be able to use the
|
If your pipeline includes a **word vectors table**, you'll be able to use the
|
||||||
|
|
|
@ -221,7 +221,7 @@ Noun chunks are "base noun phrases" – flat phrases that have a noun as their
|
||||||
head. You can think of noun chunks as a noun plus the words describing the noun
|
head. You can think of noun chunks as a noun plus the words describing the noun
|
||||||
– for example, "the lavish green grass" or "the world’s largest tech fund". To
|
– for example, "the lavish green grass" or "the world’s largest tech fund". To
|
||||||
get the noun chunks in a document, simply iterate over
|
get the noun chunks in a document, simply iterate over
|
||||||
[`Doc.noun_chunks`](/api/doc#noun_chunks)
|
[`Doc.noun_chunks`](/api/doc#noun_chunks).
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"resources": [
|
"resources": [
|
||||||
{
|
{
|
||||||
"id": "spacy-textblob",
|
"id": "spacy-textblob",
|
||||||
"title": "spaCyTextBlob",
|
"title": "spaCyTextBlob",
|
||||||
"slogan": "Easy sentiment analysis for spaCy using TextBlob",
|
"slogan": "Easy sentiment analysis for spaCy using TextBlob",
|
||||||
|
@ -30,7 +30,7 @@
|
||||||
},
|
},
|
||||||
"category": ["pipeline"],
|
"category": ["pipeline"],
|
||||||
"tags": ["sentiment", "textblob"]
|
"tags": ["sentiment", "textblob"]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": "spacy-ray",
|
"id": "spacy-ray",
|
||||||
"title": "spacy-ray",
|
"title": "spacy-ray",
|
||||||
|
@ -2139,7 +2139,7 @@
|
||||||
"from negspacy.negation import Negex",
|
"from negspacy.negation import Negex",
|
||||||
"",
|
"",
|
||||||
"nlp = spacy.load(\"en_core_web_sm\")",
|
"nlp = spacy.load(\"en_core_web_sm\")",
|
||||||
"negex = Negex(nlp, ent_types=[\"PERSON\",\"ORG\"])",
|
"negex = Negex(nlp, ent_types=[\"PERSON\",\"ORG\"])",
|
||||||
"nlp.add_pipe(negex, last=True)",
|
"nlp.add_pipe(negex, last=True)",
|
||||||
"",
|
"",
|
||||||
"doc = nlp(\"She does not like Steve Jobs but likes Apple products.\")",
|
"doc = nlp(\"She does not like Steve Jobs but likes Apple products.\")",
|
||||||
|
@ -2619,14 +2619,14 @@
|
||||||
"github": "medspacy"
|
"github": "medspacy"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": "rita-dsl",
|
"id": "rita-dsl",
|
||||||
"title": "RITA DSL",
|
"title": "RITA DSL",
|
||||||
"slogan": "Domain Specific Language for creating language rules",
|
"slogan": "Domain Specific Language for creating language rules",
|
||||||
"github": "zaibacu/rita-dsl",
|
"github": "zaibacu/rita-dsl",
|
||||||
"description": "A Domain Specific Language (DSL) for building language patterns. These can be later compiled into spaCy patterns, pure regex, or any other format",
|
"description": "A Domain Specific Language (DSL) for building language patterns. These can be later compiled into spaCy patterns, pure regex, or any other format",
|
||||||
"pip": "rita-dsl",
|
"pip": "rita-dsl",
|
||||||
"thumb": "https://raw.githubusercontent.com/zaibacu/rita-dsl/master/docs/assets/logo-100px.png",
|
"thumb": "https://raw.githubusercontent.com/zaibacu/rita-dsl/master/docs/assets/logo-100px.png",
|
||||||
"code_language": "python",
|
"code_language": "python",
|
||||||
"code_example": [
|
"code_example": [
|
||||||
"import spacy",
|
"import spacy",
|
||||||
|
|
57204
website/package-lock.json
generated
57204
website/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
|
@ -3,7 +3,7 @@
|
||||||
"private": true,
|
"private": true,
|
||||||
"description": "spaCy website",
|
"description": "spaCy website",
|
||||||
"version": "3.0.0",
|
"version": "3.0.0",
|
||||||
"author": "Explosion AI <contact@explosion.ai>",
|
"author": "Explosion <contact@explosion.ai>",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@jupyterlab/outputarea": "^0.19.1",
|
"@jupyterlab/outputarea": "^0.19.1",
|
||||||
|
@ -16,7 +16,7 @@
|
||||||
"autoprefixer": "^9.4.7",
|
"autoprefixer": "^9.4.7",
|
||||||
"classnames": "^2.2.6",
|
"classnames": "^2.2.6",
|
||||||
"codemirror": "^5.43.0",
|
"codemirror": "^5.43.0",
|
||||||
"gatsby": "^2.1.18",
|
"gatsby": "^2.11.1",
|
||||||
"gatsby-image": "^2.0.29",
|
"gatsby-image": "^2.0.29",
|
||||||
"gatsby-mdx": "^0.3.6",
|
"gatsby-mdx": "^0.3.6",
|
||||||
"gatsby-plugin-catch-links": "^2.0.11",
|
"gatsby-plugin-catch-links": "^2.0.11",
|
||||||
|
@ -24,12 +24,14 @@
|
||||||
"gatsby-plugin-offline": "^2.0.24",
|
"gatsby-plugin-offline": "^2.0.24",
|
||||||
"gatsby-plugin-plausible": "0.0.6",
|
"gatsby-plugin-plausible": "0.0.6",
|
||||||
"gatsby-plugin-react-helmet": "^3.0.6",
|
"gatsby-plugin-react-helmet": "^3.0.6",
|
||||||
"gatsby-plugin-react-svg": "^2.1.2",
|
"gatsby-plugin-react-svg": "^2.0.0",
|
||||||
|
"gatsby-plugin-robots-txt": "^1.5.1",
|
||||||
"gatsby-plugin-sass": "^2.0.10",
|
"gatsby-plugin-sass": "^2.0.10",
|
||||||
"gatsby-plugin-sharp": "^2.0.20",
|
"gatsby-plugin-sharp": "^2.0.20",
|
||||||
"gatsby-plugin-sitemap": "^2.0.5",
|
"gatsby-plugin-sitemap": "^2.0.5",
|
||||||
"gatsby-plugin-svgr": "^2.0.1",
|
"gatsby-plugin-svgr": "^2.0.1",
|
||||||
"gatsby-remark-copy-linked-files": "^2.0.9",
|
"gatsby-remark-copy-linked-files": "^2.0.9",
|
||||||
|
"gatsby-remark-find-replace": "^0.3.0",
|
||||||
"gatsby-remark-images": "^3.0.4",
|
"gatsby-remark-images": "^3.0.4",
|
||||||
"gatsby-remark-prismjs": "^3.2.4",
|
"gatsby-remark-prismjs": "^3.2.4",
|
||||||
"gatsby-remark-smartypants": "^2.0.8",
|
"gatsby-remark-smartypants": "^2.0.8",
|
||||||
|
@ -39,9 +41,11 @@
|
||||||
"gatsby-transformer-sharp": "^2.1.13",
|
"gatsby-transformer-sharp": "^2.1.13",
|
||||||
"html-to-react": "^1.3.4",
|
"html-to-react": "^1.3.4",
|
||||||
"intersection-observer": "^0.5.1",
|
"intersection-observer": "^0.5.1",
|
||||||
|
"jinja-to-js": "^3.2.3",
|
||||||
"node-sass": "^4.11.0",
|
"node-sass": "^4.11.0",
|
||||||
"parse-numeric-range": "0.0.2",
|
"parse-numeric-range": "0.0.2",
|
||||||
"prismjs": "^1.15.0",
|
"prismjs": "^1.15.0",
|
||||||
|
"prismjs-bibtex": "^1.1.0",
|
||||||
"prop-types": "^15.7.2",
|
"prop-types": "^15.7.2",
|
||||||
"react": "^16.8.2",
|
"react": "^16.8.2",
|
||||||
"react-dom": "^16.8.2",
|
"react-dom": "^16.8.2",
|
||||||
|
@ -50,19 +54,22 @@
|
||||||
"remark-react": "^5.0.1"
|
"remark-react": "^5.0.1"
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"build": "gatsby build",
|
"build": "npm run python:install && npm run python:setup && gatsby build",
|
||||||
"dev": "gatsby develop",
|
"dev": "npm run python:setup && gatsby develop",
|
||||||
|
"dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
|
||||||
"lint": "eslint **",
|
"lint": "eslint **",
|
||||||
"clear": "rm -rf .cache",
|
"clear": "rm -rf .cache",
|
||||||
"test": "echo \"Write tests! -> https://gatsby.app/unit-testing\""
|
"test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"",
|
||||||
|
"python:install": "pip install -r setup/requirements.txt",
|
||||||
|
"python:setup": "cd setup && sh setup.sh"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
"@sindresorhus/slugify": "^0.8.0",
|
||||||
"browser-monads": "^1.0.0",
|
"browser-monads": "^1.0.0",
|
||||||
"md-attr-parser": "^1.2.1",
|
"md-attr-parser": "^1.2.1",
|
||||||
"prettier": "^1.16.4",
|
"prettier": "^1.16.4",
|
||||||
"raw-loader": "^1.0.0",
|
"raw-loader": "^1.0.0",
|
||||||
"unist-util-visit": "^1.4.0",
|
"unist-util-visit": "^1.4.0"
|
||||||
"@sindresorhus/slugify": "^0.8.0"
|
|
||||||
},
|
},
|
||||||
"repository": {
|
"repository": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
|
|
|
@ -6,7 +6,7 @@ import classNames from 'classnames'
|
||||||
import Link from './link'
|
import Link from './link'
|
||||||
import Grid from './grid'
|
import Grid from './grid'
|
||||||
import Newsletter from './newsletter'
|
import Newsletter from './newsletter'
|
||||||
import ExplosionLogo from '-!svg-react-loader!../images/explosion.svg'
|
import { ReactComponent as ExplosionLogo } from '../images/explosion.svg'
|
||||||
import classes from '../styles/footer.module.sass'
|
import classes from '../styles/footer.module.sass'
|
||||||
|
|
||||||
export default function Footer({ wide = false }) {
|
export default function Footer({ wide = false }) {
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
import React from 'react'
|
import React, { Fragment } from 'react'
|
||||||
import PropTypes from 'prop-types'
|
import PropTypes from 'prop-types'
|
||||||
import classNames from 'classnames'
|
import classNames from 'classnames'
|
||||||
|
|
||||||
|
@ -19,7 +19,13 @@ import NoIcon from '-!svg-react-loader!../images/icons/no.svg'
|
||||||
import NeutralIcon from '-!svg-react-loader!../images/icons/neutral.svg'
|
import NeutralIcon from '-!svg-react-loader!../images/icons/neutral.svg'
|
||||||
import OfflineIcon from '-!svg-react-loader!../images/icons/offline.svg'
|
import OfflineIcon from '-!svg-react-loader!../images/icons/offline.svg'
|
||||||
import SearchIcon from '-!svg-react-loader!../images/icons/search.svg'
|
import SearchIcon from '-!svg-react-loader!../images/icons/search.svg'
|
||||||
|
import MoonIcon from '-!svg-react-loader!../images/icons/moon.svg'
|
||||||
|
import ClipboardIcon from '-!svg-react-loader!../images/icons/clipboard.svg'
|
||||||
|
import NetworkIcon from '-!svg-react-loader!../images/icons/network.svg'
|
||||||
|
import DownloadIcon from '-!svg-react-loader!../images/icons/download.svg'
|
||||||
|
import PackageIcon from '-!svg-react-loader!../images/icons/package.svg'
|
||||||
|
|
||||||
|
import { isString } from './util'
|
||||||
import classes from '../styles/icon.module.sass'
|
import classes from '../styles/icon.module.sass'
|
||||||
|
|
||||||
const icons = {
|
const icons = {
|
||||||
|
@ -41,9 +47,22 @@ const icons = {
|
||||||
neutral: NeutralIcon,
|
neutral: NeutralIcon,
|
||||||
offline: OfflineIcon,
|
offline: OfflineIcon,
|
||||||
search: SearchIcon,
|
search: SearchIcon,
|
||||||
|
moon: MoonIcon,
|
||||||
|
clipboard: ClipboardIcon,
|
||||||
|
network: NetworkIcon,
|
||||||
|
download: DownloadIcon,
|
||||||
|
package: PackageIcon,
|
||||||
}
|
}
|
||||||
|
|
||||||
const Icon = ({ name, width, height, inline, variant, className }) => {
|
export default function Icon({
|
||||||
|
name,
|
||||||
|
width = 20,
|
||||||
|
height,
|
||||||
|
inline = false,
|
||||||
|
variant,
|
||||||
|
className,
|
||||||
|
...props
|
||||||
|
}) {
|
||||||
const IconComponent = icons[name]
|
const IconComponent = icons[name]
|
||||||
const iconClassNames = classNames(classes.root, className, {
|
const iconClassNames = classNames(classes.root, className, {
|
||||||
[classes.inline]: inline,
|
[classes.inline]: inline,
|
||||||
|
@ -57,15 +76,11 @@ const Icon = ({ name, width, height, inline, variant, className }) => {
|
||||||
aria-hidden="true"
|
aria-hidden="true"
|
||||||
width={width}
|
width={width}
|
||||||
height={height || width}
|
height={height || width}
|
||||||
|
{...props}
|
||||||
/>
|
/>
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
Icon.defaultProps = {
|
|
||||||
width: 20,
|
|
||||||
inline: false,
|
|
||||||
}
|
|
||||||
|
|
||||||
Icon.propTypes = {
|
Icon.propTypes = {
|
||||||
name: PropTypes.oneOf(Object.keys(icons)),
|
name: PropTypes.oneOf(Object.keys(icons)),
|
||||||
width: PropTypes.number,
|
width: PropTypes.number,
|
||||||
|
@ -75,4 +90,43 @@ Icon.propTypes = {
|
||||||
className: PropTypes.string,
|
className: PropTypes.string,
|
||||||
}
|
}
|
||||||
|
|
||||||
export default Icon
|
export function replaceEmoji(cellChildren) {
|
||||||
|
const icons = {
|
||||||
|
'✅': { name: 'yes', variant: 'success', 'aria-label': 'positive' },
|
||||||
|
'❌': { name: 'no', variant: 'error', 'aria-label': 'negative' },
|
||||||
|
}
|
||||||
|
const iconRe = new RegExp(`^(${Object.keys(icons).join('|')})`, 'g')
|
||||||
|
let children = isString(cellChildren) ? [cellChildren] : cellChildren
|
||||||
|
let hasIcon = false
|
||||||
|
if (Array.isArray(children)) {
|
||||||
|
children = children.map((child, i) => {
|
||||||
|
if (isString(child)) {
|
||||||
|
const icon = icons[child.trim()]
|
||||||
|
if (icon) {
|
||||||
|
hasIcon = true
|
||||||
|
return (
|
||||||
|
<Icon
|
||||||
|
{...icon}
|
||||||
|
inline={i < children.length}
|
||||||
|
aria-hidden={undefined}
|
||||||
|
key={i}
|
||||||
|
/>
|
||||||
|
)
|
||||||
|
} else if (iconRe.test(child)) {
|
||||||
|
hasIcon = true
|
||||||
|
const [, iconName, text] = child.split(iconRe)
|
||||||
|
return (
|
||||||
|
<Fragment key={i}>
|
||||||
|
<Icon {...icons[iconName]} aria-hidden={undefined} inline={true} />
|
||||||
|
{text.replace(/^\s+/g, '')}
|
||||||
|
</Fragment>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
// Work around prettier auto-escape
|
||||||
|
if (child.startsWith('\\')) return child.slice(1)
|
||||||
|
}
|
||||||
|
return child
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return { content: children, hasIcon }
|
||||||
|
}
|
||||||
|
|
|
@ -6,7 +6,7 @@ import Link from './link'
|
||||||
import Icon from './icon'
|
import Icon from './icon'
|
||||||
import Dropdown from './dropdown'
|
import Dropdown from './dropdown'
|
||||||
import { github } from './util'
|
import { github } from './util'
|
||||||
import Logo from '-!svg-react-loader!../images/logo.svg'
|
import { ReactComponent as Logo } from '../images/logo.svg'
|
||||||
import classes from '../styles/navigation.module.sass'
|
import classes from '../styles/navigation.module.sass'
|
||||||
|
|
||||||
const NavigationDropdown = ({ items = [], section }) => {
|
const NavigationDropdown = ({ items = [], section }) => {
|
||||||
|
|
|
@ -1,31 +0,0 @@
|
||||||
import AirbnbLogo from '-!svg-react-loader!./airbnb.svg'
|
|
||||||
import UberLogo from '-!svg-react-loader!./uber.svg'
|
|
||||||
import QuoraLogo from '-!svg-react-loader!./quora.svg'
|
|
||||||
import RetrieverLogo from '-!svg-react-loader!./retriever.svg'
|
|
||||||
import StitchfixLogo from '-!svg-react-loader!./stitchfix.svg'
|
|
||||||
import ChartbeatLogo from '-!svg-react-loader!./chartbeat.svg'
|
|
||||||
import AllenAILogo from '-!svg-react-loader!./allenai.svg'
|
|
||||||
|
|
||||||
import RecodeLogo from '-!svg-react-loader!./recode.svg'
|
|
||||||
import WapoLogo from '-!svg-react-loader!./wapo.svg'
|
|
||||||
import BBCLogo from '-!svg-react-loader!./bbc.svg'
|
|
||||||
import MicrosoftLogo from '-!svg-react-loader!./microsoft.svg'
|
|
||||||
import VenturebeatLogo from '-!svg-react-loader!./venturebeat.svg'
|
|
||||||
import ThoughtworksLogo from '-!svg-react-loader!./thoughtworks.svg'
|
|
||||||
|
|
||||||
export default {
|
|
||||||
airbnb: AirbnbLogo,
|
|
||||||
uber: UberLogo,
|
|
||||||
quora: QuoraLogo,
|
|
||||||
retriever: RetrieverLogo,
|
|
||||||
stitchfix: StitchfixLogo,
|
|
||||||
chartbeat: ChartbeatLogo,
|
|
||||||
allenai: AllenAILogo,
|
|
||||||
|
|
||||||
recode: RecodeLogo,
|
|
||||||
wapo: WapoLogo,
|
|
||||||
bbc: BBCLogo,
|
|
||||||
microsoft: MicrosoftLogo,
|
|
||||||
venturebeat: VenturebeatLogo,
|
|
||||||
thoughtworks: ThoughtworksLogo,
|
|
||||||
}
|
|
|
@ -4,7 +4,7 @@ import Grid from '../components/grid'
|
||||||
import { Label } from '../components/typography'
|
import { Label } from '../components/typography'
|
||||||
import Link from '../components/link'
|
import Link from '../components/link'
|
||||||
|
|
||||||
import Logo from '-!svg-react-loader!../images/logo.svg'
|
import { ReactComponent as Logo } from '../images/logo.svg'
|
||||||
import patternBlue from '../images/pattern_blue.jpg'
|
import patternBlue from '../images/pattern_blue.jpg'
|
||||||
import patternGreen from '../images/pattern_green.jpg'
|
import patternGreen from '../images/pattern_green.jpg'
|
||||||
import patternPurple from '../images/pattern_purple.jpg'
|
import patternPurple from '../images/pattern_purple.jpg'
|
||||||
|
|
Loading…
Reference in New Issue
Block a user