From 176a90edeec38ced8c5b1e2f7fd1d28bf1e9e1c1 Mon Sep 17 00:00:00 2001 From: jsnfly <37632631+jsnfly@users.noreply.github.com> Date: Thu, 13 Jan 2022 09:03:23 +0100 Subject: [PATCH 001/177] Fix texcat loss scaling (#9904) (#10002) * add failing test for issue 9904 * remove division by batch size and summation before applying the mean Co-authored-by: jonas --- spacy/pipeline/textcat.py | 4 ++-- spacy/tests/pipeline/test_textcat.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index e20ae87f1..dd5fdc078 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -320,9 +320,9 @@ class TextCategorizer(TrainablePipe): self._validate_categories(examples) truths, not_missing = self._examples_to_truth(examples) not_missing = self.model.ops.asarray(not_missing) # type: ignore - d_scores = (scores - truths) / scores.shape[0] + d_scores = (scores - truths) d_scores *= not_missing - mean_square_error = (d_scores ** 2).sum(axis=1).mean() + mean_square_error = (d_scores ** 2).mean() return float(mean_square_error), d_scores def add_label(self, label: str) -> int: diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 52bf6ec5c..798dd165e 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -277,6 +277,21 @@ def test_issue7019(): print_prf_per_type(msg, scores, name="foo", type="bar") +@pytest.mark.issue(9904) +def test_issue9904(): + nlp = Language() + textcat = nlp.add_pipe("textcat") + get_examples = make_get_examples_single_label(nlp) + nlp.initialize(get_examples) + + examples = get_examples() + scores = textcat.predict([eg.predicted for eg in examples]) + + loss = textcat.get_loss(examples, scores)[0] + loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0] + assert loss == pytest.approx(loss_double_bs) + + @pytest.mark.skip(reason="Test is flakey when run with 
others") def test_simple_train(): nlp = Language() From 28299644fc14ed7693a26bf03e2ec0cbef9c28e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 13 Jan 2022 09:03:55 +0100 Subject: [PATCH 002/177] Speed up the StateC::L feature function (#10019) * Speed up the StateC::L feature function This function gets the n-th most-recent left-arc with a particular head. Before this change, StateC::L would construct a vector of all left-arcs with the given head and then pick the n-th most recent from that vector. Since the number of left-arcs strongly correlates with the doc length and the feature is constructed for every transition, this can make transition-parsing quadratic. With this change StateC::L: - Searches left-arcs backwards. - Stops early when the n-th matching transition is found. - Does not construct a vector (reducing memory pressure). This change doesn't avoid the linear search when the transition that is queried does not occur in the left-arcs. Regardless, performance is improved quite a bit with very long docs: Before: N Time 400 3.3 800 5.4 1600 11.6 3200 30.7 After: N Time 400 3.2 800 5.0 1600 9.5 3200 23.2 We can probably do better with more tailored data structures, but I first wanted to make a low-impact PR. Found while investigating #9858. 
* StateC::L: simplify loop --- spacy/pipeline/_parser_internals/_state.pxd | 23 +++++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 161f3ca48..27623e7c6 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -1,3 +1,4 @@ +from cython.operator cimport dereference as deref, preincrement as incr from libc.string cimport memcpy, memset from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t @@ -184,16 +185,20 @@ cdef cppclass StateC: int L(int head, int idx) nogil const: if idx < 1 or this._left_arcs.size() == 0: return -1 - cdef vector[int] lefts - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) + + # Work backwards through left-arcs to find the arc at the + # requested index more quickly. + cdef size_t child_index = 0 + it = this._left_arcs.const_rbegin() + while it != this._left_arcs.rend(): + arc = deref(it) if arc.head == head and arc.child != -1 and arc.child < head: - lefts.push_back(arc.child) - idx = (lefts.size()) - idx - if idx < 0: - return -1 - else: - return lefts.at(idx) + child_index += 1 + if child_index == idx: + return arc.child + incr(it) + + return -1 int R(int head, int idx) nogil const: if idx < 1 or this._right_arcs.size() == 0: From 677c1a35072ff2deb3af6638802f506d623ed8f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 13 Jan 2022 09:03:55 +0100 Subject: [PATCH 003/177] Speed up the StateC::L feature function (#10019) * Speed up the StateC::L feature function This function gets the n-th most-recent left-arc with a particular head. Before this change, StateC::L would construct a vector of all left-arcs with the given head and then pick the n-th most recent from that vector. 
Since the number of left-arcs strongly correlates with the doc length and the feature is constructed for every transition, this can make transition-parsing quadratic. With this change StateC::L: - Searches left-arcs backwards. - Stops early when the n-th matching transition is found. - Does not construct a vector (reducing memory pressure). This change doesn't avoid the linear search when the transition that is queried does not occur in the left-arcs. Regardless, performance is improved quite a bit with very long docs: Before: N Time 400 3.3 800 5.4 1600 11.6 3200 30.7 After: N Time 400 3.2 800 5.0 1600 9.5 3200 23.2 We can probably do better with more tailored data structures, but I first wanted to make a low-impact PR. Found while investigating #9858. * StateC::L: simplify loop --- spacy/pipeline/_parser_internals/_state.pxd | 23 +++++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 161f3ca48..27623e7c6 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -1,3 +1,4 @@ +from cython.operator cimport dereference as deref, preincrement as incr from libc.string cimport memcpy, memset from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t @@ -184,16 +185,20 @@ cdef cppclass StateC: int L(int head, int idx) nogil const: if idx < 1 or this._left_arcs.size() == 0: return -1 - cdef vector[int] lefts - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) + + # Work backwards through left-arcs to find the arc at the + # requested index more quickly. 
+ cdef size_t child_index = 0 + it = this._left_arcs.const_rbegin() + while it != this._left_arcs.rend(): + arc = deref(it) if arc.head == head and arc.child != -1 and arc.child < head: - lefts.push_back(arc.child) - idx = (lefts.size()) - idx - if idx < 0: - return -1 - else: - return lefts.at(idx) + child_index += 1 + if child_index == idx: + return arc.child + incr(it) + + return -1 int R(int head, int idx) nogil const: if idx < 1 or this._right_arcs.size() == 0: From 63fa55089dff3b5a5208c24914cd0faa5909108a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Thu, 13 Jan 2022 10:33:30 +0100 Subject: [PATCH 004/177] Use constant-time head lookups in StateC::{L,R} This change changes the type of left/right-arc collections from vector[ArcC] to unordered_map[int, vector[Arc]], so that the arcs are keyed by the head. This allows us to find all the left/right arcs for a particular head in constant time in StateC::{L,R}. Benchmarks with long docs (N is the number of text repetitions): Before (using #10019): N Time (s) 400 3.2 800 5.0 1600 9.5 3200 23.2 6400 66.8 12800 220.0 After (this commit): N Time (s) 400 3.1 800 4.3 1600 6.7 3200 12.0 6400 22.0 12800 42.0 Related to #9858 and #10019. 
--- spacy/pipeline/_parser_internals/_state.pxd | 120 ++++++++++++-------- 1 file changed, 70 insertions(+), 50 deletions(-) diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 27623e7c6..a1262bb61 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -3,6 +3,7 @@ from libc.string cimport memcpy, memset from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t cimport libcpp +from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from libcpp.set cimport set from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno @@ -30,8 +31,8 @@ cdef cppclass StateC: vector[int] _stack vector[int] _rebuffer vector[SpanC] _ents - vector[ArcC] _left_arcs - vector[ArcC] _right_arcs + unordered_map[int, vector[ArcC]] _left_arcs + unordered_map[int, vector[ArcC]] _right_arcs vector[libcpp.bool] _unshiftable set[int] _sent_starts TokenC _empty_token @@ -160,15 +161,22 @@ cdef cppclass StateC: else: return &this._sent[i] - void get_arcs(vector[ArcC]* arcs) nogil const: - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) - if arc.head != -1 and arc.child != -1: - arcs.push_back(arc) - for i in range(this._right_arcs.size()): - arc = this._right_arcs.at(i) - if arc.head != -1 and arc.child != -1: - arcs.push_back(arc) + void map_get_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, vector[ArcC]* out) nogil const: + cdef const vector[ArcC]* arcs + head_arcs_it = heads_arcs.const_begin() + while head_arcs_it != heads_arcs.const_end(): + arcs = &deref(head_arcs_it).second + arcs_it = arcs.const_begin() + while arcs_it != arcs.const_end(): + arc = deref(arcs_it) + if arc.head != -1 and arc.child != -1: + out.push_back(arc) + incr(arcs_it) + incr(head_arcs_it) + + void get_arcs(vector[ArcC]* out) nogil const: + this.map_get_arcs(this._left_arcs, out) + this.map_get_arcs(this._right_arcs, out) int 
H(int child) nogil const: if child >= this.length or child < 0: @@ -182,37 +190,35 @@ cdef cppclass StateC: else: return this._ents.back().start - int L(int head, int idx) nogil const: - if idx < 1 or this._left_arcs.size() == 0: + int nth_child(const unordered_map[int, vector[ArcC]]& heads_arcs, int head, int idx) nogil const: + if idx < 1: return -1 - # Work backwards through left-arcs to find the arc at the + head_arcs_it = heads_arcs.const_find(head) + if head_arcs_it == heads_arcs.const_end(): + return -1 + + cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second + + # Work backwards through arcs to find the arc at the # requested index more quickly. cdef size_t child_index = 0 - it = this._left_arcs.const_rbegin() - while it != this._left_arcs.rend(): - arc = deref(it) - if arc.head == head and arc.child != -1 and arc.child < head: + arcs_it = arcs.const_rbegin() + while arcs_it != arcs.const_rend() and child_index != idx: + arc = deref(arcs_it) + if arc.child != -1: child_index += 1 if child_index == idx: return arc.child - incr(it) + incr(arcs_it) return -1 + int L(int head, int idx) nogil const: + return this.nth_child(this._left_arcs, head, idx) + int R(int head, int idx) nogil const: - if idx < 1 or this._right_arcs.size() == 0: - return -1 - cdef vector[int] rights - for i in range(this._right_arcs.size()): - arc = this._right_arcs.at(i) - if arc.head == head and arc.child != -1 and arc.child > head: - rights.push_back(arc.child) - idx = (rights.size()) - idx - if idx < 0: - return -1 - else: - return rights.at(idx) + return this.nth_child(this._right_arcs, head, idx) bint empty() nogil const: return this._stack.size() == 0 @@ -253,22 +259,29 @@ cdef cppclass StateC: int r_edge(int word) nogil const: return word - - int n_L(int head) nogil const: + + int n_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, int head) nogil const: cdef int n = 0 - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) - if arc.head == head and 
arc.child != -1 and arc.child < arc.head: + head_arcs_it = heads_arcs.const_find(head) + if head_arcs_it == heads_arcs.const_end(): + return n + + cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second + arcs_it = arcs.const_begin() + while arcs_it != arcs.end(): + arc = deref(arcs_it) + if arc.child != -1: n += 1 + incr(arcs_it) + return n + + int n_L(int head) nogil const: + return n_arcs(this._left_arcs, head) + int n_R(int head) nogil const: - cdef int n = 0 - for i in range(this._right_arcs.size()): - arc = this._right_arcs.at(i) - if arc.head == head and arc.child != -1 and arc.child > arc.head: - n += 1 - return n + return n_arcs(this._right_arcs, head) bint stack_is_connected() nogil const: return False @@ -328,19 +341,20 @@ cdef cppclass StateC: arc.child = child arc.label = label if head > child: - this._left_arcs.push_back(arc) + this._left_arcs[arc.head].push_back(arc) else: - this._right_arcs.push_back(arc) + this._right_arcs[arc.head].push_back(arc) this._heads[child] = head - void del_arc(int h_i, int c_i) nogil: - cdef vector[ArcC]* arcs - if h_i > c_i: - arcs = &this._left_arcs - else: - arcs = &this._right_arcs + void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil: + arcs_it = heads_arcs.find(h_i) + if arcs_it == heads_arcs.end(): + return + + arcs = &deref(arcs_it).second if arcs.size() == 0: return + arc = arcs.back() if arc.head == h_i and arc.child == c_i: arcs.pop_back() @@ -353,6 +367,12 @@ cdef cppclass StateC: arc.label = 0 break + void del_arc(int h_i, int c_i) nogil: + if h_i > c_i: + this.map_del_arc(&this._left_arcs, h_i, c_i) + else: + this.map_del_arc(&this._right_arcs, h_i, c_i) + SpanC get_ent() nogil const: cdef SpanC ent if this._ents.size() == 0: From a784b12eff48df9281b184cb7005e66bbd2e3aca Mon Sep 17 00:00:00 2001 From: ColleterVi <36503688+ColleterVi@users.noreply.github.com> Date: Thu, 13 Jan 2022 12:25:06 +0100 Subject: [PATCH 005/177] fix: new restcountries url (#10043) Url extension 
"eu" and path "rest" are no longer available. Replacing them for a working url. --- website/docs/usage/processing-pipelines.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 0264a2825..11fd1459d 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1479,7 +1479,7 @@ especially useful it you want to pass in a string instead of calling ### Example: Pipeline component for GPE entities and country meta data via a REST API {#component-example3} This example shows the implementation of a pipeline component that fetches -country meta data via the [REST Countries API](https://restcountries.eu), sets +country meta data via the [REST Countries API](https://restcountries.com), sets entity annotations for countries and sets custom attributes on the `Doc` and `Span` – for example, the capital, latitude/longitude coordinates and even the country flag. 
@@ -1495,7 +1495,7 @@ from spacy.tokens import Doc, Span, Token @Language.factory("rest_countries") class RESTCountriesComponent: def __init__(self, nlp, name, label="GPE"): - r = requests.get("https://restcountries.eu/rest/v2/all") + r = requests.get("https://restcountries.com/v2/all") r.raise_for_status() # make sure requests raises an error if it fails countries = r.json() # Convert API response to dict keyed by country name for easy lookup From 58bdd8607bb917f3437fdf5993dec5b6e58930c8 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 17 Jan 2022 16:16:22 +0900 Subject: [PATCH 006/177] Bump sudachipy version (#9917) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Edited Slovenian stop words list (#9707) * Noun chunks for Italian (#9662) * added it vocab * copied portuguese * added possessive determiner * added conjed Nps * added nmoded Nps * test misc * more examples * fixed typo * fixed parenth * fixed comma * comma fix * added syntax iters * fix some index problems * fixed index * corrected heads for test case * fixed tets case * fixed determiner gender * cleaned left over * added example with apostophe * French NP review (#9667) * adapted from pt * added basic tests * added fr vocab * fixed noun chunks * more examples * typo fix * changed naming * changed the naming * typo fix * Add Japanese kana characters to default exceptions (fix #9693) (#9742) This includes the main kana, or phonetic characters, used in Japanese. There are some supplemental kana blocks in Unicode outside the BMP that could also be included, but because their actual use is rare I omitted them for now, but maybe they should be added. The omitted blocks are: - Kana Supplement - Kana Extended (A and B) - Small Kana Extension * Remove NER words from stop words in Norwegian (#9820) Default stop words in Norwegian bokmål (nb) in Spacy contain important entities, e.g. 
France, Germany, Russia, Sweden and USA, police district, important units of time, e.g. months and days of the week, and organisations. Nobody expects their presence among the default stop words. There is a danger of users complying with the general recommendation of filtering out stop words, while being unaware of filtering out important entities from their data. See explanation in https://github.com/explosion/spaCy/issues/3052#issuecomment-986756711 and comment https://github.com/explosion/spaCy/issues/3052#issuecomment-986951831 * Bump sudachipy version * Update sudachipy versions * Bump versions Bumping to the most recent dictionary just to keep thing current. Bumping sudachipy to 5.2 because older versions don't support recent dictionaries. Co-authored-by: Sofie Van Landeghem Co-authored-by: Richard Hudson Co-authored-by: Duygu Altinok Co-authored-by: Haakon Meland Eriksen --- setup.cfg | 4 +- spacy/lang/char_classes.py | 5 + spacy/lang/fr/syntax_iterators.py | 72 ++++++-- spacy/lang/it/__init__.py | 4 +- spacy/lang/it/syntax_iterators.py | 86 +++++++++ spacy/lang/nb/stop_words.py | 30 ++-- spacy/lang/sl/stop_words.py | 130 +------------- spacy/tests/conftest.py | 10 ++ spacy/tests/lang/fr/test_noun_chunks.py | 224 +++++++++++++++++++++++- spacy/tests/lang/it/test_noun_chunks.py | 221 +++++++++++++++++++++++ 10 files changed, 624 insertions(+), 162 deletions(-) create mode 100644 spacy/lang/it/syntax_iterators.py create mode 100644 spacy/tests/lang/it/test_noun_chunks.py diff --git a/setup.cfg b/setup.cfg index 50e982cbf..586a044ff 100644 --- a/setup.cfg +++ b/setup.cfg @@ -108,8 +108,8 @@ apple = thinc-apple-ops>=0.0.4,<1.0.0 # Language tokenizers with external dependencies ja = - sudachipy>=0.4.9 - sudachidict_core>=20200330 + sudachipy>=0.5.2,!=0.6.1 + sudachidict_core>=20211220 ko = natto-py==0.9.0 th = diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 9e5441a4f..b15bb3cf3 100644 --- a/spacy/lang/char_classes.py +++ 
b/spacy/lang/char_classes.py @@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF" _hangul_jamo = r"\u1100-\u11FF" _hangul = _hangul_syllables + _hangul_jamo +_hiragana = r"\u3040-\u309F" +_katakana = r"\u30A0-\u30FFー" +_kana = _hiragana + _katakana + # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh _latin_u_extendedA = ( r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" @@ -244,6 +248,7 @@ _uncased = ( + _tamil + _telugu + _hangul + + _kana + _cjk ) diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index d86662693..5f7ba5c10 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -6,16 +6,35 @@ from ...tokens import Doc, Span def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: - """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" - # fmt: off - labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] - # fmt: on + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "obl:arg", + "obl:mod", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"] doc = doclike.doc # Ensure works on both Doc and Span. 
if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) - np_deps = [doc.vocab.strings[label] for label in labels] - conj = doc.vocab.strings.add("conj") + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_label = doc.vocab.strings.add("det") + det_pos = doc.vocab.strings.add("DET") + adp_pos = doc.vocab.strings.add("ADP") + conj_label = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): @@ -24,16 +43,45 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i - yield word.left_edge.i, word.right_edge.i + 1, np_label - elif word.dep == conj: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep == det_label and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_pos else left_index + ) + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj_label: head = word.head - while head.dep == conj and head.head.i < head.i: + while head.dep == conj_label and head.head.i < head.i: head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i - yield 
word.left_edge.i, word.right_edge.i + 1, np_label + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 1edebc837..ecf322bd7 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ...language import Language, BaseDefaults from .lemmatizer import ItalianLemmatizer +from .syntax_iterators import SYNTAX_ITERATORS class ItalianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS - stop_words = STOP_WORDS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS class Italian(Language): diff --git a/spacy/lang/it/syntax_iterators.py b/spacy/lang/it/syntax_iterators.py new file mode 100644 index 000000000..f63df3fad --- /dev/null +++ b/spacy/lang/it/syntax_iterators.py @@ -0,0 +1,86 @@ +from typing import Union, Iterator, Tuple + +from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors +from ...tokens import Doc, Span + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "fixed", "compound"] + dets = ["det", "det:poss"] + doc = doclike.doc # Ensure works on both Doc and Span. 
+ if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} + np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_labels = {doc.vocab.strings.add(det) for det in dets} + det_pos = doc.vocab.strings.add("DET") + adp_label = doc.vocab.strings.add("ADP") + conj = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue + if word.dep in np_deps: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep in det_labels and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_label else left_index + ) + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj: + head = word.head + while head.dep == conj and head.head.i < head.i: + head = head.head + # If the head is an NP, and we're coordinated to it, we're an NP + if head.dep in np_deps: + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/nb/stop_words.py 
b/spacy/lang/nb/stop_words.py index fd65dd788..d9ed414ef 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av bak bare bedre beste blant ble bli blir blitt bris by både -da dag de del dem den denne der dermed det dette disse drept du +da dag de del dem den denne der dermed det dette disse du eller en enn er et ett etter -fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag +fem fikk fire fjor flere folk for fortsatt fra fram funnet få får fått før først første gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går -ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan -hvorfor +ha hadde ham han hans har hele helt henne hennes her hun i ifølge igjen ikke ingen inn ja jeg kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld -kvinner -la laget land landet langt leder ligger like litt løpet lørdag +la laget land landet langt leder ligger like litt løpet -man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer -millioner minutter mot msci mye må mål måtte +man mange med meg mellom men mener mennesker mens mer mot mye må mål måtte -ned neste noe noen nok norge norsk norske ntb ny nye nå når +ned neste noe noen nok ny nye nå når -og også om onsdag opp opplyser oslo oss over +og også om opp opplyser oss over -personer plass poeng politidistrikt politiet president prosent på +personer plass poeng på -regjeringen runde rundt russland +runde rundt -sa saken samme sammen samtidig satt se seg seks selv senere september ser sett +sa saken samme sammen samtidig satt se seg seks selv senere ser sett siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor -store står sverige svært så søndag +store står svært så -ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror -tyskland +ta tatt tid tidligere til tilbake tillegg tok tror -under usa ut uten utenfor +under 
ut uten utenfor vant var ved veldig vi videre viktig vil ville viser vår være vært diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index 6fb01a183..c9004ed5d 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,13 +1,10 @@ # Source: https://github.com/stopwords-iso/stopwords-sl -# TODO: probably needs to be tidied up – the list seems to have month names in -# it, which shouldn't be considered stop words. +# Removed various words that are not normally considered stop words, such as months. STOP_WORDS = set( """ a ali -april -avgust b bi bil @@ -19,7 +16,6 @@ biti blizu bo bodo -bojo bolj bom bomo @@ -37,16 +33,6 @@ da daleč dan danes -datum -december -deset -deseta -deseti -deseto -devet -deveta -deveti -deveto do dober dobra @@ -54,16 +40,7 @@ dobri dobro dokler dol -dolg -dolga -dolgi dovolj -drug -druga -drugi -drugo -dva -dve e eden en @@ -74,7 +51,6 @@ enkrat eno etc. f -februar g g. ga @@ -93,16 +69,12 @@ iv ix iz j -januar jaz je ji jih jim jo -julij -junij -jutri k kadarkoli kaj @@ -123,41 +95,23 @@ kje kjer kjerkoli ko -koder koderkoli koga komu kot -kratek -kratka -kratke -kratki l -lahka -lahke -lahki -lahko le lep lepa lepe lepi lepo -leto m -maj -majhen -majhna -majhni -malce -malo manj -marec me med medtem mene -mesec mi midva midve @@ -183,7 +137,6 @@ najmanj naju največ nam -narobe nas nato nazaj @@ -192,7 +145,6 @@ naša naše ne nedavno -nedelja nek neka nekaj @@ -236,7 +188,6 @@ njuna njuno no nocoj -november npr. o ob @@ -244,51 +195,23 @@ oba obe oboje od -odprt -odprta -odprti okoli -oktober on onadva one oni onidve -osem -osma -osmi -osmo oz. p pa -pet -peta -petek -peti -peto po pod pogosto poleg -poln -polna -polni -polno ponavadi -ponedeljek ponovno potem povsod -pozdravljen -pozdravljeni -prav -prava -prave -pravi -pravo -prazen -prazna -prazno prbl. precej pred @@ -297,19 +220,10 @@ preko pri pribl. 
približno -primer -pripravljen -pripravljena -pripravljeni proti -prva -prvi -prvo r -ravno redko res -reč s saj sam @@ -321,29 +235,17 @@ se sebe sebi sedaj -sedem -sedma -sedmi -sedmo sem -september seveda si sicer skoraj skozi -slab smo so -sobota spet -sreda -srednja -srednji sta ste -stran -stvar sva t ta @@ -358,10 +260,6 @@ te tebe tebi tega -težak -težka -težki -težko ti tista tiste @@ -371,11 +269,6 @@ tj. tja to toda -torek -tretja -tretje -tretji -tri tu tudi tukaj @@ -392,10 +285,6 @@ vaša vaše ve vedno -velik -velika -veliki -veliko vendar ves več @@ -403,10 +292,6 @@ vi vidva vii viii -visok -visoka -visoke -visoki vsa vsaj vsak @@ -420,34 +305,21 @@ vsega vsi vso včasih -včeraj x z za zadaj zadnji zakaj -zaprta -zaprti -zaprto zdaj zelo zunaj č če često -četrta -četrtek -četrti -četrto čez čigav š -šest -šesta -šesti -šesto -štiri ž že """.split() diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index ffca79bb9..ee90a9f38 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -155,6 +155,11 @@ def fr_tokenizer(): return get_lang_class("fr")().tokenizer +@pytest.fixture(scope="session") +def fr_vocab(): + return get_lang_class("fr")().vocab + + @pytest.fixture(scope="session") def ga_tokenizer(): return get_lang_class("ga")().tokenizer @@ -205,6 +210,11 @@ def it_tokenizer(): return get_lang_class("it")().tokenizer +@pytest.fixture(scope="session") +def it_vocab(): + return get_lang_class("it")().vocab + + @pytest.fixture(scope="session") def ja_tokenizer(): pytest.importorskip("sudachipy") diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 48ac88ead..25b95f566 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -1,8 +1,230 @@ +from spacy.tokens import Doc import pytest +# fmt: off +@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # un nom -> un nom + ( + ["un", "nom"], + [1, 1], 
+ ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # determiner + noun starting with vowel + # l'heure -> l'heure + ( + ["l'", "heure"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # determiner + plural noun + # les romans -> les romans + ( + ["les", "romans"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # det + adj + noun + # Le vieux Londres -> Le vieux Londres + ( + ['Les', 'vieux', 'Londres'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # det + noun + adj + # le nom propre -> le nom propre a proper noun + ( + ["le", "nom", "propre"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # det + noun + adj plural + # Les chiens bruns -> les chiens bruns + ( + ["Les", "chiens", "bruns"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # multiple adjectives: one adj before the noun, one adj after the noun + # un nouveau film intéressant -> un nouveau film intéressant + ( + ["un", "nouveau", "film", "intéressant"], + [2, 2, 2, 2], + ["det", "amod", "ROOT", "amod"], + ["DET", "ADJ", "NOUN", "ADJ"], + [(0,4)] + ), + # multiple adjectives, both adjs after the noun + # une personne intelligente et drôle -> une personne intelligente et drôle + ( + ["une", "personne", "intelligente", "et", "drôle"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # relative pronoun + # un bus qui va au ville -> un bus, qui, ville + ( + ['un', 'bus', 'qui', 'va', 'au', 'ville'], + [1, 1, 3, 1, 5, 3], + ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'], + ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'], + [(0,2), (2,3), (5,6)] + ), + # relative subclause + # Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy. 
+ ( + ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'], + [0, 2, 0, 5, 5, 2, 5], + ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'], + ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'], + [(1,3), (4,5)] + ), + # Person name and title by flat + # Louis XIV -> Louis XIV + ( + ["Louis", "XIV"], + [0, 0], + ["ROOT", "flat:name"], + ["PROPN", "PROPN"], + [(0,2)] + ), + # Organization name by flat + # Nations Unies -> Nations Unies + ( + ["Nations", "Unies"], + [0, 0], + ["ROOT", "flat:name"], + ["PROPN", "PROPN"], + [(0,2)] + ), + # Noun compound, person name created by two flats + # Louise de Bratagne -> Louise de Bratagne + ( + ["Louise", "de", "Bratagne"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound, person name created by two flats + # Louis François Joseph -> Louis François Joseph + ( + ["Louis", "François", "Joseph"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # quelques agriculteurs très riches -> quelques agriculteurs très riches + ( + ["quelques", "agriculteurs", "très", "riches"], + [1, 1, 3, 1], + ['det', 'ROOT', 'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Il a un chien et un chat -> Il, un chien, un chat + ( + ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'], + [1, 1, 3, 1, 6, 6, 3], + ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(0,1), (2,4), (5,7)] + + ), + # Two NPs together + # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado + ( + ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'appos', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # nmod relation between NPs + # la destruction de la ville -> la 
destruction, la ville + ( + ['la', 'destruction', 'de', 'la', 'ville'], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'case', 'det', 'nmod'], + ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'], + [(0,2), (3,5)] + ), + # nmod relation between NPs + # Archiduchesse d’Autriche -> Archiduchesse, Autriche + ( + ['Archiduchesse', 'd’', 'Autriche'], + [0, 2, 0], + ['ROOT', 'case', 'nmod'], + ['NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3)] + ), + # Compounding by nmod, several NPs chained together + # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement + ( + ["la", "première", "usine", "de", "drogue", "du", "gouvernement"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Traduction du rapport de Susana -> Traduction, rapport, Susana + ( + ['Traduction', 'du', 'raport', 'de', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie + ( + ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'], + [2, 2, 2, 4, 2, 7, 7, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'], + [(0,3), (4,5), (6,8)] + ), + # Passive subject + # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton + ( + ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'], + [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'], + [(0, 3), (6, 10), (11, 12)] + ) + ], +) 
+# fmt: on +def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + def test_noun_chunks_is_parsed_fr(fr_tokenizer): """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.""" - doc = fr_tokenizer("trouver des travaux antérieurs") + doc = fr_tokenizer("Je suis allé à l'école") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/it/test_noun_chunks.py b/spacy/tests/lang/it/test_noun_chunks.py new file mode 100644 index 000000000..0a8c10e79 --- /dev/null +++ b/spacy/tests/lang/it/test_noun_chunks.py @@ -0,0 +1,221 @@ +from spacy.tokens import Doc +import pytest + + +# fmt: off +@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # un pollo -> un pollo + ( + ["un", "pollo"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0,2)], + ), + # two determiners + noun + # il mio cane -> il mio cane + ( + ["il", "mio", "cane"], + [2, 2, 2], + ["det", "det:poss", "ROOT"], + ["DET", "DET", "NOUN"], + [(0,3)], + ), + # two determiners, one is after noun. rare usage but still testing + # il cane mio-> il cane mio + ( + ["il", "cane", "mio"], + [1, 1, 1], + ["det", "ROOT", "det:poss"], + ["DET", "NOUN", "DET"], + [(0,3)], + ), + # relative pronoun + # È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty. 
+ ( + ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"], + [2, 2, 2, 4, 2, 7, 7, 4], + ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'], + ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'], + [(3,5), (5,6)] + ), + # relative subclause + # il computer che hai comprato -> il computer, che the computer that you bought + ( + ['il', 'computer', 'che', 'hai', 'comprato'], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'], + ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'], + [(0,2), (2,3)] + ), + # det + noun + adj + # Una macchina grande -> Una macchina grande + ( + ["Una", "macchina", "grande"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0,3)], + ), + # noun + adj plural + # mucche bianche + ( + ["mucche", "bianche"], + [0, 0], + ["ROOT", "amod"], + ["NOUN", "ADJ"], + [(0,2)], + ), + # det + adj + noun + # Una grande macchina -> Una grande macchina + ( + ['Una', 'grande', 'macchina'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # det + adj + noun, det with apostrophe + # un'importante associazione -> un'importante associazione + ( + ["Un'", 'importante', 'associazione'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # multiple adjectives + # Un cane piccolo e marrone -> Un cane piccolo e marrone + ( + ["Un", "cane", "piccolo", "e", "marrone"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # determiner, adjective, compound created by flat + # le Nazioni Unite -> le Nazioni Unite + ( + ["le", "Nazioni", "Unite"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers + ( + ['alcuni', 'contadini', 'molto', 'ricchi'], + [1, 1, 3, 1], + ['det', 'ROOT', 
'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Ho un cane e un gatto -> un cane, un gatto + ( + ['Ho', 'un', 'cane', 'e', 'un', 'gatto'], + [0, 2, 0, 5, 5, 0], + ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(1,3), (4,6)] + + ), + # Two NPs together + # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado + ( + ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'nmod', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # Noun compound, person name and titles + # Dom Pedro II -> Dom Pedro II + ( + ["Dom", "Pedro", "II"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound created by flat + # gli Stati Uniti + ( + ["gli", "Stati", "Uniti"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # nmod relation between NPs + # la distruzione della città -> la distruzione, città + ( + ['la', 'distruzione', 'della', 'città'], + [1, 1, 3, 1], + ['det', 'ROOT', 'case', 'nmod'], + ['DET', 'NOUN', 'ADP', 'NOUN'], + [(0,2), (3,4)] + ), + # Compounding by nmod, several NPs chained together + # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo + ( + ["la", "prima", "fabbrica", "di", "droga", "del", "governo"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana + ( + ['Traduzione', 'del', 'rapporto', 'di', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua 
amica + ( + ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'], + [1, 1, 1, 4, 1, 8, 8, 8, 1], + ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'], + ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'], + [(0,3), (4,5), (6,9)] + ), + # Passive subject + # La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton + ( + ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'], + [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0, 3), (6, 8), (9, 10), (11,12)] + ), + # Misc + # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti + ( + ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'], + [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17], + ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'], + ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(2,4), (9,12), (13,14), (17,18), (19,20)] + ) + ], +) +# fmt: on +def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + +def test_noun_chunks_is_parsed_it(it_tokenizer): + 
"""Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed.""" + doc = it_tokenizer("Sei andato a Oxford") + with pytest.raises(ValueError): + list(doc.noun_chunks) From 6a8619dd736f03e0fa8eec173a9277a3adbc46f9 Mon Sep 17 00:00:00 2001 From: Tuomo Hiippala Date: Mon, 17 Jan 2022 09:28:51 +0200 Subject: [PATCH 007/177] Update the entry for Applied Language Technology in spaCy Universe (#10068) * add entry for Applied Language Technology under "Courses" Added the following entry into `universe.json`: ``` { "type": "education", "id": "applt-course", "title": "Applied Language Technology", "slogan": "NLP for newcomers using spaCy and Stanza", "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. The learning materials assume no previous knowledge of the Python programming language.", "url": "https://applied-language-technology.readthedocs.io/", "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg", "thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png", "author": "Tuomo Hiippala", "author_links": { "twitter": "tuomo_h", "github": "thiippal", "website": "https://www.mv.helsinki.fi/home/thiippal/" }, "category": ["courses"] }, ``` * Update the entry for "Applied Language Technology" --- website/meta/universe.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 384a7e070..0fde2d612 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1770,9 +1770,9 @@ "title": "Applied Language Technology", "slogan": "NLP for newcomers using spaCy and Stanza", "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. 
The learning materials assume no previous knowledge of the Python programming language.", - "url": "https://applied-language-technology.readthedocs.io/", + "url": "https://applied-language-technology.mooc.fi", "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg", - "thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png", + "thumb": "https://www.mv.helsinki.fi/home/thiippal/images/applt-logo.png", "author": "Tuomo Hiippala", "author_links": { "twitter": "tuomo_h", From add52935ff273c9c8f37ae244803aebe02c12193 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 17 Jan 2022 10:38:37 +0100 Subject: [PATCH 008/177] Revert "Bump sudachipy version (#9917)" (#10071) This reverts commit 58bdd8607bb917f3437fdf5993dec5b6e58930c8. --- setup.cfg | 4 +- spacy/lang/char_classes.py | 5 - spacy/lang/fr/syntax_iterators.py | 72 ++------ spacy/lang/it/__init__.py | 4 +- spacy/lang/it/syntax_iterators.py | 86 --------- spacy/lang/nb/stop_words.py | 30 ++-- spacy/lang/sl/stop_words.py | 130 +++++++++++++- spacy/tests/conftest.py | 10 -- spacy/tests/lang/fr/test_noun_chunks.py | 224 +----------------------- spacy/tests/lang/it/test_noun_chunks.py | 221 ----------------------- 10 files changed, 162 insertions(+), 624 deletions(-) delete mode 100644 spacy/lang/it/syntax_iterators.py delete mode 100644 spacy/tests/lang/it/test_noun_chunks.py diff --git a/setup.cfg b/setup.cfg index 586a044ff..50e982cbf 100644 --- a/setup.cfg +++ b/setup.cfg @@ -108,8 +108,8 @@ apple = thinc-apple-ops>=0.0.4,<1.0.0 # Language tokenizers with external dependencies ja = - sudachipy>=0.5.2,!=0.6.1 - sudachidict_core>=20211220 + sudachipy>=0.4.9 + sudachidict_core>=20200330 ko = natto-py==0.9.0 th = diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index b15bb3cf3..9e5441a4f 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -45,10 +45,6 @@ _hangul_syllables = r"\uAC00-\uD7AF" _hangul_jamo = r"\u1100-\u11FF" 
_hangul = _hangul_syllables + _hangul_jamo -_hiragana = r"\u3040-\u309F" -_katakana = r"\u30A0-\u30FFー" -_kana = _hiragana + _katakana - # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh _latin_u_extendedA = ( r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" @@ -248,7 +244,6 @@ _uncased = ( + _tamil + _telugu + _hangul - + _kana + _cjk ) diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 5f7ba5c10..d86662693 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -6,35 +6,16 @@ from ...tokens import Doc, Span def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: - """ - Detect base noun phrases from a dependency parse. Works on both Doc and Span. - """ - labels = [ - "nsubj", - "nsubj:pass", - "obj", - "obl", - "obl:agent", - "obl:arg", - "obl:mod", - "nmod", - "pcomp", - "appos", - "ROOT", - ] - post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"] + """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" + # fmt: off + labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] + # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. 
if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) - np_deps = {doc.vocab.strings.add(label) for label in labels} - np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} + np_deps = [doc.vocab.strings[label] for label in labels] + conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - adj_label = doc.vocab.strings.add("amod") - det_label = doc.vocab.strings.add("det") - det_pos = doc.vocab.strings.add("DET") - adp_pos = doc.vocab.strings.add("ADP") - conj_label = doc.vocab.strings.add("conj") - conj_pos = doc.vocab.strings.add("CCONJ") prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): @@ -43,45 +24,16 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - right_childs = list(word.rights) - right_child = right_childs[0] if right_childs else None - - if right_child: - if ( - right_child.dep == adj_label - ): # allow chain of adjectives by expanding to right - right_end = right_child.right_edge - elif ( - right_child.dep == det_label and right_child.pos == det_pos - ): # cut relative pronouns here - right_end = right_child - elif right_child.dep in np_modifs: # Check if we can expand to right - right_end = word.right_edge - else: - right_end = word - else: - right_end = word - prev_end = right_end.i - - left_index = word.left_edge.i - left_index = ( - left_index + 1 if word.left_edge.pos == adp_pos else left_index - ) - - yield left_index, right_end.i + 1, np_label - elif word.dep == conj_label: + prev_end = word.right_edge.i + yield word.left_edge.i, word.right_edge.i + 1, np_label + elif word.dep == conj: head = word.head - while head.dep == conj_label and head.head.i < head.i: + while head.dep == conj and head.head.i < head.i: head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.i - - left_index = 
word.left_edge.i # eliminate left attached conjunction - left_index = ( - left_index + 1 if word.left_edge.pos == conj_pos else left_index - ) - yield left_index, word.i + 1, np_label + prev_end = word.right_edge.i + yield word.left_edge.i, word.right_edge.i + 1, np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index ecf322bd7..1edebc837 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -6,15 +6,13 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ...language import Language, BaseDefaults from .lemmatizer import ItalianLemmatizer -from .syntax_iterators import SYNTAX_ITERATORS class ItalianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES - stop_words = STOP_WORDS - syntax_iterators = SYNTAX_ITERATORS class Italian(Language): diff --git a/spacy/lang/it/syntax_iterators.py b/spacy/lang/it/syntax_iterators.py deleted file mode 100644 index f63df3fad..000000000 --- a/spacy/lang/it/syntax_iterators.py +++ /dev/null @@ -1,86 +0,0 @@ -from typing import Union, Iterator, Tuple - -from ...symbols import NOUN, PROPN, PRON -from ...errors import Errors -from ...tokens import Doc, Span - - -def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: - """ - Detect base noun phrases from a dependency parse. Works on both Doc and Span. - """ - labels = [ - "nsubj", - "nsubj:pass", - "obj", - "obl", - "obl:agent", - "nmod", - "pcomp", - "appos", - "ROOT", - ] - post_modifiers = ["flat", "flat:name", "fixed", "compound"] - dets = ["det", "det:poss"] - doc = doclike.doc # Ensure works on both Doc and Span. 
- if not doc.has_annotation("DEP"): - raise ValueError(Errors.E029) - np_deps = {doc.vocab.strings.add(label) for label in labels} - np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} - np_label = doc.vocab.strings.add("NP") - adj_label = doc.vocab.strings.add("amod") - det_labels = {doc.vocab.strings.add(det) for det in dets} - det_pos = doc.vocab.strings.add("DET") - adp_label = doc.vocab.strings.add("ADP") - conj = doc.vocab.strings.add("conj") - conj_pos = doc.vocab.strings.add("CCONJ") - prev_end = -1 - for i, word in enumerate(doclike): - if word.pos not in (NOUN, PROPN, PRON): - continue - # Prevent nested chunks from being produced - if word.left_edge.i <= prev_end: - continue - if word.dep in np_deps: - right_childs = list(word.rights) - right_child = right_childs[0] if right_childs else None - - if right_child: - if ( - right_child.dep == adj_label - ): # allow chain of adjectives by expanding to right - right_end = right_child.right_edge - elif ( - right_child.dep in det_labels and right_child.pos == det_pos - ): # cut relative pronouns here - right_end = right_child - elif right_child.dep in np_modifs: # Check if we can expand to right - right_end = word.right_edge - else: - right_end = word - else: - right_end = word - prev_end = right_end.i - - left_index = word.left_edge.i - left_index = ( - left_index + 1 if word.left_edge.pos == adp_label else left_index - ) - - yield left_index, right_end.i + 1, np_label - elif word.dep == conj: - head = word.head - while head.dep == conj and head.head.i < head.i: - head = head.head - # If the head is an NP, and we're coordinated to it, we're an NP - if head.dep in np_deps: - prev_end = word.i - - left_index = word.left_edge.i # eliminate left attached conjunction - left_index = ( - left_index + 1 if word.left_edge.pos == conj_pos else left_index - ) - yield left_index, word.i + 1, np_label - - -SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/nb/stop_words.py 
b/spacy/lang/nb/stop_words.py index d9ed414ef..fd65dd788 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -4,42 +4,46 @@ alle allerede alt and andre annen annet at av bak bare bedre beste blant ble bli blir blitt bris by både -da dag de del dem den denne der dermed det dette disse du +da dag de del dem den denne der dermed det dette disse drept du eller en enn er et ett etter -fem fikk fire fjor flere folk for fortsatt fra fram +fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag funnet få får fått før først første gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går -ha hadde ham han hans har hele helt henne hennes her hun +ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan +hvorfor i ifølge igjen ikke ingen inn ja jeg kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld +kvinner -la laget land landet langt leder ligger like litt løpet +la laget land landet langt leder ligger like litt løpet lørdag -man mange med meg mellom men mener mennesker mens mer mot mye må mål måtte +man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer +millioner minutter mot msci mye må mål måtte -ned neste noe noen nok ny nye nå når +ned neste noe noen nok norge norsk norske ntb ny nye nå når -og også om opp opplyser oss over +og også om onsdag opp opplyser oslo oss over -personer plass poeng på +personer plass poeng politidistrikt politiet president prosent på -runde rundt +regjeringen runde rundt russland -sa saken samme sammen samtidig satt se seg seks selv senere ser sett +sa saken samme sammen samtidig satt se seg seks selv senere september ser sett siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor -store står svært så +store står sverige svært så søndag -ta tatt tid tidligere til tilbake tillegg tok tror +ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror +tyskland -under ut uten utenfor +under usa 
ut uten utenfor vant var ved veldig vi videre viktig vil ville viser vår være vært diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index c9004ed5d..6fb01a183 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,10 +1,13 @@ # Source: https://github.com/stopwords-iso/stopwords-sl -# Removed various words that are not normally considered stop words, such as months. +# TODO: probably needs to be tidied up – the list seems to have month names in +# it, which shouldn't be considered stop words. STOP_WORDS = set( """ a ali +april +avgust b bi bil @@ -16,6 +19,7 @@ biti blizu bo bodo +bojo bolj bom bomo @@ -33,6 +37,16 @@ da daleč dan danes +datum +december +deset +deseta +deseti +deseto +devet +deveta +deveti +deveto do dober dobra @@ -40,7 +54,16 @@ dobri dobro dokler dol +dolg +dolga +dolgi dovolj +drug +druga +drugi +drugo +dva +dve e eden en @@ -51,6 +74,7 @@ enkrat eno etc. f +februar g g. ga @@ -69,12 +93,16 @@ iv ix iz j +januar jaz je ji jih jim jo +julij +junij +jutri k kadarkoli kaj @@ -95,23 +123,41 @@ kje kjer kjerkoli ko +koder koderkoli koga komu kot +kratek +kratka +kratke +kratki l +lahka +lahke +lahki +lahko le lep lepa lepe lepi lepo +leto m +maj +majhen +majhna +majhni +malce +malo manj +marec me med medtem mene +mesec mi midva midve @@ -137,6 +183,7 @@ najmanj naju največ nam +narobe nas nato nazaj @@ -145,6 +192,7 @@ naša naše ne nedavno +nedelja nek neka nekaj @@ -188,6 +236,7 @@ njuna njuno no nocoj +november npr. o ob @@ -195,23 +244,51 @@ oba obe oboje od +odprt +odprta +odprti okoli +oktober on onadva one oni onidve +osem +osma +osmi +osmo oz. p pa +pet +peta +petek +peti +peto po pod pogosto poleg +poln +polna +polni +polno ponavadi +ponedeljek ponovno potem povsod +pozdravljen +pozdravljeni +prav +prava +prave +pravi +pravo +prazen +prazna +prazno prbl. precej pred @@ -220,10 +297,19 @@ preko pri pribl. 
približno +primer +pripravljen +pripravljena +pripravljeni proti +prva +prvi +prvo r +ravno redko res +reč s saj sam @@ -235,17 +321,29 @@ se sebe sebi sedaj +sedem +sedma +sedmi +sedmo sem +september seveda si sicer skoraj skozi +slab smo so +sobota spet +sreda +srednja +srednji sta ste +stran +stvar sva t ta @@ -260,6 +358,10 @@ te tebe tebi tega +težak +težka +težki +težko ti tista tiste @@ -269,6 +371,11 @@ tj. tja to toda +torek +tretja +tretje +tretji +tri tu tudi tukaj @@ -285,6 +392,10 @@ vaša vaše ve vedno +velik +velika +veliki +veliko vendar ves več @@ -292,6 +403,10 @@ vi vidva vii viii +visok +visoka +visoke +visoki vsa vsaj vsak @@ -305,21 +420,34 @@ vsega vsi vso včasih +včeraj x z za zadaj zadnji zakaj +zaprta +zaprti +zaprto zdaj zelo zunaj č če često +četrta +četrtek +četrti +četrto čez čigav š +šest +šesta +šesti +šesto +štiri ž že """.split() diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index ee90a9f38..ffca79bb9 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -155,11 +155,6 @@ def fr_tokenizer(): return get_lang_class("fr")().tokenizer -@pytest.fixture(scope="session") -def fr_vocab(): - return get_lang_class("fr")().vocab - - @pytest.fixture(scope="session") def ga_tokenizer(): return get_lang_class("ga")().tokenizer @@ -210,11 +205,6 @@ def it_tokenizer(): return get_lang_class("it")().tokenizer -@pytest.fixture(scope="session") -def it_vocab(): - return get_lang_class("it")().vocab - - @pytest.fixture(scope="session") def ja_tokenizer(): pytest.importorskip("sudachipy") diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 25b95f566..48ac88ead 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -1,230 +1,8 @@ -from spacy.tokens import Doc import pytest -# fmt: off -@pytest.mark.parametrize( - "words,heads,deps,pos,chunk_offsets", - [ - # determiner + noun - # un nom -> un nom - ( - ["un", "nom"], - [1, 1], 
- ["det", "ROOT"], - ["DET", "NOUN"], - [(0, 2)], - ), - # determiner + noun starting with vowel - # l'heure -> l'heure - ( - ["l'", "heure"], - [1, 1], - ["det", "ROOT"], - ["DET", "NOUN"], - [(0, 2)], - ), - # determiner + plural noun - # les romans -> les romans - ( - ["les", "romans"], - [1, 1], - ["det", "ROOT"], - ["DET", "NOUN"], - [(0, 2)], - ), - # det + adj + noun - # Le vieux Londres -> Le vieux Londres - ( - ['Les', 'vieux', 'Londres'], - [2, 2, 2], - ["det", "amod", "ROOT"], - ["DET", "ADJ", "NOUN"], - [(0,3)] - ), - # det + noun + adj - # le nom propre -> le nom propre a proper noun - ( - ["le", "nom", "propre"], - [1, 1, 1], - ["det", "ROOT", "amod"], - ["DET", "NOUN", "ADJ"], - [(0, 3)], - ), - # det + noun + adj plural - # Les chiens bruns -> les chiens bruns - ( - ["Les", "chiens", "bruns"], - [1, 1, 1], - ["det", "ROOT", "amod"], - ["DET", "NOUN", "ADJ"], - [(0, 3)], - ), - # multiple adjectives: one adj before the noun, one adj after the noun - # un nouveau film intéressant -> un nouveau film intéressant - ( - ["un", "nouveau", "film", "intéressant"], - [2, 2, 2, 2], - ["det", "amod", "ROOT", "amod"], - ["DET", "ADJ", "NOUN", "ADJ"], - [(0,4)] - ), - # multiple adjectives, both adjs after the noun - # une personne intelligente et drôle -> une personne intelligente et drôle - ( - ["une", "personne", "intelligente", "et", "drôle"], - [1, 1, 1, 4, 2], - ["det", "ROOT", "amod", "cc", "conj"], - ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], - [(0,5)] - ), - # relative pronoun - # un bus qui va au ville -> un bus, qui, ville - ( - ['un', 'bus', 'qui', 'va', 'au', 'ville'], - [1, 1, 3, 1, 5, 3], - ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'], - ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'], - [(0,2), (2,3), (5,6)] - ), - # relative subclause - # Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy. 
- ( - ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'], - [0, 2, 0, 5, 5, 2, 5], - ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'], - ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'], - [(1,3), (4,5)] - ), - # Person name and title by flat - # Louis XIV -> Louis XIV - ( - ["Louis", "XIV"], - [0, 0], - ["ROOT", "flat:name"], - ["PROPN", "PROPN"], - [(0,2)] - ), - # Organization name by flat - # Nations Unies -> Nations Unies - ( - ["Nations", "Unies"], - [0, 0], - ["ROOT", "flat:name"], - ["PROPN", "PROPN"], - [(0,2)] - ), - # Noun compound, person name created by two flats - # Louise de Bratagne -> Louise de Bratagne - ( - ["Louise", "de", "Bratagne"], - [0, 0, 0], - ["ROOT", "flat:name", "flat:name"], - ["PROPN", "PROPN", "PROPN"], - [(0,3)] - ), - # Noun compound, person name created by two flats - # Louis François Joseph -> Louis François Joseph - ( - ["Louis", "François", "Joseph"], - [0, 0, 0], - ["ROOT", "flat:name", "flat:name"], - ["PROPN", "PROPN", "PROPN"], - [(0,3)] - ), - # one determiner + one noun + one adjective qualified by an adverb - # quelques agriculteurs très riches -> quelques agriculteurs très riches - ( - ["quelques", "agriculteurs", "très", "riches"], - [1, 1, 3, 1], - ['det', 'ROOT', 'advmod', 'amod'], - ['DET', 'NOUN', 'ADV', 'ADJ'], - [(0,4)] - ), - # Two NPs conjuncted - # Il a un chien et un chat -> Il, un chien, un chat - ( - ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'], - [1, 1, 3, 1, 6, 6, 3], - ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], - ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], - [(0,1), (2,4), (5,7)] - - ), - # Two NPs together - # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado - ( - ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'], - [1, 1, 1, 1, 3], - ['det', 'ROOT', 'amod', 'appos', 'flat:name'], - ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], - [(0, 3), (3, 5)] - ), - # nmod relation between NPs - # la destruction de la ville -> la 
destruction, la ville - ( - ['la', 'destruction', 'de', 'la', 'ville'], - [1, 1, 4, 4, 1], - ['det', 'ROOT', 'case', 'det', 'nmod'], - ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'], - [(0,2), (3,5)] - ), - # nmod relation between NPs - # Archiduchesse d’Autriche -> Archiduchesse, Autriche - ( - ['Archiduchesse', 'd’', 'Autriche'], - [0, 2, 0], - ['ROOT', 'case', 'nmod'], - ['NOUN', 'ADP', 'PROPN'], - [(0,1), (2,3)] - ), - # Compounding by nmod, several NPs chained together - # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement - ( - ["la", "première", "usine", "de", "drogue", "du", "gouvernement"], - [2, 2, 2, 4, 2, 6, 2], - ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], - ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], - [(0, 3), (4, 5), (6, 7)] - ), - # several NPs - # Traduction du rapport de Susana -> Traduction, rapport, Susana - ( - ['Traduction', 'du', 'raport', 'de', 'Susana'], - [0, 2, 0, 4, 2], - ['ROOT', 'case', 'nmod', 'case', 'nmod'], - ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], - [(0,1), (2,3), (4,5)] - - ), - # Several NPs - # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie - ( - ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'], - [2, 2, 2, 4, 2, 7, 7, 2], - ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'], - ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'], - [(0,3), (4,5), (6,8)] - ), - # Passive subject - # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton - ( - ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'], - [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8], - ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'], - ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'], - [(0, 3), (6, 10), (11, 12)] - ) - ], -) 
-# fmt: on -def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets): - doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos) - assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets - - def test_noun_chunks_is_parsed_fr(fr_tokenizer): """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.""" - doc = fr_tokenizer("Je suis allé à l'école") + doc = fr_tokenizer("trouver des travaux antérieurs") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/it/test_noun_chunks.py b/spacy/tests/lang/it/test_noun_chunks.py deleted file mode 100644 index 0a8c10e79..000000000 --- a/spacy/tests/lang/it/test_noun_chunks.py +++ /dev/null @@ -1,221 +0,0 @@ -from spacy.tokens import Doc -import pytest - - -# fmt: off -@pytest.mark.parametrize( - "words,heads,deps,pos,chunk_offsets", - [ - # determiner + noun - # un pollo -> un pollo - ( - ["un", "pollo"], - [1, 1], - ["det", "ROOT"], - ["DET", "NOUN"], - [(0,2)], - ), - # two determiners + noun - # il mio cane -> il mio cane - ( - ["il", "mio", "cane"], - [2, 2, 2], - ["det", "det:poss", "ROOT"], - ["DET", "DET", "NOUN"], - [(0,3)], - ), - # two determiners, one is after noun. rare usage but still testing - # il cane mio-> il cane mio - ( - ["il", "cane", "mio"], - [1, 1, 1], - ["det", "ROOT", "det:poss"], - ["DET", "NOUN", "DET"], - [(0,3)], - ), - # relative pronoun - # È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty. 
- ( - ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"], - [2, 2, 2, 4, 2, 7, 7, 4], - ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'], - ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'], - [(3,5), (5,6)] - ), - # relative subclause - # il computer che hai comprato -> il computer, che the computer that you bought - ( - ['il', 'computer', 'che', 'hai', 'comprato'], - [1, 1, 4, 4, 1], - ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'], - ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'], - [(0,2), (2,3)] - ), - # det + noun + adj - # Una macchina grande -> Una macchina grande - ( - ["Una", "macchina", "grande"], - [1, 1, 1], - ["det", "ROOT", "amod"], - ["DET", "NOUN", "ADJ"], - [(0,3)], - ), - # noun + adj plural - # mucche bianche - ( - ["mucche", "bianche"], - [0, 0], - ["ROOT", "amod"], - ["NOUN", "ADJ"], - [(0,2)], - ), - # det + adj + noun - # Una grande macchina -> Una grande macchina - ( - ['Una', 'grande', 'macchina'], - [2, 2, 2], - ["det", "amod", "ROOT"], - ["DET", "ADJ", "NOUN"], - [(0,3)] - ), - # det + adj + noun, det with apostrophe - # un'importante associazione -> un'importante associazione - ( - ["Un'", 'importante', 'associazione'], - [2, 2, 2], - ["det", "amod", "ROOT"], - ["DET", "ADJ", "NOUN"], - [(0,3)] - ), - # multiple adjectives - # Un cane piccolo e marrone -> Un cane piccolo e marrone - ( - ["Un", "cane", "piccolo", "e", "marrone"], - [1, 1, 1, 4, 2], - ["det", "ROOT", "amod", "cc", "conj"], - ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], - [(0,5)] - ), - # determiner, adjective, compound created by flat - # le Nazioni Unite -> le Nazioni Unite - ( - ["le", "Nazioni", "Unite"], - [1, 1, 1], - ["det", "ROOT", "flat:name"], - ["DET", "PROPN", "PROPN"], - [(0,3)] - ), - # one determiner + one noun + one adjective qualified by an adverb - # alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers - ( - ['alcuni', 'contadini', 'molto', 'ricchi'], - [1, 1, 3, 1], - ['det', 'ROOT', 
'advmod', 'amod'], - ['DET', 'NOUN', 'ADV', 'ADJ'], - [(0,4)] - ), - # Two NPs conjuncted - # Ho un cane e un gatto -> un cane, un gatto - ( - ['Ho', 'un', 'cane', 'e', 'un', 'gatto'], - [0, 2, 0, 5, 5, 0], - ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'], - ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], - [(1,3), (4,6)] - - ), - # Two NPs together - # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado - ( - ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'], - [1, 1, 1, 1, 3], - ['det', 'ROOT', 'amod', 'nmod', 'flat:name'], - ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], - [(0, 3), (3, 5)] - ), - # Noun compound, person name and titles - # Dom Pedro II -> Dom Pedro II - ( - ["Dom", "Pedro", "II"], - [0, 0, 0], - ["ROOT", "flat:name", "flat:name"], - ["PROPN", "PROPN", "PROPN"], - [(0,3)] - ), - # Noun compound created by flat - # gli Stati Uniti - ( - ["gli", "Stati", "Uniti"], - [1, 1, 1], - ["det", "ROOT", "flat:name"], - ["DET", "PROPN", "PROPN"], - [(0,3)] - ), - # nmod relation between NPs - # la distruzione della città -> la distruzione, città - ( - ['la', 'distruzione', 'della', 'città'], - [1, 1, 3, 1], - ['det', 'ROOT', 'case', 'nmod'], - ['DET', 'NOUN', 'ADP', 'NOUN'], - [(0,2), (3,4)] - ), - # Compounding by nmod, several NPs chained together - # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo - ( - ["la", "prima", "fabbrica", "di", "droga", "del", "governo"], - [2, 2, 2, 4, 2, 6, 2], - ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], - ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], - [(0, 3), (4, 5), (6, 7)] - ), - # several NPs - # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana - ( - ['Traduzione', 'del', 'rapporto', 'di', 'Susana'], - [0, 2, 0, 4, 2], - ['ROOT', 'case', 'nmod', 'case', 'nmod'], - ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], - [(0,1), (2,3), (4,5)] - - ), - # Several NPs - # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua 
amica - ( - ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'], - [1, 1, 1, 4, 1, 8, 8, 8, 1], - ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'], - ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'], - [(0,3), (4,5), (6,9)] - ), - # Passive subject - # La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton - ( - ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'], - [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9], - ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'], - ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], - [(0, 3), (6, 8), (9, 10), (11,12)] - ), - # Misc - # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti - ( - ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'], - [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17], - ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'], - ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'], - [(2,4), (9,12), (13,14), (17,18), (19,20)] - ) - ], -) -# fmt: on -def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets): - doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos) - assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets - - -def test_noun_chunks_is_parsed_it(it_tokenizer): - 
"""Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed.""" - doc = it_tokenizer("Sei andato a Oxford") - with pytest.raises(ValueError): - list(doc.noun_chunks) From 47ea6704f1045ee3a04ac7ffbfedba01d944e233 Mon Sep 17 00:00:00 2001 From: Natalia Rodnova <4512370+nrodnova@users.noreply.github.com> Date: Mon, 17 Jan 2022 03:17:49 -0700 Subject: [PATCH 009/177] Span richcmp fix (#9956) * Corrected Span's __richcmp__ implementation to take end, label and kb_id in consideration * Updated test * Updated test * Removed formatting from a test for readability sake * Use same tuples for all comparisons Co-authored-by: Adriane Boyd --- spacy/tests/doc/test_span.py | 49 ++++++++++++++++++++++++++++++++++++ spacy/tokens/span.pyx | 28 ++++++--------------- 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 10aba5b94..bdf34c1c1 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -573,6 +573,55 @@ def test_span_with_vectors(doc): doc.vocab.vectors = prev_vectors +# fmt: off +def test_span_comparison(doc): + + # Identical start, end, only differ in label and kb_id + assert Span(doc, 0, 3) == Span(doc, 0, 3) + assert Span(doc, 0, 3, "LABEL") == Span(doc, 0, 3, "LABEL") + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") == Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL") + assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 0, 3, "LABEL") != Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3) <= Span(doc, 0, 3) and Span(doc, 0, 3) >= Span(doc, 0, 3) + assert Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL") and Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "LABEL") + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + assert 
(Span(doc, 0, 3) < Span(doc, 0, 3, "", kb_id="KB_ID") < Span(doc, 0, 3, "LABEL") < Span(doc, 0, 3, "LABEL", kb_id="KB_ID")) + assert (Span(doc, 0, 3) <= Span(doc, 0, 3, "", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")) + + assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") > Span(doc, 0, 3, "LABEL") > Span(doc, 0, 3, "", kb_id="KB_ID") > Span(doc, 0, 3)) + assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "", kb_id="KB_ID") >= Span(doc, 0, 3)) + + # Different end + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4) + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 4) + assert Span(doc, 0, 4) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 0, 4) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + # Different start + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3) + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3) + assert Span(doc, 1, 3) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 1, 3) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + # Different start & different end + assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3) + assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3) + assert Span(doc, 1, 3) > Span(doc, 0, 4, "LABEL", kb_id="KB_ID") + assert Span(doc, 1, 3) >= Span(doc, 0, 4, "LABEL", kb_id="KB_ID") +# fmt: on + + @pytest.mark.parametrize( "start,end,expected_sentences,expected_sentences_with_hook", [ diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index cd02cab36..5484b25fd 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -126,38 +126,26 @@ cdef class Span: 
return False else: return True + self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.doc) + other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.doc) # < if op == 0: - return self.c.start_char < other.c.start_char + return self_tuple < other_tuple # <= elif op == 1: - return self.c.start_char <= other.c.start_char + return self_tuple <= other_tuple # == elif op == 2: - # Do the cheap comparisons first - return ( - (self.c.start_char == other.c.start_char) and \ - (self.c.end_char == other.c.end_char) and \ - (self.c.label == other.c.label) and \ - (self.c.kb_id == other.c.kb_id) and \ - (self.doc == other.doc) - ) + return self_tuple == other_tuple # != elif op == 3: - # Do the cheap comparisons first - return not ( - (self.c.start_char == other.c.start_char) and \ - (self.c.end_char == other.c.end_char) and \ - (self.c.label == other.c.label) and \ - (self.c.kb_id == other.c.kb_id) and \ - (self.doc == other.doc) - ) + return self_tuple != other_tuple # > elif op == 4: - return self.c.start_char > other.c.start_char + return self_tuple > other_tuple # >= elif op == 5: - return self.c.start_char >= other.c.start_char + return self_tuple >= other_tuple def __hash__(self): return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id)) From 39f1b13e7729c5fa41fd28972539cc35fce9398a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 17 Jan 2022 11:48:39 +0100 Subject: [PATCH 010/177] Update sudachipy extras (#10072) By @polm, redone from #9917 after incorrect (reverted) rebase. `sudachipy>=0.5.2` is needed for newer dictionaries. `sudachipy<0.6.0` is kept for users who might still prefer the older version, in particular to be able to compile it without rust. 
--- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 50e982cbf..586a044ff 100644 --- a/setup.cfg +++ b/setup.cfg @@ -108,8 +108,8 @@ apple = thinc-apple-ops>=0.0.4,<1.0.0 # Language tokenizers with external dependencies ja = - sudachipy>=0.4.9 - sudachidict_core>=20200330 + sudachipy>=0.5.2,!=0.6.1 + sudachidict_core>=20211220 ko = natto-py==0.9.0 th = From c28e33637bf7c7beef8658db7bfc33182adeca87 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 18 Jan 2022 17:36:28 +0900 Subject: [PATCH 011/177] Mark flaky spancat test so it doesn't fail the build (#10075) * Mark flaky spancat test so it doesn't fail the build * Skip, don't run and ignore --- spacy/tests/pipeline/test_spancat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 2f7e952d3..39d2e97da 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -79,7 +79,8 @@ def test_explicit_labels(): nlp.initialize() assert spancat.labels == ("PERSON", "LOC") - +#TODO figure out why this is flaky +@pytest.mark.skip(reason="Test is unreliable for unknown reason") def test_doc_gc(): # If the Doc object is garbage collected, the spans won't be functional afterwards nlp = Language() @@ -97,6 +98,7 @@ def test_doc_gc(): assert isinstance(spangroups, SpanGroups) for key, spangroup in spangroups.items(): assert isinstance(spangroup, SpanGroup) + # XXX This fails with length 0 sometimes assert len(spangroup) > 0 with pytest.raises(RuntimeError): span = spangroup[0] From 4dfd559e5569f73846d5280d86487104f8550b0d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 18 Jan 2022 17:12:42 +0100 Subject: [PATCH 012/177] Fix spaces in Doc.from_docs for empty docs (#10052) Fix spaces in `Doc.from_docs(ensure_whitespace=True)` for cases where a doc ending in whitespace is followed by an empty doc.
--- spacy/tests/doc/test_doc_api.py | 5 +++-- spacy/tokens/doc.pyx | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index c6195d7e2..10700b787 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): "Merging the docs is fun.", "", "They don't think alike. ", + "", "Another doc.", ] en_texts_without_empty = [t for t in en_texts if len(t)] @@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): en_docs = [en_tokenizer(text) for text in en_texts] en_docs[0].spans["group"] = [en_docs[0][1:4]] en_docs[2].spans["group"] = [en_docs[2][1:4]] - en_docs[3].spans["group"] = [en_docs[3][0:1]] + en_docs[4].spans["group"] = [en_docs[4][0:1]] span_group_texts = sorted( - [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text] + [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text] ) de_doc = de_tokenizer(de_text) Token.set_extension("is_ambiguous", default=False) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 362a17784..2f82a0d1b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1183,7 +1183,7 @@ cdef class Doc: token_offset = -1 for doc in docs[:-1]: token_offset += len(doc) - if not (len(doc) > 0 and doc[-1].is_space): + if len(doc) > 0 and not doc[-1].is_space: concat_spaces[token_offset] = True concat_array = numpy.concatenate(arrays) From 50d2a2c93071f4d96606ba0d5985c54b59184cbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 18 Jan 2022 17:14:35 +0100 Subject: [PATCH 013/177] Use fewer Vector internals (#9879) * Use Vectors.shape rather than Vectors.data.shape * Use Vectors.size rather than Vectors.data.size * Add Vectors.to_ops to move data between different ops * Add documentation for Vectors.to_ops --- spacy/language.py | 8 ++++---- spacy/ml/models/multi_task.py | 4 ++--
spacy/ml/staticvectors.py | 2 +- spacy/tests/vocab_vectors/test_vectors.py | 10 +++++----- spacy/tokens/doc.pyx | 4 ++-- spacy/tokens/span.pyx | 2 +- spacy/training/initialize.py | 2 +- spacy/vectors.pyx | 7 +++++-- spacy/vocab.pyx | 4 ++-- website/docs/api/vectors.md | 17 +++++++++++++++++ 10 files changed, 40 insertions(+), 20 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 638616316..798254b80 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1285,9 +1285,9 @@ class Language: ) except IOError: raise IOError(Errors.E884.format(vectors=I["vectors"])) - if self.vocab.vectors.data.shape[1] >= 1: + if self.vocab.vectors.shape[1] >= 1: ops = get_current_ops() - self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) + self.vocab.vectors.to_ops(ops) if hasattr(self.tokenizer, "initialize"): tok_settings = validate_init_settings( self.tokenizer.initialize, # type: ignore[union-attr] @@ -1332,8 +1332,8 @@ class Language: DOCS: https://spacy.io/api/language#resume_training """ ops = get_current_ops() - if self.vocab.vectors.data.shape[1] >= 1: - self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) + if self.vocab.vectors.shape[1] >= 1: + self.vocab.vectors.to_ops(ops) for name, proc in self.pipeline: if hasattr(proc, "_rehearsal_model"): proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined] diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 37473b7f4..9e1face63 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -23,7 +23,7 @@ def create_pretrain_vectors( maxout_pieces: int, hidden_size: int, loss: str ) -> Callable[["Vocab", Model], Model]: def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model: - if vocab.vectors.data.shape[1] == 0: + if vocab.vectors.shape[1] == 0: raise ValueError(Errors.E875) model = build_cloze_multi_task_model( vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces @@ -116,7 +116,7 @@ def 
build_multi_task_model( def build_cloze_multi_task_model( vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int ) -> Model: - nO = vocab.vectors.data.shape[1] + nO = vocab.vectors.shape[1] output_layer = chain( cast(Model[List["Floats2d"], Floats2d], list2array()), Maxout( diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 8dd65833b..8d9b1af9b 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -94,7 +94,7 @@ def init( nM = model.get_dim("nM") if model.has_dim("nM") else None nO = model.get_dim("nO") if model.has_dim("nO") else None if X is not None and len(X): - nM = X[0].vocab.vectors.data.shape[1] + nM = X[0].vocab.vectors.shape[1] if Y is not None: nO = Y.data.shape[1] diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 9dc40b499..0650a7487 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -421,7 +421,7 @@ def test_vector_is_oov(): def test_init_vectors_unset(): v = Vectors(shape=(10, 10)) assert v.is_full is False - assert v.data.shape == (10, 10) + assert v.shape == (10, 10) with pytest.raises(ValueError): v = Vectors(shape=(10, 10), mode="floret") @@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): # rows: 2 rows per ngram rows = OPS.xp.asarray( [ - h % nlp.vocab.vectors.data.shape[0] + h % nlp.vocab.vectors.shape[0] for ngram in ngrams for h in nlp.vocab.vectors._get_ngram_hashes(ngram) ], @@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): # an empty key returns 0s assert_equal( OPS.to_numpy(nlp.vocab[""].vector), - numpy.zeros((nlp.vocab.vectors.data.shape[0],)), + numpy.zeros((nlp.vocab.vectors.shape[0],)), ) # an empty batch returns 0s assert_equal( OPS.to_numpy(nlp.vocab.vectors.get_batch([""])), - numpy.zeros((1, nlp.vocab.vectors.data.shape[0])), + numpy.zeros((1, nlp.vocab.vectors.shape[0])), ) 
# an empty key within a batch returns 0s assert_equal( OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]), - numpy.zeros((nlp.vocab.vectors.data.shape[0],)), + numpy.zeros((nlp.vocab.vectors.shape[0],)), ) # the loaded ngram vector table cannot be modified diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2f82a0d1b..5a0db115d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -616,7 +616,7 @@ cdef class Doc: """ if "has_vector" in self.user_hooks: return self.user_hooks["has_vector"](self) - elif self.vocab.vectors.data.size: + elif self.vocab.vectors.size: return True elif self.tensor.size: return True @@ -641,7 +641,7 @@ cdef class Doc: if not len(self): self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") return self._vector - elif self.vocab.vectors.data.size > 0: + elif self.vocab.vectors.size > 0: self._vector = sum(t.vector for t in self) / len(self) return self._vector elif self.tensor.size > 0: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index cd02cab36..9bb6bf2e7 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -497,7 +497,7 @@ cdef class Span: """ if "has_vector" in self.doc.user_span_hooks: return self.doc.user_span_hooks["has_vector"](self) - elif self.vocab.vectors.data.size > 0: + elif self.vocab.vectors.size > 0: return any(token.has_vector for token in self) elif self.doc.tensor.size > 0: return True diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 084204389..b59288e38 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -164,7 +164,7 @@ def load_vectors_into_model( len(vectors_nlp.vocab.vectors.keys()) == 0 and vectors_nlp.vocab.vectors.mode != VectorsMode.floret ) or ( - vectors_nlp.vocab.vectors.data.shape[0] == 0 + vectors_nlp.vocab.vectors.shape[0] == 0 and vectors_nlp.vocab.vectors.mode == VectorsMode.floret ): logger.warning(Warnings.W112.format(name=name)) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx 
index 345e8df68..bc4863703 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -10,7 +10,7 @@ from typing import cast import warnings from enum import Enum import srsly -from thinc.api import get_array_module, get_current_ops +from thinc.api import Ops, get_array_module, get_current_ops from thinc.backends import get_array_ops from thinc.types import Floats2d @@ -146,7 +146,7 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#size """ - return self.data.shape[0] * self.data.shape[1] + return self.data.size @property def is_full(self): @@ -517,6 +517,9 @@ cdef class Vectors: for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) + def to_ops(self, ops: Ops): + self.data = ops.asarray(self.data) + def _get_cfg(self): if self.mode == Mode.default: return { diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index e2e7ad1db..badd291ed 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -283,7 +283,7 @@ cdef class Vocab: @property def vectors_length(self): - return self.vectors.data.shape[1] + return self.vectors.shape[1] def reset_vectors(self, *, width=None, shape=None): """Drop the current vector table. Because all vectors must be the same @@ -294,7 +294,7 @@ cdef class Vocab: elif shape is not None: self.vectors = Vectors(strings=self.strings, shape=shape) else: - width = width if width is not None else self.vectors.data.shape[1] + width = width if width is not None else self.vectors.shape[1] self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width)) def prune_vectors(self, nr_row, batch_size=1024): diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 84d2c00ad..b3bee822c 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch. | ------ | --------------------------------------- | | `keys` | The keys. 
~~Iterable[Union[int, str]]~~ | +## Vectors.to_ops {#to_ops tag="method"} + +Change the embedding matrix to use different Thinc ops. + +> #### Example +> +> ```python +> from thinc.api import NumpyOps +> +> vectors.to_ops(NumpyOps()) +> +> ``` + +| Name | Description | +|-------|----------------------------------------------------------| +| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ | + ## Vectors.to_disk {#to_disk tag="method"} Save the current state to a directory. From 2ff53834bb09eea2af3b7715a2516bcf7913a370 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 19 Jan 2022 18:45:11 +0900 Subject: [PATCH 014/177] Add link to pattern file info in EntityRuler.initialize docs (#10091) * Add link to pattern file info in EntityRuler.initialize docs * Update website/docs/api/entityruler.md Co-authored-by: Sofie Van Landeghem --- website/docs/api/entityruler.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 6d8f835bf..1ef283870 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -99,9 +99,9 @@ be a token pattern (list) or a phrase pattern (string). For example: ## EntityRuler.initialize {#initialize tag="method" new="3"} Initialize the component with data and used before training to load in rules -from a file. This method is typically called by -[`Language.initialize`](/api/language#initialize) and lets you customize -arguments it receives via the +from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method +is typically called by [`Language.initialize`](/api/language#initialize) and +lets you customize arguments it receives via the [`[initialize.components]`](/api/data-formats#config-initialize) block in the config. 
From 7d528e607c0c6cd267d42b2ea36e96bc25e7bd80 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Jan 2022 10:53:40 +0100 Subject: [PATCH 015/177] Update quickstart install steps (#10092) * For conda: * Use conda environment rather than venv * Install `spacy-transformers` as a conda package * For pip: * Add quotes if extras are included --- website/src/widgets/quickstart-install.js | 42 +++++++++++++++++------ 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index 628e1c533..1c8ad19da 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -113,8 +113,7 @@ const QuickstartInstall = ({ id, title }) => { { id: 'venv', title: 'virtual env', - help: - 'Use a virtual environment and install spaCy into a user directory', + help: 'Use a virtual environment', }, { id: 'train', @@ -165,27 +164,51 @@ const QuickstartInstall = ({ id, title }) => { setters={setters} showDropdown={showDropdown} > - python -m venv .env - + + python -m venv .env + + source .env/bin/activate - + source .env/bin/activate - + .env\Scripts\activate + + python -m venv .env + + + source .env/bin/activate + + + source .env/bin/activate + + + .env\Scripts\activate + + + conda create -n venv + + + conda activate venv + pip install -U pip setuptools wheel pip install -U pip setuptools wheel - pip install -U {pkg} - {pipExtras && `[${pipExtras}]`} + {pipExtras + ? `pip install -U '${pkg}[${pipExtras}]'` + : `pip install -U ${pkg}`} {nightly ? ' --pre' : ''} conda install -c conda-forge spacy conda install -c conda-forge cupy + + conda install -c conda-forge spacy-transformers + git clone https://github.com/{repo} {nightly ? 
` --branch ${DEFAULT_BRANCH}` : ''} @@ -205,9 +228,6 @@ const QuickstartInstall = ({ id, title }) => { # packages only available via pip - - pip install spacy-transformers - pip install spacy-lookups-data From e9c631453968288f224a1ab5861bf59a9c109f63 Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Thu, 20 Jan 2022 11:40:46 +0100 Subject: [PATCH 016/177] Bugfix for similarity return types (#10051) --- spacy/lexeme.pyx | 6 ++-- spacy/tests/vocab_vectors/test_similarity.py | 34 ++++++++++++++++---- spacy/tokens/span.pyx | 6 ++-- spacy/tokens/token.pyx | 6 ++-- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 792e405dd..6c66effde 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -130,8 +130,10 @@ cdef class Lexeme: return 0.0 vector = self.vector xp = get_array_module(vector) - return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) - + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() + @property def has_vector(self): """RETURNS (bool): Whether a word vector is associated with the object. 
diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index 3b9308f4d..47cd1f060 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors): assert lex1.vector_norm != 0 assert lex2.vector_norm != 0 assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1] + assert isinstance(lex1.similarity(lex2), float) assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2)) assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1)) @@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors): assert doc[0].vector_norm != 0 assert doc[1].vector_norm != 0 assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1] + assert isinstance(doc[0].similarity(doc[1]), float) assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2)) assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1])) +def test_vectors_similarity_SS(vocab, vectors): + [(word1, vec1), (word2, vec2)] = vectors + doc = Doc(vocab, words=[word1, word2]) + assert isinstance(doc[0:1].similarity(doc[0:2]), float) + assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1]) + + +def test_vectors_similarity_DD(vocab, vectors): + [(word1, vec1), (word2, vec2)] = vectors + doc1 = Doc(vocab, words=[word1, word2]) + doc2 = Doc(vocab, words=[word2, word1]) + assert isinstance(doc1.similarity(doc2), float) + assert doc1.similarity(doc2) == doc2.similarity(doc1) + + def test_vectors_similarity_TD(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) with pytest.warns(UserWarning): + assert isinstance(doc.similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc), float) assert doc.similarity(doc[0]) == doc[0].similarity(doc) -def test_vectors_similarity_DS(vocab, vectors): - [(word1, vec1), (word2, vec2)] = 
vectors - doc = Doc(vocab, words=[word1, word2]) - assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) - - def test_vectors_similarity_TS(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) with pytest.warns(UserWarning): + assert isinstance(doc[:2].similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc[-2]), float) assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) + + +def test_vectors_similarity_DS(vocab, vectors): + [(word1, vec1), (word2, vec2)] = vectors + doc = Doc(vocab, words=[word1, word2]) + assert isinstance(doc.similarity(doc[:2]), float) + assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 9bb6bf2e7..f7ddc5136 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -364,8 +364,10 @@ cdef class Span: return 0.0 vector = self.vector xp = get_array_module(vector) - return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) - + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() + cpdef np.ndarray to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document. 
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index aa97e2b07..c09ec28d6 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -209,8 +209,10 @@ cdef class Token: return 0.0 vector = self.vector xp = get_array_module(vector) - return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) - + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() + def has_morph(self): """Check whether the token has annotated morph information. Return False when the morph annotation is unset/missing. From a55212fca01f97beaf6f07e8ff3fc6e81a0b7de4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Jan 2022 11:42:52 +0100 Subject: [PATCH 017/177] Determine labels by factory name in debug data (#10079) * Determine labels by factory name in debug data For all components, return labels for all components with the corresponding factory name rather than for only the default name. For `spancat`, return labels as a dict keyed by `spans_key`. 
* Refactor for typing * Add test * Use assert instead of cast, removed unneeded arg * Mark test as slow --- spacy/cli/debug_data.py | 38 ++++++++++++++++++++++++++++++++------ spacy/tests/test_cli.py | 27 +++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 6 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 688b07a9b..b9831fe0c 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components from ..schemas import ConfigSchemaTraining from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals.nonproj import DELIMITER -from ..pipeline import Morphologizer +from ..pipeline import Morphologizer, SpanCategorizer from ..morphology import Morphology from ..language import Language from ..util import registry, resolve_dot_names @@ -699,8 +699,34 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int: return count -def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]: - if pipe_name not in nlp.pipe_names: - return set() - pipe = nlp.get_pipe(pipe_name) - return set(pipe.labels) +def _get_labels_from_model( + nlp: Language, factory_name: str +) -> Set[str]: + pipe_names = [ + pipe_name + for pipe_name in nlp.pipe_names + if nlp.get_pipe_meta(pipe_name).factory == factory_name + ] + labels: Set[str] = set() + for pipe_name in pipe_names: + pipe = nlp.get_pipe(pipe_name) + labels.update(pipe.labels) + return labels + + +def _get_labels_from_spancat( + nlp: Language +) -> Dict[str, Set[str]]: + pipe_names = [ + pipe_name + for pipe_name in nlp.pipe_names + if nlp.get_pipe_meta(pipe_name).factory == "spancat" + ] + labels: Dict[str, Set[str]] = {} + for pipe_name in pipe_names: + pipe = nlp.get_pipe(pipe_name) + assert isinstance(pipe, SpanCategorizer) + if pipe.key not in labels: + labels[pipe.key] = set() + labels[pipe.key].update(pipe.labels) + return labels diff --git a/spacy/tests/test_cli.py 
b/spacy/tests/test_cli.py index b0862eab6..253469909 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -12,6 +12,8 @@ from spacy.cli._util import is_subpath_of, load_project_config from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands +from spacy.cli.debug_data import _get_labels_from_model +from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.package import get_third_party_dependencies @@ -665,3 +667,28 @@ def test_get_third_party_dependencies(): ) def test_is_subpath_of(parent, child, expected): assert is_subpath_of(parent, child) == expected + + +@pytest.mark.slow +@pytest.mark.parametrize( + "factory_name,pipe_name", + [ + ("ner", "ner"), + ("ner", "my_ner"), + ("spancat", "spancat"), + ("spancat", "my_spancat"), + ], +) +def test_get_labels_from_model(factory_name, pipe_name): + labels = ("A", "B") + + nlp = English() + pipe = nlp.add_pipe(factory_name, name=pipe_name) + for label in labels: + pipe.add_label(label) + nlp.initialize() + assert nlp.get_pipe(pipe_name).labels == labels + if factory_name == "spancat": + assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels) + else: + assert _get_labels_from_model(nlp, factory_name) == set(labels) From 32bd3856b3b8fe749b77dca7d755366eaa87a2fd Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 20 Jan 2022 20:00:28 +0900 Subject: [PATCH 018/177] Rename FACILITY to FAC in color list (#10067) This matches the English models --- spacy/displacy/render.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 14d741a3d..a032d843b 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = { 
"LOC": "#ff9561", "PERSON": "#aa9cfc", "NORP": "#c887fb", - "FACILITY": "#9cc9cc", + "FAC": "#9cc9cc", "EVENT": "#ffeb80", "LAW": "#ff8197", "LANGUAGE": "#ff8197", From 268ddf8a0611b86ca84ddd0a36a5ead0d177d1f1 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Thu, 20 Jan 2022 13:18:39 +0100 Subject: [PATCH 019/177] Add ENT_IOB key to Matcher (#9649) * added new field * added exception for IOb strings * minor refinement to schema * removed field * fixed typo * imported numeriacla val * changed the code bit * cosmetics * added test for matcher * set ents of moc docs * added invalid pattern * minor update to documentation * blacked matcher * added pattern validation * add IOB vals to schema * changed into test * mypy compat * cleaned left over * added compat import * changed type * added compat import * changed literal a bit * went back to old * made explicit type * Update spacy/schemas.py Co-authored-by: Adriane Boyd * Update spacy/schemas.py Co-authored-by: Adriane Boyd * Update spacy/schemas.py Co-authored-by: Adriane Boyd Co-authored-by: Adriane Boyd --- spacy/matcher/matcher.pyx | 7 +++-- spacy/schemas.py | 3 +++ spacy/tests/matcher/test_matcher_api.py | 27 +++++++++++++++++++ .../tests/matcher/test_pattern_validation.py | 1 + website/docs/api/matcher.md | 1 + 5 files changed, 37 insertions(+), 2 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 745d7cf43..6aa58f0e3 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.span cimport Span from ..tokens.token cimport Token from ..tokens.morphanalysis cimport MorphAnalysis -from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH +from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings @@ -798,7 +798,10 @@ def 
_get_attr_values(spec, string_store): attr = "SENT_START" attr = IDS.get(attr) if isinstance(value, str): - value = string_store.add(value) + if attr == ENT_IOB and value in Token.iob_strings(): + value = Token.iob_strings().index(value) + else: + value = string_store.add(value) elif isinstance(value, bool): value = int(value) elif isinstance(value, int): diff --git a/spacy/schemas.py b/spacy/schemas.py index cf58688ef..1dfd8ee85 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,5 +1,6 @@ from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple from typing import Iterable, TypeVar, TYPE_CHECKING +from .compat import Literal from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator, create_model from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool @@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] UnderscoreValue = Union[ TokenPatternString, TokenPatternNumber, str, int, float, list, bool ] +IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3] class TokenPattern(BaseModel): @@ -222,6 +224,7 @@ class TokenPattern(BaseModel): lemma: Optional[StringValue] = None shape: Optional[StringValue] = None ent_type: Optional[StringValue] = None + ent_iob: Optional[IobValue] = None ent_id: Optional[StringValue] = None ent_kb_id: Optional[StringValue] = None norm: Optional[StringValue] = None diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index c02d65cdf..a27baf130 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab): matcher = Matcher(en_vocab) matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]]) assert len(matcher(doc)) == 0 + + +def test_matcher_ent_iob_key(en_vocab): + """Test that patterns with ent_iob works correctly.""" + matcher = Matcher(en_vocab) + matcher.add("Rule", [[{"ENT_IOB": "I"}]]) + doc1 = Doc(en_vocab, 
words=["I", "visited", "New", "York", "and", "California"]) + doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")] + doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"]) + doc2.ents = [Span(doc2, 4, 5, label="PERSON")] + matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)] + matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)] + assert len(matches1) == 1 + assert matches1[0] == "York" + assert len(matches2) == 0 + + matcher = Matcher(en_vocab) # Test iob pattern with operators + matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]]) + doc = Doc( + en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"] + ) + doc.ents = [Span(doc, 4, 7, label="PERSON")] + matches = [doc[start:end].text for _, start, end in matcher(doc)] + assert len(matches) == 3 + assert matches[0] == "Maria" + assert matches[1] == "Maria Esperanza" + assert matches[2] == "Esperanza" diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 74feb7c5d..8c265785c 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -12,6 +12,7 @@ TEST_PATTERNS = [ ([{"IS_PUNCT": True, "OP": "$"}], 1, 1), ([{"_": "foo"}], 1, 1), ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1), + ([{"ENT_IOB": "foo"}], 1, 1), ([1, 2, 3], 3, 1), # Bad patterns flagged outside of Matcher ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0) diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 803105ba2..3e7f9dc04 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -44,6 +44,7 @@ rule-based matching are: | `SPACY` | Token has a trailing space. ~~bool~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | | `ENT_TYPE` | The token's entity label. 
~~str~~ | +| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ | | `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | | `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ | | `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | From 47a29168013cf077896d784344c00ac230642207 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Thu, 20 Jan 2022 13:19:38 +0100 Subject: [PATCH 020/177] Intify IOB (#9738) * added iob to int * added tests * added iob strings * added error * blacked attrs * Update spacy/tests/lang/test_attrs.py Co-authored-by: Adriane Boyd * Update spacy/attrs.pyx Co-authored-by: Adriane Boyd * added iob strings as global * minor refinement with iob * removed iob strings from token * changed to uppercase * cleaned and went back to master version * imported iob from attrs * Update and format errors * Support and test both str and int ENT_IOB key Co-authored-by: Adriane Boyd --- spacy/attrs.pyx | 88 +++++++++++++++++++++++++--------- spacy/errors.py | 9 ++-- spacy/tests/lang/test_attrs.py | 33 +++++++++++++ spacy/tokens/token.pyx | 3 +- 4 files changed, 107 insertions(+), 26 deletions(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 640fb2f3c..dc8eed7c3 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -1,3 +1,6 @@ +from .errors import Errors + +IOB_STRINGS = ("", "I", "O", "B") IDS = { "": NULL_ATTR, @@ -64,7 +67,6 @@ IDS = { "FLAG61": FLAG61, "FLAG62": FLAG62, "FLAG63": FLAG63, - "ID": ID, "ORTH": ORTH, "LOWER": LOWER, @@ -72,7 +74,6 @@ IDS = { "SHAPE": SHAPE, "PREFIX": PREFIX, "SUFFIX": SUFFIX, - "LENGTH": LENGTH, "LEMMA": LEMMA, "POS": POS, @@ -87,7 +88,7 @@ IDS = { "SPACY": SPACY, "LANG": LANG, "MORPH": MORPH, - "IDX": IDX + "IDX": IDX, } @@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): """ inty_attrs = {} if _do_deprecated: - if 'F' in stringy_attrs: + if "F" in stringy_attrs: 
stringy_attrs["ORTH"] = stringy_attrs.pop("F") - if 'L' in stringy_attrs: + if "L" in stringy_attrs: stringy_attrs["LEMMA"] = stringy_attrs.pop("L") - if 'pos' in stringy_attrs: + if "pos" in stringy_attrs: stringy_attrs["TAG"] = stringy_attrs.pop("pos") - if 'morph' in stringy_attrs: - morphs = stringy_attrs.pop('morph') - if 'number' in stringy_attrs: - stringy_attrs.pop('number') - if 'tenspect' in stringy_attrs: - stringy_attrs.pop('tenspect') + if "morph" in stringy_attrs: + morphs = stringy_attrs.pop("morph") + if "number" in stringy_attrs: + stringy_attrs.pop("number") + if "tenspect" in stringy_attrs: + stringy_attrs.pop("tenspect") morph_keys = [ - 'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number', - 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', - 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', - 'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr', - 'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm', - 'NumValue', 'PartType', 'Polite', 'StyleVariant', - 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', - 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case', - 'Polarity', 'PrepCase', 'Animacy' # U20 + "PunctType", + "PunctSide", + "Other", + "Degree", + "AdvType", + "Number", + "VerbForm", + "PronType", + "Aspect", + "Tense", + "PartType", + "Poss", + "Hyph", + "ConjType", + "NumType", + "Foreign", + "VerbType", + "NounType", + "Gender", + "Mood", + "Negative", + "Tense", + "Voice", + "Abbr", + "Derivation", + "Echo", + "Foreign", + "NameType", + "NounType", + "NumForm", + "NumValue", + "PartType", + "Polite", + "StyleVariant", + "PronType", + "AdjType", + "Person", + "Variant", + "AdpType", + "Reflex", + "Negative", + "Mood", + "Aspect", + "Case", + "Polarity", + "PrepCase", + "Animacy", # U20 ] for key in morph_keys: if key in stringy_attrs: @@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): for name, value in stringy_attrs.items(): int_key = 
intify_attr(name) if int_key is not None: + if int_key == ENT_IOB: + if value in IOB_STRINGS: + value = IOB_STRINGS.index(value) + elif isinstance(value, str): + raise ValueError(Errors.E1025.format(value=value)) if strings_map is not None and isinstance(value, str): - if hasattr(strings_map, 'add'): + if hasattr(strings_map, "add"): value = strings_map.add(value) else: value = strings_map[value] diff --git a/spacy/errors.py b/spacy/errors.py index 673674222..390612123 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -888,11 +888,14 @@ class Errors(metaclass=ErrorsWithCodes): E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. " "Non-UD tags should use the `tag` property.") E1022 = ("Words must be of type str or int, but input is of type '{wtype}'") - E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.") - E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.") + E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't " + "exist.") + E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler " + "patterns.") + E1025 = ("Cannot intify the value '{value}' as an IOB string. 
The only " + "supported values are: 'I', 'O', 'B' and ''") - # Deprecated model shortcuts, only used in errors and warnings OLD_MODEL_SHORTCUTS = { "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm", diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 5350c1fe5..1c27c1744 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -1,4 +1,5 @@ import pytest +from spacy.attrs import intify_attrs, ENT_IOB from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs from spacy.lang.en.stop_words import STOP_WORDS @@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text): assert int_attrs == {ORTH: 10, IS_ALPHA: True} +def test_attrs_ent_iob_intify(): + int_attrs = intify_attrs({"ENT_IOB": ""}) + assert int_attrs == {ENT_IOB: 0} + + int_attrs = intify_attrs({"ENT_IOB": "I"}) + assert int_attrs == {ENT_IOB: 1} + + int_attrs = intify_attrs({"ENT_IOB": "O"}) + assert int_attrs == {ENT_IOB: 2} + + int_attrs = intify_attrs({"ENT_IOB": "B"}) + assert int_attrs == {ENT_IOB: 3} + + int_attrs = intify_attrs({ENT_IOB: ""}) + assert int_attrs == {ENT_IOB: 0} + + int_attrs = intify_attrs({ENT_IOB: "I"}) + assert int_attrs == {ENT_IOB: 1} + + int_attrs = intify_attrs({ENT_IOB: "O"}) + assert int_attrs == {ENT_IOB: 2} + + int_attrs = intify_attrs({ENT_IOB: "B"}) + assert int_attrs == {ENT_IOB: 3} + + with pytest.raises(ValueError): + int_attrs = intify_attrs({"ENT_IOB": "XX"}) + + with pytest.raises(ValueError): + int_attrs = intify_attrs({ENT_IOB: "XX"}) + + @pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)]) def test_lex_attrs_is_punct(text, match): assert is_punct(text) == match diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index c09ec28d6..b515ab67b 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads from .. 
import parts_of_speech from ..errors import Errors, Warnings +from ..attrs import IOB_STRINGS from .underscore import Underscore, get_ext_args @@ -745,7 +746,7 @@ cdef class Token: @classmethod def iob_strings(cls): - return ("", "I", "O", "B") + return IOB_STRINGS @property def ent_iob_(self): From 2abd380f2d17010fe22fe52e8aab529d70cbeec6 Mon Sep 17 00:00:00 2001 From: pepemedigu Date: Thu, 20 Jan 2022 15:44:13 +0100 Subject: [PATCH 021/177] Update lex_attrs.py for Spanish with ordinals (#10038) * Update lex_attrs.py Add ordinal words * black formatting Co-authored-by: Sofie Van Landeghem --- spacy/lang/es/lex_attrs.py | 41 +++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py index 988dbaba1..9d1fa93b8 100644 --- a/spacy/lang/es/lex_attrs.py +++ b/spacy/lang/es/lex_attrs.py @@ -47,6 +47,41 @@ _num_words = [ ] +_ordinal_words = [ + "primero", + "segundo", + "tercero", + "cuarto", + "quinto", + "sexto", + "séptimo", + "octavo", + "noveno", + "décimo", + "undécimo", + "duodécimo", + "decimotercero", + "decimocuarto", + "decimoquinto", + "decimosexto", + "decimoséptimo", + "decimoctavo", + "decimonoveno", + "vigésimo", + "trigésimo", + "cuadragésimo", + "quincuagésimo", + "sexagésimo", + "septuagésimo", + "octogésima", + "nonagésima", + "centésima", + "milésima", + "millonésima", + "billonésima", +] + + def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] @@ -57,7 +92,11 @@ def like_num(text): num, denom = text.split("/") if num.isdigit() and denom.isdigit(): return True - if text.lower() in _num_words: + text_lower = text.lower() + if text_lower in _num_words: + return True + # Check ordinal number + if text_lower in _ordinal_words: return True return False From a69005037a3f48b8ddc964a0914acaf60228617b Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Thu, 20 Jan 2022 17:02:13 -0500 
Subject: [PATCH 022/177] Docker Image for Website Dev (#10098) * add docker instructions * Update website/README.md Co-authored-by: Sofie Van Landeghem * Update website/README.md Co-authored-by: Sofie Van Landeghem * clarifying language on docker image * fix markdown formatting Co-authored-by: Sofie Van Landeghem --- website/Dockerfile | 16 ++++++++++++++++ website/README.md | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 website/Dockerfile diff --git a/website/Dockerfile b/website/Dockerfile new file mode 100644 index 000000000..f71733e55 --- /dev/null +++ b/website/Dockerfile @@ -0,0 +1,16 @@ +FROM node:11.15.0 + +WORKDIR /spacy-io + +RUN npm install -g gatsby-cli@2.7.4 + +COPY package.json . +COPY package-lock.json . + +RUN npm install + +# This is so the installed node_modules will be up one directory +# from where a user mounts files, so that they don't accidentally mount +# their own node_modules from a different build +# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders +WORKDIR /spacy-io/website/ diff --git a/website/README.md b/website/README.md index 076032d92..db050cf03 100644 --- a/website/README.md +++ b/website/README.md @@ -554,6 +554,42 @@ extensions for your code editor. The [`.prettierrc`](https://github.com/explosion/spaCy/tree/master/website/.prettierrc) file in the root defines the settings used in this codebase. +## Building & developing the site with Docker {#docker} +Sometimes it's hard to get a local environment working due to rapid updates to node dependencies, +so it may be easier to use docker for building the docs. + +If you'd like to do this, +**be sure you do *not* include your local `node_modules` folder**, +since there are some dependencies that need to be built for the image system. +Rename it before using. 
+ +```bash +docker run -it \ + -v $(pwd):/spacy-io/website \ + -p 8000:8000 \ + ghcr.io/explosion/spacy-io \ + gatsby develop -H 0.0.0.0 +``` + +This will allow you to access the built website at http://0.0.0.0:8000/ +in your browser, and still edit code in your editor while having the site +reflect those changes. + +**Note**: If you're working on a Mac with an M1 processor, +you might see segfault errors from `qemu` if you use the default image. +To fix this use the `arm64` tagged image in the `docker run` command +(ghcr.io/explosion/spacy-io:arm64). + +### Building the Docker image {#docker-build} + +If you'd like to build the image locally, you can do so like this: + +```bash +docker build -t spacy-io . +``` + +This will take some time, so if you want to use the prebuilt image you'll save a bit of time. + ## Markdown reference {#markdown} All page content and page meta lives in the `.md` files in the `/docs` From 34ed93ef687b040a9dc6d615b8ded7e03d042ea9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 20 Jan 2022 23:21:26 +0100 Subject: [PATCH 023/177] Support version tags in universe and add note about reporting (#10093) * Support version tags in universe and add note about reporting * Apply suggestions from code review Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- website/meta/universe.json | 3 +- website/src/templates/universe.js | 62 +++++++++++++++++++++++-------- 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 0fde2d612..ba770a3fd 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -141,7 +141,8 @@ "website": "https://www.nr.no/~plison" }, "category": ["pipeline", "standalone", "research", "training"], - "tags": [] + "tags": [], + "spacy_version": 3 }, { "id": "numerizer", diff --git a/website/src/templates/universe.js b/website/src/templates/universe.js index cfc8fdd0e..10f2520d9 100644 --- 
a/website/src/templates/universe.js +++ b/website/src/templates/universe.js @@ -8,10 +8,11 @@ import Title from '../components/title' import Grid from '../components/grid' import Button from '../components/button' import Icon from '../components/icon' +import Tag from '../components/tag' import CodeBlock, { InlineCode } from '../components/code' import Aside from '../components/aside' import Sidebar from '../components/sidebar' -import Section from '../components/section' +import Section, { Hr } from '../components/section' import Main from '../components/main' import Footer from '../components/footer' import { H3, H5, Label, InlineList } from '../components/typography' @@ -121,6 +122,18 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp )} +
+

Found a mistake or something isn't working?

+

+ If you've come across a universe project that isn't working or is + incompatible with the reported spaCy version, let us know by{' '} + + opening a discussion thread + + . +

+
+

Submit your project

@@ -168,25 +181,41 @@ UniverseContent.propTypes = { mdxComponents: PropTypes.object, } +const SpaCyVersion = ({ version }) => { + const versions = !Array.isArray(version) ? [version] : version + return versions.map((v, i) => ( + <> + spaCy v{v}{' '} + + )) +} + const Project = ({ data, components }) => ( <> - {data.github && ( + {(data.github || data.spacy_version) && ( <p> - <Link to={`https://github.com/${data.github}`} hidden> - {[ - `release/${data.github}/all.svg?style=flat-square`, - `license/${data.github}.svg?style=flat-square`, - `stars/${data.github}.svg?style=social&label=Stars`, - ].map((url, i) => ( - <img - style={{ borderRadius: '1em', marginRight: '0.5rem' }} - key={i} - src={`https://img.shields.io/github/${url}`} - alt="" - /> - ))} - </Link> + {data.spacy_version && <SpaCyVersion version={data.spacy_version} />} + {data.github && ( + <Link to={`https://github.com/${data.github}`} hidden> + {[ + `release/${data.github}/all.svg?style=flat-square`, + `license/${data.github}.svg?style=flat-square`, + `stars/${data.github}.svg?style=social&label=Stars`, + ].map((url, i) => ( + <img + style={{ + borderRadius: '1em', + marginRight: '0.5rem', + verticalAlign: 'middle', + }} + key={i} + src={`https://img.shields.io/github/${url}`} + alt="" + /> + ))} + </Link> + )} </p> )} @@ -335,6 +364,7 @@ const query = graphql` url github description + spacy_version pip cran category From 6d4db5c3c757581a34b7229fff50fdd6d63ec880 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 21 Jan 2022 10:01:10 +0100 Subject: [PATCH 024/177] Auto-format code with black (#10106) Co-authored-by: explosion-bot --- spacy/cli/debug_data.py | 8 ++------ spacy/tests/pipeline/test_spancat.py | 3 ++- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index b9831fe0c..ab7c20d48 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -699,9 +699,7 
@@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int: return count -def _get_labels_from_model( - nlp: Language, factory_name: str -) -> Set[str]: +def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]: pipe_names = [ pipe_name for pipe_name in nlp.pipe_names @@ -714,9 +712,7 @@ def _get_labels_from_model( return labels -def _get_labels_from_spancat( - nlp: Language -) -> Dict[str, Set[str]]: +def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]: pipe_names = [ pipe_name for pipe_name in nlp.pipe_names diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 39d2e97da..8060bc621 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -79,7 +79,8 @@ def test_explicit_labels(): nlp.initialize() assert spancat.labels == ("PERSON", "LOC") -#TODO figure out why this is flaky + +# TODO figure out why this is flaky @pytest.mark.skip(reason="Test is unreliable for unknown reason") def test_doc_gc(): # If the Doc object is garbage collected, the spans won't be functional afterwards From 09734c56fcbc76881b7d5bba18d1c9792a84abed Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 28 Jan 2022 09:34:23 +0100 Subject: [PATCH 025/177] Use simple suggester for spancat initialization (#10143) Instead of running the actual suggester, which may require annotation from annotating components that is not necessarily present in the reference docs, use the built-in 1-gram suggester. 
--- spacy/pipeline/spancat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 829def1eb..32c1275a6 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -412,7 +412,7 @@ class SpanCategorizer(TrainablePipe): self._require_labels() if subbatch: docs = [eg.x for eg in subbatch] - spans = self.suggester(docs) + spans = build_ngram_suggester(sizes=[1])(docs) Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels)) self.model.initialize(X=(docs, spans), Y=Y) else: From 30cf9d6a053ce6ab8dc2885cf9c8103124f58202 Mon Sep 17 00:00:00 2001 From: Eduard Zorita Date: Fri, 28 Jan 2022 16:59:54 +0100 Subject: [PATCH 026/177] Update typing hints (#10109) * Improve typing hints for Matcher.__call__ * Add typing hints for DependencyMatcher * Add typing hints to underscore extensions * Update Doc.tensor type (requires numpy 1.21) * Fix typing hints for Language.component decorator * Use generic np.ndarray type in Doc to avoid numpy version update * Fix mypy errors * Fix cyclic import caused by Underscore typing hints * Use Literal type from spacy.compat * Update matcher.pyi import format Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- spacy/language.py | 2 +- spacy/matcher/dependencymatcher.pyi | 66 +++++++++++++++++++++++++++++ spacy/matcher/matcher.pyi | 18 ++++++-- spacy/matcher/phrasematcher.pyi | 18 +++++--- spacy/tokens/doc.pyi | 6 +-- spacy/tokens/underscore.py | 42 ++++++++++++------ 6 files changed, 127 insertions(+), 25 deletions(-) create mode 100644 spacy/matcher/dependencymatcher.pyi diff --git a/spacy/language.py b/spacy/language.py index 798254b80..217356b4c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -522,7 +522,7 @@ class Language: requires: Iterable[str] = SimpleFrozenList(), retokenizes: bool = False, func: Optional["Pipe"] = None, - ) -> Callable: + ) -> Callable[..., Any]: """Register a new pipeline component. 
Can be used for stateless function components that don't require a separate factory. Can be used as a decorator on a function or classmethod, or called as a function with the diff --git a/spacy/matcher/dependencymatcher.pyi b/spacy/matcher/dependencymatcher.pyi new file mode 100644 index 000000000..c19d3a71c --- /dev/null +++ b/spacy/matcher/dependencymatcher.pyi @@ -0,0 +1,66 @@ +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from .matcher import Matcher +from ..vocab import Vocab +from ..tokens.doc import Doc +from ..tokens.span import Span + +class DependencyMatcher: + """Match dependency parse tree based on pattern rules.""" + + _patterns: Dict[str, List[Any]] + _raw_patterns: Dict[str, List[Any]] + _tokens_to_key: Dict[str, List[Any]] + _root: Dict[str, List[Any]] + _tree: Dict[str, List[Any]] + _callbacks: Dict[ + Any, Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] + ] + _ops: Dict[str, Any] + vocab: Vocab + _matcher: Matcher + def __init__(self, vocab: Vocab, *, validate: bool = ...) -> None: ... + def __reduce__( + self, + ) -> Tuple[ + Callable[ + [Vocab, Dict[str, Any], Dict[str, Callable[..., Any]]], DependencyMatcher + ], + Tuple[ + Vocab, + Dict[str, List[Any]], + Dict[ + str, + Callable[ + [DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any + ], + ], + ], + None, + None, + ]: ... + def __len__(self) -> int: ... + def __contains__(self, key: Union[str, int]) -> bool: ... + def add( + self, + key: Union[str, int], + patterns: List[List[Dict[str, Any]]], + *, + on_match: Optional[ + Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] + ] = ... + ) -> None: ... + def has_key(self, key: Union[str, int]) -> bool: ... + def get( + self, key: Union[str, int], default: Optional[Any] = ... + ) -> Tuple[ + Optional[ + Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] + ], + List[List[Dict[str, Any]]], + ]: ... 
+ def remove(self, key: Union[str, int]) -> None: ... + def __call__(self, doclike: Union[Doc, Span]) -> List[Tuple[int, List[int]]]: ... + +def unpickle_matcher( + vocab: Vocab, patterns: Dict[str, Any], callbacks: Dict[str, Callable[..., Any]] +) -> DependencyMatcher: ... diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index ec4a88eaf..390629ff8 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -1,4 +1,6 @@ -from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Iterator, Iterable +from typing import Any, List, Dict, Tuple, Optional, Callable, Union +from typing import Iterator, Iterable, overload +from ..compat import Literal from ..vocab import Vocab from ..tokens import Doc, Span @@ -31,12 +33,22 @@ class Matcher: ) -> Union[ Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc] ]: ... + @overload def __call__( self, doclike: Union[Doc, Span], *, - as_spans: bool = ..., + as_spans: Literal[False] = ..., allow_missing: bool = ..., with_alignments: bool = ... - ) -> Union[List[Tuple[int, int, int]], List[Span]]: ... + ) -> List[Tuple[int, int, int]]: ... + @overload + def __call__( + self, + doclike: Union[Doc, Span], + *, + as_spans: Literal[True], + allow_missing: bool = ..., + with_alignments: bool = ... + ) -> List[Span]: ... def _normalize_key(self, key: Any) -> Any: ... diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi index 741bf7bb6..82a194835 100644 --- a/spacy/matcher/phrasematcher.pyi +++ b/spacy/matcher/phrasematcher.pyi @@ -1,6 +1,6 @@ -from typing import List, Tuple, Union, Optional, Callable, Any, Dict - -from . import Matcher +from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload +from ..compat import Literal +from .matcher import Matcher from ..vocab import Vocab from ..tokens import Doc, Span @@ -21,9 +21,17 @@ class PhraseMatcher: ] = ..., ) -> None: ... def remove(self, key: str) -> None: ... 
+ @overload def __call__( self, doclike: Union[Doc, Span], *, - as_spans: bool = ..., - ) -> Union[List[Tuple[int, int, int]], List[Span]]: ... + as_spans: Literal[False] = ..., + ) -> List[Tuple[int, int, int]]: ... + @overload + def __call__( + self, + doclike: Union[Doc, Span], + *, + as_spans: Literal[True], + ) -> List[Span]: ... diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index f540002c9..7e9340d58 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -10,7 +10,7 @@ from ..lexeme import Lexeme from ..vocab import Vocab from .underscore import Underscore from pathlib import Path -import numpy +import numpy as np class DocMethod(Protocol): def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc] @@ -26,7 +26,7 @@ class Doc: user_hooks: Dict[str, Callable[..., Any]] user_token_hooks: Dict[str, Callable[..., Any]] user_span_hooks: Dict[str, Callable[..., Any]] - tensor: numpy.ndarray + tensor: np.ndarray[Any, np.dtype[np.float_]] user_data: Dict[str, Any] has_unknown_spaces: bool _context: Any @@ -144,7 +144,7 @@ class Doc: ) -> Doc: ... def to_array( self, py_attr_ids: Union[int, str, List[Union[int, str]]] - ) -> numpy.ndarray: ... + ) -> np.ndarray[Any, np.dtype[np.float_]]: ... 
@staticmethod def from_docs( docs: List[Doc], diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index 7fa7bf095..e9a4e1862 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -1,17 +1,31 @@ -from typing import Dict, Any +from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING import functools import copy - from ..errors import Errors +if TYPE_CHECKING: + from .doc import Doc + from .span import Span + from .token import Token + class Underscore: mutable_types = (dict, list, set) doc_extensions: Dict[Any, Any] = {} span_extensions: Dict[Any, Any] = {} token_extensions: Dict[Any, Any] = {} + _extensions: Dict[str, Any] + _obj: Union["Doc", "Span", "Token"] + _start: Optional[int] + _end: Optional[int] - def __init__(self, extensions, obj, start=None, end=None): + def __init__( + self, + extensions: Dict[str, Any], + obj: Union["Doc", "Span", "Token"], + start: Optional[int] = None, + end: Optional[int] = None, + ): object.__setattr__(self, "_extensions", extensions) object.__setattr__(self, "_obj", obj) # Assumption is that for doc values, _start and _end will both be None @@ -23,12 +37,12 @@ class Underscore: object.__setattr__(self, "_start", start) object.__setattr__(self, "_end", end) - def __dir__(self): + def __dir__(self) -> List[str]: # Hack to enable autocomplete on custom extensions extensions = list(self._extensions.keys()) return ["set", "get", "has"] + extensions - def __getattr__(self, name): + def __getattr__(self, name: str) -> Any: if name not in self._extensions: raise AttributeError(Errors.E046.format(name=name)) default, method, getter, setter = self._extensions[name] @@ -56,7 +70,7 @@ class Underscore: return new_default return default - def __setattr__(self, name, value): + def __setattr__(self, name: str, value: Any): if name not in self._extensions: raise AttributeError(Errors.E047.format(name=name)) default, method, getter, setter = self._extensions[name] @@ -65,28 +79,30 @@ class 
Underscore: else: self._doc.user_data[self._get_key(name)] = value - def set(self, name, value): + def set(self, name: str, value: Any): return self.__setattr__(name, value) - def get(self, name): + def get(self, name: str) -> Any: return self.__getattr__(name) - def has(self, name): + def has(self, name: str) -> bool: return name in self._extensions - def _get_key(self, name): + def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]: return ("._.", name, self._start, self._end) @classmethod - def get_state(cls): + def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]: return cls.token_extensions, cls.span_extensions, cls.doc_extensions @classmethod - def load_state(cls, state): + def load_state( + cls, state: Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]] + ) -> None: cls.token_extensions, cls.span_extensions, cls.doc_extensions = state -def get_ext_args(**kwargs): +def get_ext_args(**kwargs: Any): """Validate and convert arguments. Reused in Doc, Token and Span.""" default = kwargs.get("default") getter = kwargs.get("getter") From 4f441dfa24a890f4a112ae375ef7daf1a7c21ffd Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 28 Jan 2022 17:00:54 +0100 Subject: [PATCH 027/177] Fix infix as prefix in Tokenizer.explain (#10140) * Fix infix as prefix in Tokenizer.explain Update `Tokenizer.explain` to align with the `Tokenizer` algorithm: * skip infix matches that are prefixes in the current substring * Update tokenizer pseudocode in docs --- spacy/tests/tokenizer/test_tokenizer.py | 18 ++++++++++++++++++ spacy/tokenizer.pyx | 2 ++ website/docs/usage/linguistic-features.md | 2 ++ 3 files changed, 22 insertions(+) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index c2aeffcb5..a7270cb1e 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer from spacy.tokens import Doc from 
spacy.training import Example from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path +from spacy.util import compile_infix_regex from spacy.vocab import Vocab from spacy.symbols import ORTH @@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab): assert tokens == ["a", "10", "."] explain_tokens = [t[1] for t in tokenizer.explain("a10.")] assert tokens == explain_tokens + + +def test_tokenizer_infix_prefix(en_vocab): + # the prefix and suffix matches overlap in the suffix lookbehind + infixes = ["±"] + suffixes = ["%"] + infix_re = compile_infix_regex(infixes) + suffix_re = compile_suffix_regex(suffixes) + tokenizer = Tokenizer( + en_vocab, + infix_finditer=infix_re.finditer, + suffix_search=suffix_re.search, + ) + tokens = [t.text for t in tokenizer("±10%")] + assert tokens == ["±10", "%"] + explain_tokens = [t[1] for t in tokenizer.explain("±10%")] + assert tokens == explain_tokens diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4a148b356..91f228032 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -683,6 +683,8 @@ cdef class Tokenizer: infixes = infix_finditer(substring) offset = 0 for match in infixes: + if offset == 0 and match.start() == 0: + continue if substring[offset : match.start()]: tokens.append(("TOKEN", substring[offset : match.start()])) if substring[match.start() : match.end()]: diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index f748fa8d6..f8baf5588 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -831,6 +831,8 @@ def tokenizer_pseudo_code( infixes = infix_finditer(substring) offset = 0 for match in infixes: + if offset == 0 and match.start() == 0: + continue tokens.append(substring[offset : match.start()]) tokens.append(substring[match.start() : match.end()]) offset = match.end() From 67ecac633fa5a35828291a9916ebb0a02170d700 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Marek=20=C5=A0uppa?= Date: Sun, 30 Jan 2022 08:43:29 +0100 Subject: [PATCH 028/177] fix: Add missing comma to `examples.py` (#10167) * This comma has most probably been left out unintentionally, leading to string concatenation between the two consecutive lines. This issue has been found automatically using a regular expression. --- spacy/lang/xx/examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py index 8d63c3c20..34570d747 100644 --- a/spacy/lang/xx/examples.py +++ b/spacy/lang/xx/examples.py @@ -59,7 +59,7 @@ sentences = [ "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?", "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.", "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.", - "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.." + "Carros autônomos empurram a responsabilidade do seguro para os fabricantes..", "São Francisco considera banir os robôs de entrega que andam pelas calçadas.", "Londres é a maior cidade do Reino Unido.", # Translations from English: From f09c799a967ce52a85d1a64357fd5687426d974c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20=C5=A0uppa?= Date: Sun, 30 Jan 2022 08:45:06 +0100 Subject: [PATCH 029/177] fix: Add missing comma to `_eleven_to_beyond` (#10166) * This comma has most probably been left out unintentionally, leading to string concatenation between the two consecutive lines. This issue has been found automatically using a regular expression. 
--- spacy/lang/hi/lex_attrs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py index a18c2e513..ee845e8b1 100644 --- a/spacy/lang/hi/lex_attrs.py +++ b/spacy/lang/hi/lex_attrs.py @@ -90,7 +90,7 @@ _eleven_to_beyond = [ "अड़सठ", "उनहत्तर", "सत्तर", - "इकहत्तर" + "इकहत्तर", "बहत्तर", "तिहत्तर", "चौहत्तर", From 345e7f6bc4f768d41f5c2aaa6b2b3d4d2dc67c21 Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Mon, 31 Jan 2022 15:41:42 +0800 Subject: [PATCH 030/177] Clarify Span.ents documentation (#10154) * Clarify Span.ents documentation Ref: #10135 Retain current behaviour. Span.ents will only include entities within said span. You can't get tokens outside of the original span. * Reword docstrings Co-authored-by: Adriane Boyd * Update API docs in the website Co-authored-by: Adriane Boyd --- spacy/tokens/span.pyx | 4 ++-- website/docs/api/span.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index f7ddc5136..970c09d60 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -471,8 +471,8 @@ cdef class Span: @property def ents(self): - """The named entities in the span. Returns a tuple of named entity - `Span` objects, if the entity recognizer has been applied. + """The named entities that fall completely within the span. Returns + a tuple of `Span` objects. RETURNS (tuple): Entities in the span, one `Span` per entity. diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 7ecebf93e..ff7905bc0 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -257,8 +257,8 @@ shape `(N, M)`, where `N` is the length of the document. The values will be ## Span.ents {#ents tag="property" new="2.0.13" model="ner"} -The named entities in the span. Returns a tuple of named entity `Span` objects, -if the entity recognizer has been applied. 
+The named entities that fall completely within the span. Returns a tuple of +`Span` objects. > #### Example > From fc3d446c7188e128f851b2a9c8ec446748bdc02f Mon Sep 17 00:00:00 2001 From: Evgen Kytonin Date: Tue, 1 Feb 2022 13:24:00 +0200 Subject: [PATCH 031/177] Update Ukrainian tokenizer_exceptions --- spacy/lang/uk/tokenizer_exceptions.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py index 94016fd52..7e168a27c 100644 --- a/spacy/lang/uk/tokenizer_exceptions.py +++ b/spacy/lang/uk/tokenizer_exceptions.py @@ -6,19 +6,30 @@ from ...util import update_exc _exc = {} for exc_data in [ + {ORTH: "обл.", NORM: "область"}, + {ORTH: "р-н.", NORM: "район"}, + {ORTH: "р-н", NORM: "район"}, + {ORTH: "м.", NORM: "місто"}, {ORTH: "вул.", NORM: "вулиця"}, - {ORTH: "ім.", NORM: "імені"}, {ORTH: "просп.", NORM: "проспект"}, + {ORTH: "пр-кт", NORM: "проспект"}, {ORTH: "бул.", NORM: "бульвар"}, {ORTH: "пров.", NORM: "провулок"}, {ORTH: "пл.", NORM: "площа"}, + {ORTH: "майд.", NORM: "майдан"}, + {ORTH: "мкр.", NORM: "мікрорайон"}, + {ORTH: "ст.", NORM: "станція"}, + {ORTH: "ж/м", NORM: "житловий масив"}, + {ORTH: "наб.", NORM: "набережна"}, + {ORTH: "в/ч", NORM: "військова частина"}, + {ORTH: "в/м", NORM: "військове містечко"}, + {ORTH: "оз.", NORM: "озеро"}, + {ORTH: "ім.", NORM: "імені"}, {ORTH: "г.", NORM: "гора"}, {ORTH: "п.", NORM: "пан"}, - {ORTH: "м.", NORM: "місто"}, {ORTH: "проф.", NORM: "професор"}, {ORTH: "акад.", NORM: "академік"}, {ORTH: "доц.", NORM: "доцент"}, - {ORTH: "оз.", NORM: "озеро"}, ]: _exc[exc_data[ORTH]] = [exc_data] From a2f27ff83aec1aac8c1798efdf75519c9e49cdde Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Thu, 3 Feb 2022 12:30:09 +0100 Subject: [PATCH 032/177] Added spacy-wrap to universe (#10168) * Added spacy-wrap to universe Added spacy-wrap to universe a small package for wrapping fine-tuned huggingface transformers to a spacy 
pipeline following the same API as spacy-transformers. (Currently limited to classification models) * Update website/meta/universe.json * Update website/meta/universe.json * Update website/meta/universe.json * Update website/meta/universe.json Co-authored-by: Adriane Boyd --- website/meta/universe.json | 42 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index ba770a3fd..b1a61598e 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -978,6 +978,48 @@ "category": ["pipeline"], "tags": ["pipeline", "danish"] }, + { + "id": "spacy-wrap", + "title": "spaCy-wrap", + "slogan": "For Wrapping fine-tuned transformers in spaCy pipelines", + "description": "spaCy-wrap is a wrapper library for spaCy for including fine-tuned transformers from Huggingface in your spaCy pipeline allowing inclusion of existing models within existing workflows.", + "github": "kennethenevoldsen/spacy-wrap", + "pip": "spacy_wrap", + "code_example": [ + "import spacy", + "import spacy_wrap", + "", + "nlp = spacy.blank('en')", + "config = {", + " 'doc_extension_trf_data': 'clf_trf_data', # document extention for the forward pass", + " 'doc_extension_prediction': 'sentiment', # document extention for the prediction", + " 'labels': ['negative', 'neutral', 'positive'],", + " 'model': {", + " 'name': 'cardiffnlp/twitter-roberta-base-sentiment', # the model name or path of huggingface model", + "},", + "}", + "", + "transformer = nlp.add_pipe('classification_transformer', config=config)", + "transformer.model.initialize()", + "", + "doc = nlp('spaCy is a wonderful tool')", + "", + "print(doc._.clf_trf_data)", + "# TransformerData(wordpieces=...", + "print(doc._.sentiment)", + "# 'positive'", + "print(doc._.sentiment_prob)", + "# {'prob': array([0.004, 0.028, 0.969], dtype=float32), 'labels': ['negative', 'neutral', 'positive']}" + ], + "thumb": 
"https://raw.githubusercontent.com/KennethEnevoldsen/spacy-wrap/main/docs/_static/icon.png", + "author": "Kenneth Enevoldsen", + "author_links": { + "github": "KennethEnevoldsen", + "website": "https://www.kennethenevoldsen.com" + }, + "category": ["pipeline", "models", "training"], + "tags": ["pipeline", "models", "transformers"] + }, { "id": "textdescriptives", "title": "TextDescriptives", From fef896ce49093357247d223e4f4d65d8811ac380 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 3 Feb 2022 17:01:53 +0100 Subject: [PATCH 033/177] Allow Example to align whitespace annotation (#10189) Remove exception for whitespace tokens in `Example.get_aligned` so that annotation on whitespace tokens is aligned in the same way as for non-whitespace tokens. --- spacy/tests/training/test_new_example.py | 10 ++++++++++ spacy/training/example.pyx | 21 +++++++++------------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py index 4dd90f416..a39d40ded 100644 --- a/spacy/tests/training/test_new_example.py +++ b/spacy/tests/training/test_new_example.py @@ -421,3 +421,13 @@ def test_Example_missing_heads(): # Ensure that the missing head doesn't create an artificial new sentence start expected = [True, False, False, False, False, False] assert example.get_aligned_sent_starts() == expected + + +def test_Example_aligned_whitespace(en_vocab): + words = ["a", " ", "b"] + tags = ["A", "SPACE", "B"] + predicted = Doc(en_vocab, words=words) + reference = Doc(en_vocab, words=words, tags=tags) + + example = Example(predicted, reference) + assert example.get_aligned("TAG", as_string=True) == tags diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 732203e7b..d792c9bbf 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -159,20 +159,17 @@ cdef class Example: gold_values = self.reference.to_array([field]) output = [None] * len(self.predicted) for 
token in self.predicted: - if token.is_space: + values = gold_values[align[token.i].dataXd] + values = values.ravel() + if len(values) == 0: output[token.i] = None + elif len(values) == 1: + output[token.i] = values[0] + elif len(set(list(values))) == 1: + # If all aligned tokens have the same value, use it. + output[token.i] = values[0] else: - values = gold_values[align[token.i].dataXd] - values = values.ravel() - if len(values) == 0: - output[token.i] = None - elif len(values) == 1: - output[token.i] = values[0] - elif len(set(list(values))) == 1: - # If all aligned tokens have the same value, use it. - output[token.i] = values[0] - else: - output[token.i] = None + output[token.i] = None if as_string and field not in ["ENT_IOB", "SENT_START"]: output = [vocab.strings[o] if o is not None else o for o in output] return output From 6f551043e4dd2f248d8c2a62f0489fea437c999d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 4 Feb 2022 21:09:48 +0100 Subject: [PATCH 034/177] Use paths.vectors for vectors in init config (#10146) So that overriding `paths.vectors` works consistently in generated configs, set vectors model in `paths.vectors` and always refer to this path in `initialize.vectors`. --- spacy/cli/templates/quickstart_training.jinja | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index b78806fec..fb79a4f60 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -6,6 +6,11 @@ can help generate the best possible configuration, given a user's requirements. 
[paths] train = null dev = null +{% if use_transformer or optimize == "efficiency" or not word_vectors -%} +vectors = null +{% else -%} +vectors = "{{ word_vectors }}" +{% endif -%} [system] {% if use_transformer -%} @@ -421,8 +426,4 @@ compound = 1.001 {% endif %} [initialize] -{% if use_transformer or optimize == "efficiency" or not word_vectors -%} vectors = ${paths.vectors} -{% else -%} -vectors = "{{ word_vectors }}" -{% endif -%} From 0668a449ba68d7831626d9e2031cb84930db982c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sat, 5 Feb 2022 17:59:24 +0100 Subject: [PATCH 035/177] Add Pipe.hide_labels to omit labels from pipeline meta (#10175) --- spacy/language.py | 5 ++++- spacy/pipeline/pipe.pyi | 2 ++ spacy/pipeline/pipe.pyx | 4 ++++ spacy/pipeline/senter.pyx | 4 ++++ spacy/tests/pipeline/test_senter.py | 4 ++++ 5 files changed, 18 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 217356b4c..fdce34ac4 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -354,12 +354,15 @@ class Language: @property def pipe_labels(self) -> Dict[str, List[str]]: """Get the labels set by the pipeline components, if available (if - the component exposes a labels property). + the component exposes a labels property and the labels are not + hidden). RETURNS (Dict[str, List[str]]): Labels keyed by component name. """ labels = {} for name, pipe in self._components: + if hasattr(pipe, "hide_labels") and pipe.hide_labels is True: + continue if hasattr(pipe, "labels"): labels[name] = list(pipe.labels) return SimpleFrozenDict(labels) diff --git a/spacy/pipeline/pipe.pyi b/spacy/pipeline/pipe.pyi index c7c0568f9..9dd6a9d50 100644 --- a/spacy/pipeline/pipe.pyi +++ b/spacy/pipeline/pipe.pyi @@ -26,6 +26,8 @@ class Pipe: @property def labels(self) -> Tuple[str, ...]: ... @property + def hide_labels(self) -> bool: ... + @property def label_data(self) -> Any: ... def _require_labels(self) -> None: ... 
def set_error_handler( diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 9eddc1e3f..d24e4d574 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -102,6 +102,10 @@ cdef class Pipe: def labels(self) -> Tuple[str, ...]: return tuple() + @property + def hide_labels(self) -> bool: + return False + @property def label_data(self): """Optional JSON-serializable data that would be sufficient to recreate diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 54ce021af..5d2688463 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger): # are 0 return tuple(["I", "S"]) + @property + def hide_labels(self): + return True + @property def label_data(self): return None diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 7a256f79b..047f59bef 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -97,3 +97,7 @@ def test_overfitting_IO(): ] assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + + # test internal pipe labels vs. 
Language.pipe_labels with hidden labels + assert nlp.get_pipe("senter").labels == ("I", "S") + assert "senter" not in nlp.pipe_labels From 91ccacea12a46c62ccb5e7f6de891a37cb71e184 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 6 Feb 2022 16:30:30 +0100 Subject: [PATCH 036/177] Auto-format code with black (#10209) * Auto-format code with black * add black requirement to dev dependencies and pin to 22.x * ignore black dependency for comparison with setup.cfg Co-authored-by: explosion-bot Co-authored-by: svlandeg --- requirements.txt | 1 + spacy/language.py | 2 +- spacy/ml/models/multi_task.py | 2 +- spacy/pipeline/spancat.py | 2 +- spacy/pipeline/textcat.py | 4 ++-- spacy/tests/package/test_requirements.py | 1 + 6 files changed, 7 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8d7372cfe..ca4099be5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,3 +35,4 @@ mypy==0.910 types-dataclasses>=0.1.3; python_version < "3.7" types-mock>=0.1.1 types-requests +black>=22.0,<23.0 diff --git a/spacy/language.py b/spacy/language.py index fdce34ac4..e8fd2720c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -131,7 +131,7 @@ class Language: self, vocab: Union[Vocab, bool] = True, *, - max_length: int = 10 ** 6, + max_length: int = 10**6, meta: Dict[str, Any] = {}, create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, batch_size: int = 1000, diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 9e1face63..a7d67c6dd 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -85,7 +85,7 @@ def get_characters_loss(ops, docs, prediction, nr_char): target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f") target = target.reshape((-1, 256 * nr_char)) diff = prediction - target - loss = (diff ** 2).sum() + loss = (diff**2).sum() d_target = diff / 
float(prediction.shape[0]) return loss, d_target diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 32c1275a6..5d0d8f17e 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -377,7 +377,7 @@ class SpanCategorizer(TrainablePipe): # If the prediction is 0.9 and it's false, the gradient will be # 0.9 (0.9 - 0.0) d_scores = scores - target - loss = float((d_scores ** 2).sum()) + loss = float((d_scores**2).sum()) return loss, d_scores def initialize( diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 30a65ec52..7f5510933 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -281,7 +281,7 @@ class TextCategorizer(TrainablePipe): bp_scores(gradient) if sgd is not None: self.finish_update(sgd) - losses[self.name] += (gradient ** 2).sum() + losses[self.name] += (gradient**2).sum() return losses def _examples_to_truth( @@ -315,7 +315,7 @@ class TextCategorizer(TrainablePipe): not_missing = self.model.ops.asarray(not_missing) # type: ignore d_scores = (scores - truths) / scores.shape[0] d_scores *= not_missing - mean_square_error = (d_scores ** 2).sum(axis=1).mean() + mean_square_error = (d_scores**2).sum(axis=1).mean() return float(mean_square_error), d_scores def add_label(self, label: str) -> int: diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index 75908df59..e20227455 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -12,6 +12,7 @@ def test_build_dependencies(): "flake8", "hypothesis", "pre-commit", + "black", "mypy", "types-dataclasses", "types-mock", From 63e1e4e8f637085b6dfa42d2918cf30e149d7474 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Feb 2022 08:53:30 +0100 Subject: [PATCH 037/177] Fix debug data check for ents that cross sents (#10188) * Fix debug data check for ents that cross sents * Use aligned sent starts to have the same indices for the NER and sent start 
annotation * Add a temporary, insufficient hack for the case where a sentence-initial reference token is split into multiple tokens in the predicted doc, since `Example.get_aligned("SENT_START")` currently aligns `True` to all the split tokens. * Improve test example * Use Example.get_aligned_sent_starts * Add test for crossing entity --- spacy/cli/debug_data.py | 3 ++- spacy/tests/test_cli.py | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index ab7c20d48..4be749204 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -603,6 +603,7 @@ def _compile_gold( if nlp.vocab.strings[word] not in nlp.vocab.vectors: data["words_missing_vectors"].update([word]) if "ner" in factory_names: + sent_starts = eg.get_aligned_sent_starts() for i, label in enumerate(eg.get_aligned_ner()): if label is None: continue @@ -612,7 +613,7 @@ def _compile_gold( if label.startswith(("B-", "U-")): combined_label = label.split("-")[1] data["ner"][combined_label] += 1 - if gold[i].is_sent_start and label.startswith(("I-", "L-")): + if sent_starts[i] == True and label.startswith(("I-", "L-")): data["boundary_cross_ents"] += 1 elif label == "-": data["ner"]["-"] += 1 diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 253469909..9d5bdfab2 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -12,7 +12,7 @@ from spacy.cli._util import is_subpath_of, load_project_config from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands -from spacy.cli.debug_data import _get_labels_from_model +from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config 
@@ -22,6 +22,7 @@ from spacy.lang.en import English from spacy.lang.nl import Dutch from spacy.language import Language from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate +from spacy.tokens import Doc from spacy.training import Example, docs_to_json, offsets_to_biluo_tags from spacy.training.converters import conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs @@ -692,3 +693,18 @@ def test_get_labels_from_model(factory_name, pipe_name): assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels) else: assert _get_labels_from_model(nlp, factory_name) == set(labels) + + +def test_debug_data_compile_gold(): + nlp = English() + pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"]) + ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"]) + eg = Example(pred, ref) + data = _compile_gold([eg], ["ner"], nlp, True) + assert data["boundary_cross_ents"] == 0 + + pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"]) + ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"]) + eg = Example(pred, ref) + data = _compile_gold([eg], ["ner"], nlp, True) + assert data["boundary_cross_ents"] == 1 From 72fece712f2706c3338365fa6eed179b6b7f8848 Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Mon, 7 Feb 2022 21:55:53 +0800 Subject: [PATCH 038/177] Add shuffle parameter to Corpus API docs (#10220) * Add shuffle parameter to Corpus API docs * Update website/docs/api/corpus.md Co-authored-by: Adriane Boyd Co-authored-by: Adriane Boyd --- website/docs/api/corpus.md | 1 + 1 file changed, 1 insertion(+) diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 986c6f458..35afc8fea 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -79,6 +79,7 @@ train/test skew. 
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | | `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ | +| `shuffle` | Whether to shuffle the examples. Defaults to `False`. ~~bool~~ | ## Corpus.\_\_call\_\_ {#call tag="method"} From 42072f4468b353d785214a82b67ace38b728f9b5 Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Mon, 7 Feb 2022 22:03:36 +0800 Subject: [PATCH 039/177] Add spancat pipeline in spacy debug data (#10070) * Setup debug data for spancat * Add check for missing labels * Add low-level data warning error * Improve logic when compiling the gold train data * Implement check for negative examples * Remove breakpoint * Remove ws_ents and missing entity checks * Fix mypy errors * Make variable name spans_key consistent * Rename pipeline -> component for consistency * Account for missing labels per spans_key * Cleanup variable names for consistency * Improve brevity of conditional statements * Remove unused variables * Include spans_key as an argument for _get_examples * Add a conditional check for spans_key * Update spancat debug data based on new API - Instead of using _get_labels_from_model(), I'm now using _get_labels_from_spancat() (cf. https://github.com/explosion/spaCy/pull/10079) - The way information is displayed was also changed (text -> table) * Rename model_labels to ensure mypy works * Update wording on warning messages Use "span type" instead of "entity type" in wording the warning messages. This is because Spans aren't necessarily entities. * Update component type into a Literal This is to make it clear that the component parameter should only accept either 'spancat' or 'ner'.
* Update checks to include actual model span_keys Instead of looking at everything in the data, we only check those span_keys from the actual spancat component. Instead of doing the filter inside the for-loop, I just made another dictionary, data_labels_in_component to hold this value. * Update spacy/cli/debug_data.py * Show label counts only when verbose is True Co-authored-by: Adriane Boyd Co-authored-by: Adriane Boyd --- spacy/cli/debug_data.py | 102 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 95 insertions(+), 7 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 4be749204..a63795148 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -193,6 +193,70 @@ def debug_data( else: msg.info("No word vectors present in the package") + if "spancat" in factory_names: + model_labels_spancat = _get_labels_from_spancat(nlp) + has_low_data_warning = False + has_no_neg_warning = False + + msg.divider("Span Categorization") + msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True) + + msg.text("Label counts in train data: ", show=verbose) + for spans_key, data_labels in gold_train_data["spancat"].items(): + msg.text( + f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}", + show=verbose, + ) + # Data checks: only take the spans keys in the actual spancat components + data_labels_in_component = { + spans_key: gold_train_data["spancat"][spans_key] + for spans_key in model_labels_spancat.keys() + } + for spans_key, data_labels in data_labels_in_component.items(): + for label, count in data_labels.items(): + # Check for missing labels + spans_key_in_model = spans_key in model_labels_spancat.keys() + if (spans_key_in_model) and ( + label not in model_labels_spancat[spans_key] + ): + msg.warn( + f"Label '{label}' is not present in the model labels of key '{spans_key}'. " + "Performance may degrade after training." 
+ ) + # Check for low number of examples per label + if count <= NEW_LABEL_THRESHOLD: + msg.warn( + f"Low number of examples for label '{label}' in key '{spans_key}' ({count})" + ) + has_low_data_warning = True + # Check for negative examples + with msg.loading("Analyzing label distribution..."): + neg_docs = _get_examples_without_label( + train_dataset, label, "spancat", spans_key + ) + if neg_docs == 0: + msg.warn(f"No examples for texts WITHOUT new label '{label}'") + has_no_neg_warning = True + + if has_low_data_warning: + msg.text( + f"To train a new span type, your data should include at " + f"least {NEW_LABEL_THRESHOLD} instances of the new label", + show=verbose, + ) + else: + msg.good("Good amount of examples for all labels") + + if has_no_neg_warning: + msg.text( + "Training data should always include examples of spans " + "in context, as well as examples without a given span " + "type.", + show=verbose, + ) + else: + msg.good("Examples without ocurrences available for all labels") + if "ner" in factory_names: # Get all unique NER labels present in the data labels = set( @@ -238,7 +302,7 @@ def debug_data( has_low_data_warning = True with msg.loading("Analyzing label distribution..."): - neg_docs = _get_examples_without_label(train_dataset, label) + neg_docs = _get_examples_without_label(train_dataset, label, "ner") if neg_docs == 0: msg.warn(f"No examples for texts WITHOUT new label '{label}'") has_no_neg_warning = True @@ -573,6 +637,7 @@ def _compile_gold( "deps": Counter(), "words": Counter(), "roots": Counter(), + "spancat": dict(), "ws_ents": 0, "boundary_cross_ents": 0, "n_words": 0, @@ -617,6 +682,15 @@ def _compile_gold( data["boundary_cross_ents"] += 1 elif label == "-": data["ner"]["-"] += 1 + if "spancat" in factory_names: + for span_key in list(eg.reference.spans.keys()): + if span_key not in data["spancat"]: + data["spancat"][span_key] = Counter() + for i, span in enumerate(eg.reference.spans[span_key]): + if span.label_ is None: + continue 
+ else: + data["spancat"][span_key][span.label_] += 1 if "textcat" in factory_names or "textcat_multilabel" in factory_names: data["cats"].update(gold.cats) if any(val not in (0, 1) for val in gold.cats.values()): @@ -687,14 +761,28 @@ def _format_labels( return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)]) -def _get_examples_without_label(data: Sequence[Example], label: str) -> int: +def _get_examples_without_label( + data: Sequence[Example], + label: str, + component: Literal["ner", "spancat"] = "ner", + spans_key: Optional[str] = "sc", +) -> int: count = 0 for eg in data: - labels = [ - label.split("-")[1] - for label in eg.get_aligned_ner() - if label not in ("O", "-", None) - ] + if component == "ner": + labels = [ + label.split("-")[1] + for label in eg.get_aligned_ner() + if label not in ("O", "-", None) + ] + + if component == "spancat": + labels = ( + [span.label_ for span in eg.reference.spans[spans_key]] + if spans_key in eg.reference.spans + else [] + ) + if label not in labels: count += 1 return count From e4625d2fc3c0580dcdd62d9b817554c31fe8b75e Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Tue, 8 Feb 2022 08:32:11 +0100 Subject: [PATCH 040/177] Added Augmenty to universe (#10229) * Added Augmenty to universe * Update website/meta/universe.json Co-authored-by: Adriane Boyd * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem Co-authored-by: Adriane Boyd Co-authored-by: Sofie Van Landeghem --- website/meta/universe.json | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index b1a61598e..1a67de67b 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -953,6 +953,37 @@ "category": ["pipeline"], "tags": ["lemmatizer", "danish"] }, + { + "id": "augmenty", + "title": "Augmenty", + "slogan": "The cherry on top of your NLP pipeline", + "description": "Augmenty is an augmentation library based on spaCy for 
augmenting texts. Augmenty differs from other augmentation libraries in that it corrects (as far as possible) the token, sentence and document labels under the augmentation.", + "github": "kennethenevoldsen/augmenty", + "pip": "augmenty", + "code_example": [ + "import spacy", + "import augmenty", + "", + "nlp = spacy.load('en_core_web_md')", + "", + "docs = nlp.pipe(['Augmenty is a great tool for text augmentation'])", + "", + "ent_dict = {'ORG': [['spaCy'], ['spaCy', 'Universe']]}", + "entity_augmenter = augmenty.load('ents_replace.v1',", + " ent_dict = ent_dict, level=1)", + "", + "for doc in augmenty.docs(docs, augmenter=entity_augmenter, nlp=nlp):", + " print(doc)" + ], + "thumb": "https://github.com/KennethEnevoldsen/augmenty/blob/master/img/icon.png?raw=true", + "author": "Kenneth Enevoldsen", + "author_links": { + "github": "kennethenevoldsen", + "website": "https://www.kennethenevoldsen.com" + }, + "category": ["training", "research"], + "tags": ["training", "research", "augmentation"] + }, { "id": "dacy", "title": "DaCy", From 836f689cc7b2729d071174629a58fc09f3e12cec Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Tue, 8 Feb 2022 02:35:09 -0500 Subject: [PATCH 041/177] YAML multiline tip for project.yml files (#10187) * MultiHashEmbed vector docs correction * add in multi-line tip * convert to sidebar tip --- website/docs/usage/projects.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index e0e787a1d..57d226913 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -213,6 +213,12 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up a quick web demo. It looks pretty similar to a config file used to define CI pipelines. 
+> #### Tip: Multi-line YAML syntax for long values +> +> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be +> helpful for readability with longer values such as project descriptions or +> commands that take several arguments. + ```yaml %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml ``` From deb143fa709461ea6b8fddd17006908f7bea7f55 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 8 Feb 2022 08:35:37 +0100 Subject: [PATCH 042/177] Token sent attributes more consistent (#10164) * remove duplicate line * add sent start/end token attributes to the docs * let has_annotation work with IS_SENT_END * elif instead of if * add has_annotation test for sent attributes * fix typo * remove duplicate is_sent_start entry in docs --- spacy/glossary.py | 1 - spacy/tests/doc/test_doc_api.py | 22 ++++++++++++++++++++++ spacy/tokens/doc.pyx | 2 ++ spacy/tokens/token.pyx | 2 -- website/docs/api/doc.md | 2 +- website/docs/api/token.md | 19 ++----------------- 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/spacy/glossary.py b/spacy/glossary.py index e45704fc5..57254330f 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -310,7 +310,6 @@ GLOSSARY = { "re": "repeated element", "rs": "reported speech", "sb": "subject", - "sb": "subject", "sbp": "passivized subject (PP)", "sp": "subject or predicate", "svp": "separable verb prefix", diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 10700b787..858c7cbb6 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -684,6 +684,7 @@ def test_has_annotation(en_vocab): attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE") for attr in attrs: assert not doc.has_annotation(attr) + assert not doc.has_annotation(attr, require_complete=True) doc[0].tag_ = "A" doc[0].pos_ = "X" @@ -709,6 +710,27 @@ def test_has_annotation(en_vocab): assert doc.has_annotation(attr, require_complete=True) +def 
test_has_annotation_sents(en_vocab): + doc = Doc(en_vocab, words=["Hello", "beautiful", "world"]) + attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END") + for attr in attrs: + assert not doc.has_annotation(attr) + assert not doc.has_annotation(attr, require_complete=True) + + # The first token (index 0) is always assumed to be a sentence start, + # and ignored by the check in doc.has_annotation + + doc[1].is_sent_start = False + for attr in attrs: + assert doc.has_annotation(attr) + assert not doc.has_annotation(attr, require_complete=True) + + doc[2].is_sent_start = False + for attr in attrs: + assert doc.has_annotation(attr) + assert doc.has_annotation(attr, require_complete=True) + + def test_is_flags_deprecated(en_tokenizer): doc = en_tokenizer("test") with pytest.deprecated_call(): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5a0db115d..d33764ac9 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -420,6 +420,8 @@ cdef class Doc: cdef int range_start = 0 if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]: attr = SENT_START + elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]: + attr = SENT_START attr = intify_attr(attr) # adjust attributes if attr == HEAD: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index b515ab67b..d14930348 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -487,8 +487,6 @@ cdef class Token: RETURNS (bool / None): Whether the token starts a sentence. None if unknown. - - DOCS: https://spacy.io/api/token#is_sent_start """ def __get__(self): if self.c.sent_start == 0: diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 9836b8c21..c21328caf 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -304,7 +304,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. ## Doc.has_annotation {#has_annotation tag="method"} -Check whether the doc contains annotation on a token attribute. 
+Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes). diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 44a2ea9e8..3c3d12d54 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -349,23 +349,6 @@ A sequence containing the token and all the token's syntactic descendants. | ---------- | ------------------------------------------------------------------------------------ | | **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ | -## Token.is_sent_start {#is_sent_start tag="property" new="2"} - -A boolean value indicating whether the token starts a sentence. `None` if -unknown. Defaults to `True` for the first token in the `Doc`. - -> #### Example -> -> ```python -> doc = nlp("Give it back! He pleaded.") -> assert doc[4].is_sent_start -> assert not doc[5].is_sent_start -> ``` - -| Name | Description | -| ----------- | ------------------------------------------------------- | -| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ | - ## Token.has_vector {#has_vector tag="property" model="vectors"} A boolean value indicating whether a word vector is associated with the token. @@ -465,6 +448,8 @@ The L2 norm of the token's vector representation. | `is_punct` | Is the token punctuation? ~~bool~~ | | `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ | | `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ | +| `is_sent_start` | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`. | +| `is_sent_end` | Does the token end a sentence? ~~bool~~ or `None` if unknown. | | `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ | | `is_bracket` | Is the token a bracket? ~~bool~~ | | `is_quote` | Is the token a quotation mark? 
~~bool~~ | From e9c26f2ee9f03c2aa6b7cd724f4c0b3717507211 Mon Sep 17 00:00:00 2001 From: Antti Ajanki Date: Tue, 8 Feb 2022 09:44:11 +0200 Subject: [PATCH 043/177] Add a noun chunker for Finnish (#10214) with test cases --- spacy/lang/fi/__init__.py | 2 + spacy/lang/fi/syntax_iterators.py | 79 +++++++++++ spacy/tests/lang/fi/test_noun_chunks.py | 174 ++++++++++++++++++++++++ 3 files changed, 255 insertions(+) create mode 100644 spacy/lang/fi/syntax_iterators.py create mode 100644 spacy/tests/lang/fi/test_noun_chunks.py diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 86a834170..c3a0cf451 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -2,6 +2,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language, BaseDefaults @@ -11,6 +12,7 @@ class FinnishDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS class Finnish(Language): diff --git a/spacy/lang/fi/syntax_iterators.py b/spacy/lang/fi/syntax_iterators.py new file mode 100644 index 000000000..6b481e51f --- /dev/null +++ b/spacy/lang/fi/syntax_iterators.py @@ -0,0 +1,79 @@ +from typing import Iterator, Tuple, Union +from ...tokens import Doc, Span +from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """Detect base noun phrases from a dependency parse. 
Works on both Doc and Span.""" + labels = [ + "appos", + "nsubj", + "nsubj:cop", + "obj", + "obl", + "ROOT", + ] + extend_labels = [ + "amod", + "compound", + "compound:nn", + "flat:name", + "nmod", + "nmod:gobj", + "nmod:gsubj", + "nmod:poss", + "nummod", + ] + + def potential_np_head(word): + return word.pos in (NOUN, PROPN) and ( + word.dep in np_deps or word.head.pos == PRON + ) + + doc = doclike.doc # Ensure works on both Doc and Span. + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + + np_deps = [doc.vocab.strings[label] for label in labels] + extend_deps = [doc.vocab.strings[label] for label in extend_labels] + np_label = doc.vocab.strings.add("NP") + conj_label = doc.vocab.strings.add("conj") + + rbracket = 0 + prev_end = -1 + for i, word in enumerate(doclike): + if i < rbracket: + continue + + # Is this a potential independent NP head or coordinated with + # a NOUN that is itself an independent NP head? + # + # e.g. "Terveyden ja hyvinvoinnin laitos" + if potential_np_head(word) or ( + word.dep == conj_label and potential_np_head(word.head) + ): + # Try to extend to the left to include adjective/num + # modifiers, compound words etc. 
+ lbracket = word.i + for ldep in word.lefts: + if ldep.dep in extend_deps: + lbracket = ldep.left_edge.i + break + + # Prevent nested chunks from being produced + if lbracket <= prev_end: + continue + + rbracket = word.i + # Try to extend the span to the right to capture + # appositions and noun modifiers + for rdep in word.rights: + if rdep.dep in extend_deps: + rbracket = rdep.i + prev_end = rbracket + + yield lbracket, rbracket + 1, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/tests/lang/fi/test_noun_chunks.py b/spacy/tests/lang/fi/test_noun_chunks.py new file mode 100644 index 000000000..cc3b5aa36 --- /dev/null +++ b/spacy/tests/lang/fi/test_noun_chunks.py @@ -0,0 +1,174 @@ +import pytest +from spacy.tokens import Doc + + +FI_NP_TEST_EXAMPLES = [ + ( + "Kaksi tyttöä potkii punaista palloa", + ["NUM", "NOUN", "VERB", "ADJ", "NOUN"], + ["nummod", "nsubj", "ROOT", "amod", "obj"], + [1, 1, 0, 1, -2], + ["Kaksi tyttöä", "punaista palloa"], + ), + ( + "Erittäin vaarallinen leijona karkasi kiertävän sirkuksen eläintenkesyttäjältä", + ["ADV", "ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"], + ["advmod", "amod", "nsubj", "ROOT", "amod", "nmod:poss", "obl"], + [1, 1, 1, 0, 1, 1, -3], + ["Erittäin vaarallinen leijona", "kiertävän sirkuksen eläintenkesyttäjältä"], + ), + ( + "Leijona raidallisine tassuineen piileksii Porin kaupungin lähellä", + ["NOUN", "ADJ", "NOUN", "VERB", "PROPN", "NOUN", "ADP"], + ["nsubj", "amod", "nmod", "ROOT", "nmod:poss", "obl", "case"], + [3, 1, -2, 0, 1, -2, -1], + ["Leijona raidallisine tassuineen", "Porin kaupungin"], + ), + ( + "Lounaalla nautittiin salaattia, maukasta kanaa ja raikasta vettä", + ["NOUN", "VERB", "NOUN", "PUNCT", "ADJ", "NOUN", "CCONJ", "ADJ", "NOUN"], + ["obl", "ROOT", "obj", "punct", "amod", "conj", "cc", "amod", "conj"], + [1, 0, -1, 2, 1, -3, 2, 1, -6], + ["Lounaalla", "salaattia", "maukasta kanaa", "raikasta vettä"], + ), + ( + "Minua houkuttaa maalle muuttaminen talven jälkeen", + 
["PRON", "VERB", "NOUN", "NOUN", "NOUN", "ADP"], + ["obj", "ROOT", "nmod", "nsubj", "obl", "case"], + [1, 0, 1, -2, -3, -1], + ["maalle muuttaminen", "talven"], + ), + ( + "Päivän kohokohta oli vierailu museossa kummilasten kanssa", + ["NOUN", "NOUN", "AUX", "NOUN", "NOUN", "NOUN", "ADP"], + ["nmod:poss", "nsubj:cop", "cop", "ROOT", "nmod", "obl", "case"], + [1, 2, 1, 0, -1, -2, -1], + ["Päivän kohokohta", "vierailu museossa", "kummilasten"], + ), + ( + "Yrittäjät maksoivat tuomioistuimen määräämät korvaukset", + ["NOUN", "VERB", "NOUN", "VERB", "NOUN"], + ["nsubj", "ROOT", "nsubj", "acl", "obj"], + [1, 0, 1, 1, -3], + ["Yrittäjät", "tuomioistuimen", "korvaukset"], + ), + ( + "Julkisoikeudelliset tai niihin rinnastettavat saatavat ovat suoraan ulosottokelpoisia", + ["ADJ", "CCONJ", "PRON", "VERB", "NOUN", "AUX", "ADV", "NOUN"], + ["amod", "cc", "obl", "acl", "nsubj:cop", "cop", "advmod", "ROOT"], + [4, 3, 1, 1, 3, 2, 1, 0], + ["Julkisoikeudelliset tai niihin rinnastettavat saatavat", "ulosottokelpoisia"], + ), + ( + "Se oli ala-arvoista käytöstä kaikilta oppilailta, myös valvojaoppilailta", + ["PRON", "AUX", "ADJ", "NOUN", "PRON", "NOUN", "PUNCT", "ADV", "NOUN"], + ["nsubj:cop", "cop", "amod", "ROOT", "det", "nmod", "punct", "advmod", "appos"], + [3, 2, 1, 0, 1, -2, 2, 1, -3], + ["ala-arvoista käytöstä kaikilta oppilailta", "valvojaoppilailta"], + ), + ( + "Isä souti veneellä, jonka hän oli vuokrannut", + ["NOUN", "VERB", "NOUN", "PUNCT", "PRON", "PRON", "AUX", "VERB"], + ["nsubj", "ROOT", "obl", "punct", "obj", "nsubj", "aux", "acl:relcl"], + [1, 0, -1, 4, 3, 2, 1, -5], + ["Isä", "veneellä"], + ), + ( + "Kirja, jonka poimin hyllystä, kertoo norsuista", + ["NOUN", "PUNCT", "PRON", "VERB", "NOUN", "PUNCT", "VERB", "NOUN"], + ["nsubj", "punct", "obj", "acl:relcl", "obl", "punct", "ROOT", "obl"], + [6, 2, 1, -3, -1, 1, 0, -1], + ["Kirja", "hyllystä", "norsuista"], + ), + ( + "Huomenna on päivä, jota olemme odottaneet", + ["NOUN", "AUX", "NOUN", "PUNCT", "PRON", "AUX", 
"VERB"], + ["ROOT", "cop", "nsubj:cop", "punct", "obj", "aux", "acl:relcl"], + [0, -1, -2, 3, 2, 1, -4], + ["Huomenna", "päivä"], + ), + ( + "Liikkuvuuden lisääminen on yksi korkeakoulutuksen keskeisistä kehittämiskohteista", + ["NOUN", "NOUN", "AUX", "PRON", "NOUN", "ADJ", "NOUN"], + ["nmod:gobj", "nsubj:cop", "cop", "ROOT", "nmod:poss", "amod", "nmod"], + [1, 2, 1, 0, 2, 1, -3], + [ + "Liikkuvuuden lisääminen", + "korkeakoulutuksen keskeisistä kehittämiskohteista", + ], + ), + ( + "Kaupalliset palvelut jätetään yksityisten palveluntarjoajien tarjottavaksi", + ["ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"], + ["amod", "obj", "ROOT", "amod", "nmod:gsubj", "obl"], + [1, 1, 0, 1, 1, -3], + ["Kaupalliset palvelut", "yksityisten palveluntarjoajien tarjottavaksi"], + ), + ( + "New York tunnetaan kaupunkina, joka ei koskaan nuku", + ["PROPN", "PROPN", "VERB", "NOUN", "PUNCT", "PRON", "AUX", "ADV", "VERB"], + ["obj", "flat:name", "ROOT", "obl", "punct", "nsubj", "aux", "advmod", "acl:relcl"], + [2, -1, 0, -1, 4, 3, 2, 1, -5], + ["New York", "kaupunkina"], + ), + ( + "Loput vihjeet saat herra Möttöseltä", + ["NOUN", "NOUN", "VERB", "NOUN", "PROPN"], + ["compound:nn", "obj", "ROOT", "compound:nn", "obj"], + [1, 1, 0, 1, -2], + ["Loput vihjeet", "herra Möttöseltä"], + ), + ( + "mahdollisuus tukea muita päivystysyksiköitä", + ["NOUN", "VERB", "PRON", "NOUN"], + ["ROOT", "acl", "det", "obj"], + [0, -1, 1, -2], + ["mahdollisuus", "päivystysyksiköitä"], + ), + ( + "sairaanhoitopiirit harjoittavat leikkaustoimintaa alueellaan useammassa sairaalassa", + ["NOUN", "VERB", "NOUN", "NOUN", "ADJ", "NOUN"], + ["nsubj", "ROOT", "obj", "obl", "amod", "obl"], + [1, 0, -1, -1, 1, -3], + ["sairaanhoitopiirit", "leikkaustoimintaa", "alueellaan", "useammassa sairaalassa"], + ), + ( + "Lain mukaan varhaiskasvatus on suunnitelmallista toimintaa", + ["NOUN", "ADP", "NOUN", "AUX", "ADJ", "NOUN"], + ["obl", "case", "nsubj:cop", "cop", "amod", "ROOT"], + [5, -1, 3, 2, 1, 0], + ["Lain", 
"varhaiskasvatus", "suunnitelmallista toimintaa"], + ), +] + + +def test_noun_chunks_is_parsed(fi_tokenizer): + """Test that noun_chunks raises Value Error for 'fi' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. + """ + doc = fi_tokenizer("Tämä on testi") + with pytest.raises(ValueError): + list(doc.noun_chunks) + + +@pytest.mark.parametrize( + "text,pos,deps,heads,expected_noun_chunks", FI_NP_TEST_EXAMPLES +) +def test_fi_noun_chunks(fi_tokenizer, text, pos, deps, heads, expected_noun_chunks): + tokens = fi_tokenizer(text) + + assert len(heads) == len(pos) + doc = Doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=[head + i for i, head in enumerate(heads)], + deps=deps, + pos=pos, + ) + + noun_chunks = list(doc.noun_chunks) + assert len(noun_chunks) == len(expected_noun_chunks) + for i, np in enumerate(noun_chunks): + assert np.text == expected_noun_chunks[i] From f939da0bfa7f53fdb8ad1a200a8702e184443694 Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Tue, 8 Feb 2022 11:05:35 +0200 Subject: [PATCH 044/177] Add github actions for slow and gpu tests (#10225) * Add github actions for slow and gpu tests * change weekly GPU tests to also run slow tests, and change the time * only run the tests if there were commits in the past day --- .github/workflows/gputests.yml | 19 +++++++++++++++++++ .github/workflows/slowtests.yml | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 .github/workflows/gputests.yml create mode 100644 .github/workflows/slowtests.yml diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml new file mode 100644 index 000000000..7c062fe4c --- /dev/null +++ b/.github/workflows/gputests.yml @@ -0,0 +1,19 @@ +on: + schedule: + - cron: '0 1 * * MON' + +jobs: + weekly-gputests: + strategy: + matrix: + branch: 
[master, develop, v4] + runs-on: ubuntu-latest + steps: + - name: Trigger buildkite build + uses: buildkite/trigger-pipeline-action@v1.2.0 + env: + PIPELINE: explosion-ai/spacy-slow-gpu-tests + BRANCH: ${{ matrix.branch }} + MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action" + secrets: + BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }} diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml new file mode 100644 index 000000000..4d4441679 --- /dev/null +++ b/.github/workflows/slowtests.yml @@ -0,0 +1,32 @@ +on: + schedule: + - cron: '0 0 * * *' + +jobs: + daily-slowtests: + strategy: + matrix: + branch: [master, develop, v4] + runs-on: ubuntu-latest + steps: + - name: Get commits from past 24 hours + id: check_commits + run: | + today=$(date '+%Y-%m-%d %H:%M:%S') + yesterday=$(date -v-1d '+%Y-%m-%d %H:%M:%S') + if git log --after=$yesterday --before=$today | grep commit ; then + echo "::set-output name=run_tests::true" + else + echo "::set-output name=run_tests::false" + fi + + - name: Trigger buildkite build + needs: check_commits + if: needs.check_commits.outputs.run_tests == 'true' + uses: buildkite/trigger-pipeline-action@v1.2.0 + env: + PIPELINE: explosion-ai/spacy-slow-tests + BRANCH: ${{ matrix.branch }} + MESSAGE: ":github: Daily slow tests - triggered from a GitHub Action" + secrets: + BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }} From a9ee5bff98b40126dce1625a5b48f86e4ffabc77 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 8 Feb 2022 10:52:46 +0100 Subject: [PATCH 045/177] Support mixed case model package names (#10223) --- spacy/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 14714143c..2a8b9f5cc 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -871,7 +871,6 @@ def get_package_path(name: str) -> Path: name (str): Package name. RETURNS (Path): Path to installed package. 
""" - name = name.lower() # use lowercase version to be safe # Here we're importing the module just to find it. This is worryingly # indirect, but it's otherwise very difficult to find the package. pkg = importlib.import_module(name) From f2c2b97e56f4f6d73e7cede8a98f7ba9668e83b0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 8 Feb 2022 11:46:42 +0100 Subject: [PATCH 046/177] Add spaCy Tailored Pipelines --- README.md | 31 ++++---- .../images/spacy-tailored-pipelines_wide.png | Bin 0 -> 44724 bytes website/meta/sidebars.json | 6 +- website/meta/site.json | 6 +- website/src/components/list.js | 9 ++- website/src/styles/list.module.sass | 10 +++ website/src/widgets/landing.js | 71 +++++++++++++----- 7 files changed, 94 insertions(+), 39 deletions(-) create mode 100644 website/docs/images/spacy-tailored-pipelines_wide.png diff --git a/README.md b/README.md index 57d76fb45..05c912ffa 100644 --- a/README.md +++ b/README.md @@ -32,19 +32,20 @@ open-source software, released under the MIT license. ## 📖 Documentation -| Documentation | | -| -------------------------- | -------------------------------------------------------------- | -| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! | -| 📚 **[Usage Guides]** | How to use spaCy and its features. | -| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. | -| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. | -| 🎛 **[API Reference]** | The detailed reference for spaCy's API. | -| 📦 **[Models]** | Download trained pipelines for spaCy. | -| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. | -| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. | -| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. | -| 🛠 **[Changelog]** | Changes and version history. | -| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. 
| +| Documentation | | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! | +| 📚 **[Usage Guides]** | How to use spaCy and its features. | +| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. | +| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. | +| 🎛 **[API Reference]** | The detailed reference for spaCy's API. | +| 📦 **[Models]** | Download trained pipelines for spaCy. | +| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. | +| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. | +| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. | +| 🛠 **[Changelog]** | Changes and version history. | +| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. | +| spaCy Tailored Pipelines | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** | [spacy 101]: https://spacy.io/usage/spacy-101 [new in v3.0]: https://spacy.io/usage/v3 @@ -60,9 +61,7 @@ open-source software, released under the MIT license. 
## 💬 Where to ask questions -The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**, -**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**, -**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**. +The spaCy project is maintained by the [spaCy team](https://explosion.ai/about). Please understand that we won't be able to provide individual support via email. We also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. diff --git a/website/docs/images/spacy-tailored-pipelines_wide.png b/website/docs/images/spacy-tailored-pipelines_wide.png new file mode 100644 index 0000000000000000000000000000000000000000..d1a762ebe06a330ba00c5593803221db300c8bcd GIT binary patch literal 44724 zcmeFYbyQs2voP2Y+%3V~gS$Jyr6CX?I2|lFjZ1*wA;BTRB|thrkYK?H?%KFRu*Rh! zSeSwVw6HGj@}XTj?3b87GM+O>Dpi3VyZ<6==@0RRA8RTV{D001ok06-GM zKttT|(Z?`Ad|OEDZawldJ!zmj4w5Ypt;bXBmnvsdwRw|?%Yr3dnJ0ExX~ zkdda7^bto);AHJ#N$2C_=nNM3kz#n5R~+&E{x%N--2;e+gA{|@JwiG|O(30ui@PGPhY#^5#Kk8fE+{H4z)$y|ivfYn{gsWluAF#H=v~uzEkYYek`WFjMuK!Hy4E~Rp5CY@zv2^9(hN+=pQ&& zPj|-$N*rNVf`Y<=yj-tD`E0lZ1+A>OtVCW3atVnDgM>jsLLx!}mVb{| zbOCwZ8^Qhfzx?5q3kWgC|4p8VjgYV)zko28l@*AWOH@!$kV}+TOpr^2*AgVkD=G@I z78d+_Hf?u%giBgF{(IJYs$L;xwC3aI6%@4*<9a2)XUQeVXJf-9Dk>_>CB$oEWogYT z%qRE?bZ__%wj{1%4@PL&_dl-m+}iCw-yH4f9?VDF5_IntQVgJbd$4}R@SofE|AjpM zTbTbd-rLR^G3o!oO8&wQcCqpBwsg0avqi}6zhQ?w|2y)*mR|p7;{SWf{EMo86aQb@ z?f*C8|05cZou#v_H6qUPFx<1lb03lZQVh@kQ)ds){uRCc21kU%`>+4RV#JSsqPev* zVrq9ptX^pj5Ci}qAXP;I2TA)5(R=-Udk3Rv zlT`W1Ti;{Q7orgU>(W^;n@1w}_l*nQXvKeDiEILjmtGNUQalW=fQwKb2@}N zA}X_9C^dutx=EQ;Cqn5bl7~kw@K2rQH(mL8+e~5ad&pyK$-lgpLf;@QZ`Lcqin3U@ zHm&qiMCG|HS(a9~sV~@#Iy`l4HXbRTDmUb4)~*_9#cFoRPF-R-6$b<-e_hvJ7E%*1a}A&L_ltYvMT)a}wx#t#n@X(<75<3A{mNX3EIPbTv5o@mo-iL+mM4T~0F_ z?JHMK<|XW}TaMJmH`nwV5gS{{vgbm)-9@IXK6HkMOEQ1(zrmOp81nZ$M!f)viq-H; ziyZ~#s4GD1H0P<{R;4IsR|{$lorxl?f-H;@8V3tf*X 
zW266uv0Hx#l)Eq_GL#tX6-YDieJo%B_lYcXO3}|XIoxpNe+-82J_3(D^@}1>_C9^F z2Vn$UyhR1j$T26gaCIsC4F(nScS4s{RHW<*QL7#V1iT;8pxapp_mc)gwzjItu>W>N z9o7fL!X^;QW$6%D^F$Z@ZZW|*@CpTXi8qfEfgtep#3SCf4Z(_NcX6TD4-~Vt-ADNi za(TG;6=v|n3r71TcD=&_@=qEJ3=DHDfdFX~X;}};LBimRTq5-cl1MCw5JNpoAlag{ zk3^k?63?ka`;2qw6zC6P`344P00Z>7nX{0O@)BLvg&)x~Fu;l(6N^k4+$!J8GPFZ;8uwy8 zIwOsKfu5I@XW-sTBI;#VHbRw+`#5c^7RdJc$^dC(e`bUzosdPEDhD6$NYD(ujw|Sp z4!V3XPk+CT>O|giQ=Q=ljE;+&!v#5nH77>wN60BELb+Psz`f?_JL`S%(V`1cy#%Ar zW(Vp%$=;KhkAr8ctXS<&1$Yjr4}^##5Jk+j+CInuN^ibYZ}cNJp8vPZZ%U&0Hg)YB z1M+u_I-1taG5s4B4=4;EvzcaPpU0w@*#Nw#>zAK0Ae_kh0ol4e=5b3G$6 zUcR&$(2g44$55!tYIZyF?w(IO@B`Trh;i1jFJnQZnZc(_2zET9AMK09tK97_LMc{GoNy*X=pf^;Z*`8bQesCURVqHwm&s-Eop7IR z=A#EA!g@e)%K}s=)j5sLrTCob)j@}tB|sbRVtRI=#&F9-p82DfH9|2A3{UU1Jm!@F z_%ZE8AQ2mVZ#nNIr54PEjMl>1-9UP!~83;Nx(h7XGKc^KwC|X!klT* zvuT8&s<@*nbi8A(9#B0?o)*w%&Xo^e@V$~}j=$$VN<2!H2@w<4k$?Z#r&HNQ2;e>m z`j+>Kp^x1}VTGv8vBs{y@pcc@S;jv^Lfc$W|K4Z_$J8Fg#*G(jKfniU@duzep(8A# zdMA6HHF4ocX}C$Hg+z97;2WLqgAY}+i?Sl5aK0@EcNcpt@_w85eT5704 zO3{G$BD4!Lb)CYhWB}#qxo?RM!UUauY@z#$HjNjAtrbXWF6CvB_LYd&R?m-=!OGs= z6Ay8}=iDidFEvTYdyZMpg`LN6lesplW1)8av%z%(2}NmvNXcut)H>KcL3^U`lVf;XuAi z)JX&fp}L`SEA5q)xf#knKGxV|x}U%FX~=>zM$BbNJd*GA#{lCWzBk;>pFUY16^nTH0m1l`YYnRge*KQbD_9V86nke1JRI~edOqY~ zv=`b+wOwIEmIC|_dEVkLopk)k_%mre{I6wXHPirv`ch#XK6xLa?zRk!c7{kb=KR-V;TH`9O2AMzK{ zhzF!XO5PEb#sDAmMfKNdzX4PL(*1}^K}QJyLTZ{{$j66Gm-u77BGvUq!B^ce919l6 z56)CeaKFZ6^dY&H@6#cZ0zU|vHCiSMN4~q(X|x0Y6^PE~@eta!n@}Y*hF;{Hx*su5~k zG`rcUmmAi9+l#fS&~Uuwx5#jBNm1f44?L{r--e-ATw)gdK+g z9upBI5WbA`VBb$j7q`QIF-SNL7G*F>BY8{AmE}Z5!Q9X$*C#_M{G3K#GG$mIN5vsg zJsz|TgN7+y0NxigjG*QhBn*~f0YuF`i3&P6SRNeSPUInW87VoT16Bg{c!Ok>FM%WF zTv{=n#DqF1VH^ zGBKT%#J{IA|J(hlI=1OL`i7@KdFGE&IbZN;D$L8c_+ymamXz2+BniX=X$7fWaCyle zaD^+?tjU*(h(K8Wbk_cl!(##OaWRVomxXyBCNu4Nm`v26{tqhj2Bpy=*lSZ6RtIDy zB%EB=D84*s4@>0c<9FoEE~AuXo_g9PhCUo-g?XYp#F3CjeK5sTj7?M5~VH<6vvf&o`No<)VOS@XTz9md>B%XyPf0-18TgMDvs!a-!{<4kSGzz}p|sruVXK z9m$KpyMe6w#l;vH8RNqm+rs_I&vTcOK1NB4NI?!meIFfrq)-|MZ#S~;6>jE9ET<`E 
z^oL(Llgq+s>VvQLvT8}R-Q4q0EZvTkKam(sJDAISE25J^AGy3 zCVDzTd=o@+n?kf07@fEbM*olh;t|!|jkL}yHRD0!lvoWN6BTzG6qDJGDjVUI zMx!KpobhTl^DgO8I9}@xy5>^Y9;VK0(nBjn$|cheC}aa&=H8T<%I|-+=NjANH?ju5 zsl%YN>r5S(U!^oh8S%fAqeDW9Zv7ooB(AK6K^`aYB<}r{o8pMvWtRg_%2#A$cbt{x z`!qSMGZ5nI_Tn5?H$o7^y(7MME!nd4fiz{vbAq^yG&(SKrX1#pdPL8;DIxizK}+&u z^~W4ZbM{`kh%<*J;?*u&BJY}GP`wN-@+XY1L~0DIxZc0pGlL1+NOYGz^lNo~E)lsF zS0n#c6~M{H_J<}E=k84uk<`z2l*EdF!-(CTZ0(yEX+KU%hehsXAJq10{O9G~3rn77 zFKwhL&w4Ijj&Q(kA9F09T~SC^y~}>Suy4!Lk50bV*twhLH#}9@6<3)|IkGSPxqTRX zJl#f?;gz)#V$tk>_gkhK;RlEn=J|8g0-Ag?UT$bggcEiW_H)JN7qvqKZ9NW-chaz| z>rOcjxczh0D0Rqm05VImV|``e{-uwjiEMN2bI8ScQ#0`C#^|1Oef5`k~?i`d*H^)~|alkJrzE;fD^cMW5TF!JC8lvtGfywy_7 zCiMEIZ9B8?2Rc`H?0TSDmRNNNSWl)V8eQ{nb>nH&qLxho2yJ=u6VT3E zdcE%(*p5>OhD;$_0+D`M*IeD-i!BG>aWBhSoos>4Zo-=QyR7kR(h*lw%Sw!Tr)l1A z{rr!JJX5R9BPu9Uat)L-g|lh(PKf((T{y3aqJJ%i7~RB4VeXj@un?63QHWNHJlVPm@%*^+1!XK!f$U z8_~UDf8a<*hmG(j&8OTh@8J|n*TCx&R=YZp@IJy|{M*dbTh3=8Yh7mMFXEY^R=447D;=3rUVm3%(vWU++WApp{FMm zVp2&FkT-Ag7)udx0SET9RZ~^3n!UVg@i~c7!?$0m_uzpxi zKPIXN(>%Vsnp%AtT^1b%B&D2TQCB>vrThECcu+?k9K=btvwp|hxWA*8^6-q}J8pj1zO!gAro-{i8 zEd8JNQjtol4uQ?+CWn}T<3lZ5@wUw@4;055!>7jpFtr_1hp;#JiwA|{%FP{JxSZ4A z-1CVd{14|y4LKHUAOO52uzZnMdylz{)kX_>36R`7)~stKSuP*G2s~ zeZZw3!7WW*;}(`;&E*hJO9%;ri(5$NsBKYx)79s1DaO_EoOf>THzP;AD%W_kr#iaq zhfA0gt&lo->&n=ER!@yK3>YTmT|Jw3<9@IN2I~~2ag?^WO5bMnV)yq4=epz`xi|Ds z{8iz|Qh-)2MbV%-4y$J2N(NTLE40$vT#sN z}2|epKDBW`PYBXzoJF+*Uh!}`|Ww~i$>Pta7I`}O) z%DV<75!*AHH~dPfz*uSfApEaGhnLogDpC+y;l`=!-d~R(VJ{IPCCW@6;A!|Ch_M24 zG|ykdfJNR`Y>2*CmA$oCJTtS@yd+!n%UXQ0o1LE)>S@Q*~Yvx7=9m7MFpY@!h>z!;9*!hDPTo z55~ExCy=s_&&%`O-{DXef8h>IAMk$7odeMdgn;kj5}Km6F^WTOSbND&pimHou!0j- zWw5n;a~X|_gZmV>9K2f$?I@e%H_2=Bix^@5fc3VbGaVmBfl>>8Z+9fP$#DGMGlB01 zKa**z8W*I4%nH6bgPeGH+mIBUBfsY2@vw4JnV4@$QBeRsp+7pf$R03g(E)C)DH^((7x17@g{B8$9V{0&Bl_X>{aido% zCua$#3%aKam-Q)|y9!yLJ5kf+?*>MegbcS^?0Wq6@11Xo9(+oT?Mff|h%(}zY}OcshRWH*`#Jz)8T<-7Bs-nz+{(z4aL+CGxD+jsp9an1perB;aio5LM= zXIo$nzEt$wmJ6imTB5yD`jeY?BQ?VsNd;KVl$G{bMRmIH_AM!U0gMkc 
zzgxcb^FgmKOAD_SQ(>rTgU>40S$^J?yustnjV)LG^Ezv@mb&d-1TK`%Avad|EZ1cv zGc>yDE+RK(2hbzU{yfa(sEm}d zPw+#c3nnfR)Wlo=W3c7@ymF3UDsVf3Sfja!=m$yql2ZGLSa_v+RFf0cfIGK^j0`A$ zkS*5L9AU~lxDppfPenom6@LfjEZSNx@sNKt?9kyhITAo@ZV@|IhH}<%vP|{DQv8Z~ z+}GA(Gm9v?cJB{N*+o%_RrH^N!#yW65xj}d1W#KM16&w?JZ%h%1ePBc%((&t=eIrH z9|Ypfxgy-nB{#MKo=c(C1I)oDY9j@9a2I~me%oUkwW;Nnft~VnH1ujoT@9W-RvmpY z;k~_CBA2$hhvnhH%|u7lU#7wOn2-`}$hi?znuBa; z&h_Tn&5=^EL9zpRRe9sY=)lt#HV3DtoV&4<4Cjrkr)E(GU$U-Mh_(MX1fra)JnRPX z_?Q#Wdb8-N^|2u(;|;IJy#TnbZAvzDKPX7{@qM3&hYp2J>PUWbntaX5_u3TFNfY< z47gPspac5~#h~?Lx$R__ZC{rCA6FeVzqC&1*x;IHiVFN*fS(``U{r8K)!Ko{l*=YY zG!%0t4ZV;*Xy{u#B)auOD;n~^Y@!nrf|j-3n1{i&)-LlycLz(RLMdKq8*BA15EOlm zkuZt{2rk>AUoFW?q2%n&b!zDrY0%Y?CWaD`4sQ&{FeL`{4uTk3%s#2W=BG68ZCHTIfJS0H7SsFP+IR%M%;}hPNs!h-Wy=eA~quxvIEqatY(K^;Kd&;2_+GV9>19- zP?^&gU=EL22mG9}jOSO4-l6a(xxk=U;}7!8b)K5uWOeu4 z*kd0m8tB&fU{SCGy<&Y{WdIl-TvA`U8wh3qTJ7^`zsctA6< z)DsAKjAInj-#&s~o4Qj$k5Qp~bLEv~S?)%fAlH$b_?A8iJLw$bE>8}UbkuiT)-U>(f(Jm+# z+3H%R_p-mF@2=Ss`(>08{@a1HzjK%Ic*fn0a}A*QHG9n2@~LvTcOE6y;Rd9Z*jA?94B@hV z?9Yo>#x}_DZ%U=1Zk7UyX4_OSs#^S%PO7daq_BYB3SnEdUOj>Z1hy9AS@%)$WwI6n z?x+c4y8=|?t7xv))UuNQDQgzOCyE+meY&z=E`?tao@?#3L~b7h7%JPjNSD&j9fns2 z`voGNsP4%(qqA1*>X3;j2(n)TiW7dBRxcad&sdn=qV1}k8kc4i3n)|_rpbkRe*4>g z+6iD)wfjq5mg~2OlT^pZfx{ME?Vp<{6UgbOw3=%_z1V8fc77VPaP5VAMC8+WUPgas zge<8lJV)hkom|#r3LY%+3#Opq>5A+^1w89x#!N7BPx6U6#9!^In;#jD{G~P#^;oR0 z3wpdVCy7s%2#n}VEV`&YM?H!uF{Rj)`lQ*v8c2d4BpE5*>Pc`@)OUcM@SDdK^x(t#T8izTm`XZ<0QtF*C)8yXDv?AnM2z_3F}Qw*y=vzSX>O8 zX`TL9iK$|O9pXU0*3pPNaop8)ev3MYDc7UtTTGhz@y*Q&OgxYj@MA~T-4E|_&aWm~ z1c!^rH1C>i`jHRX^7*i9SoVBJ6Woi4JYvZ1;>9G%h9bvSiX~ zR6fhp8xO8ff}T>HSqAK;Dt7Y#2%Q-Taw=YTb3uLU!o^7eOUgA0I&(+4OC$Ic#XJWX zLqg8y6veDVE?>=xdr*dk2}SKH%JfAcLBl<$4LOPw@heV1XOP;hNG@^M zYCkDt#~Cr*CJsTE&^_$33F@z>){VxH&Evr}_g zI^sXM1KdhEv?svZ^e-CiKzMLt$Y7U_EeG_oP>BtV(7nmb5gs#*=Z0K8u@ep<5m7vi zX8N*v)v!lYJe?5px;!?Ap5jS9wp3E=nB)4#c?;7#Y{;#YbKb!yxgP&_`;G|Mj)8HaK5YU_~^{(p}B4=4Zw{{_bW+RGG{ zm1m4!t#z5D!j@S#3$zzHU0XlTzeoZ!vL!K9Wh%q+J8vw?S}^ji!6YmaLJA~L(i8%{ 
zxz~=jB;({}vF-y#R55`zLp?53M5$4M8Fy?CXs8wH={frd63F+hGL{^QUvh2NYA{Zo zzN@+>sHeSc3EV1O_48!0ljhqK*3)jy^BcF}npB6U2^tzqM^;k`0wbqosY_s%w}-3B zz~-Q3%v7JMU5B567kMog(^AV={>)k5jE09_@+Al&9hGSay?%s2IY~n@ zr~36)apFh$(d{Vfs4zc%|Ka4)>5py08yz$)XH`O(KaoY#`uM##yEgn)rC@SnFPfZA zJXhSu=k#HbhT6k#Z!g(cYg=&*rDU|dztC6yN{>;M-13d?)Em*$f_)O}x!Team>CQ& zG%EoeT|CtIYF6mH=IwJ^R3CYgxQ#6+j^Coewi~YP&OLKvZ)Gn=mrU@Lfgj91qrqT8 zK;>=vxm{H|cQkl*nSWF__i`fMJg z?^VU#KCRUjT~m!Xqp~cU-`*;3Ep40+S#+Zh3qp654UHz#N3G zx}vA3pgEEody53Up1z?ddYAGwOyvenOms*#5dDp(0A2yUU6?7we}^+|!yFB{c(JVp zX=1VGd$4 zNya2GBtI07FG@kA5cnVWbw0!27_ga-i2Pu6ybbr}nPm<$c7A1>AF`m^12v@k{EhG%y=3FQ$b%4SH={nt zn0_`I+VGx4hM~tUTj5C0X);mgWvG7Cmz1x|86X*A3RiFr^(55)sQ^Aq;Ks9`RuGV? z?8b#w7Q6MqJMz?K9DbIry^!l4Q`LcJ`39`l(B@b^LJuOThifiF}g46~A=jjFb%CXTAE zsiRTnZVLmoTAl@7{dku|QrMZug$=MJKptS;S!|NEa+8d_bz$?aZ(bw_W{&O*8ff6G zdqbnnv~RAyXRf|3s8P{M@UzlHJEr-sGTdWLmV}0GAUZrrI!MCw)Ki|G2OA^%JBjyM#Mb>zk0OB3X|CN8Jfz?|z-3M)C_0YqHFq7*m3Syq_xVt>^gHrr!de-JM>WFL$XW{o$wS9?> zBlNPT^~Yy@ACETlB;5GCr(`%3aC~Lf``w^#?`-le77sDdwmsS{zo7HK3Sd`ll+?_dVgr-khbs; z(U#C69|)m8!Qxy$cw9!2wsQc*N$`I6dIX#2geG~+VMWgnQLsd`(d4ltsj%Ws=|{x* z5vg{(8<*{(|?~E!EfJDv3X)ElzI!uNW#d-deytCf(*#54`_o4Y z>G!YfHg7=Vh_z$;=NqShup( zwK|s14vo2pxyeR%3LBfUjiGGg&3Ise^rP0_8#3zUr^fOt%mXKXa$Yzfg*y61D1L01 zujC}fh^5Z(O<>~X^L(W4{ke)a&Uv#1O=_12B$`b9;Nr!7OhltuN`->7lPrk`pBp>o zp!PFf0I}hh#omVC%Qu=EX89SIKPBJHBk2`PdjYCzSa7F=5P6Yg2O7*L?BIOHL%md@ zlpT!{3_0BS-kgX?Am6`^-KXity{`L!C|BQt0Xecmaze_(mu>jc_eYtm(j&m(3Lmpg zn%hxpg$$)Y2*@t8^&NZO^URH2oLSa)%eF*6Dtg&P!&5lkoOAD6ypBG3nt9Du9n2Df z0=T{w86RkEZ4F|SiOr-jj%XS*a&D_t_`TC*HOdOM8d)cKvNfoUhrj1gNB!c#mm&0) zM13xhmZ}JtQ8>P5*NM2)i}x}`UTq2aNqSt+lqi z4-hQh<$q0udhy`}+XUiZ8f(>IQ&)|$E%}!^oGOuk5i`UAV)ooXUbfdRi!J)|2>&mk z8FO3EF8R6|+O{=}@T*u*@N@8Dw4Z=A`HgAg+6y&Z$+v#xTKvz{qd!mI$y?R8v|M@P zf=tD6NfE-^VK|hvhDSjSMc>$v%2o@ zwh(DCuh=2-Zth_h2|zwBuQMc@8^?Yt_fIx6wx9&7u~F=X-vkAS%>mDIFIkFIBIJsO ztkkd^RG-~yb@}b;a+innL$#1nz;=&@Rd+!4N3Dp?B!<*g-!%TJ0lLb!y$cwo?FS|c zlOGMzGq&?U+&yu3Zmxk@l-fU=RLuO=yKFzEZNR#!TC-=SsOFi2>j+s~hA@jzp<-L} 
z!15kW)A;zgv!0{NDwicTG8hkydi!KHg*2b1;|E8qn52lq(5>t45V7tE_vF0^w}Eaw z#X1pO)ff5ve!#;#a=r42xLodTQB+r~kcpMUI6%O=AvOr~$bcr4(Ctb|$JbO7G;xc{I*I@ll8=|-8&`_BQOs8ek% zR%3*_BS7WRgj@V;yPP6r&j}_+K<*dS?kDDvgMa+4Vq~Vi zUDTD8Wz&A@SCiVp%tzUC04$*%S-$o~FJASNsHeFpg+j0k&hn`iohp6>cjs zC-wf|6jz?DWA*z5JGb(j`ruA&2%FIA=x+l1&R7ryEnMiB*X=rD$>`BJe)_>wu^NjO zcIt(^;vBudgk4M7{zqb{(qx2#P?`7&*yD!}3$S>1LVAuf?yXWVM}i74dOa*H(o%_| z%b^R9N87-bRV$h930bVfnPK(HT2P)m4d;~h26Mq)(t)XLUOFze>qX6(n{6IQ?VCv4 zrBX<9`!FEE8(0n>twi#mK`RBG^>u#FLni{@)MNW2Mc(c;x%J5sgDS50J} zGe|QLJ5?QpHK~Lqv|U_cRa@Mt`dBf&CPoM`8@>izIE`i1c^A(9pe-+?{svQTq~4RW z!U$82;pI?4p5GkRxHI5dX-nVKqN?RiV-q1?3p}`Ai@AFvF~rhum~+b0!n?^NKl^q= zHt*O#r!u`_A6cM4B}H!n@in@73szR%2$B*$~%$sSP|P_uB^Hqrzi$82fS zZRopGf;F`Gt2Lya1A3lGeX`mptZ$ffkElLHS`dL_yqgS)eG59=j45e~3~@i4uit;i z>`xERm3C2H2@sus1%OLyoAG9fW#dX+Ll-^q@ih=x2Q{e-2yqH}jqy6u6n|b!^|tI2 z;$R<(g$ufaNTyL<|47KsGw^s42-G6kXLa8fKHUQgeGpaY^Jy3Z>5jtBmKCv zgyQT${CmpN)uvhu_Q?C;F9&{%AmL~nmd{+5_VWHw*7|;Cf>o0flbVQGKo8v-i_4Ra zYrk}j=;k`9uMD3PKsf@Do3UADa0s(%AUgBxlnebCQf@8H!k+_Vkdjn0Ue6&4I-hER zUnYQ3H2IfJj+D_OW4Eh_-OtPQ2pBnH6KsON5g~0niP)N?yW%8M6qRbyKrifP>3?hG z5%Du_=ae`Y#hQONOA2{Eq>4mWZh1&D%QOQ-Gf1Y~1d2u6f?*vV;lAJb%rYE*U+$LA zq@?-2Vc-NmX}w5c9r5^Z=*)?ynKnRa=J!~$9gP<*Inj#Na&Fc&WWOCZ&{-JGN*rV3 zT!=lx9FSSErR~x)Fpg=JU6|%it5ezS%#r}XEEwEBcyW(P>d9SoH8hC zR7Y=IwxocF3S5v?i;+Tc?>8tR*A+i79PY#=%#&Fv#VQC*&*L9WH0LoG2w0MAfHEhDeh+ys8cPBH zvcWtnmvO#ysUlXnKo_lGj`uLg{=jAkL=l)#Y02q!!GBGx9*aP3N#l$M;y<>R=3}h zn%|kI&>Bbu7jnY!3#MXR4h=gKO?h6tdWxq#-+?^Wg@olF1}GNMFMjk~iS{_qnoOJv z(jj7yvZA=b4ASUIS9LG^QG~9GnVbLP{YHV$PBu&A6;oOUjWoAIJ+1$-e|flXDhWIt zJHzrY0%LRM{YUuTjz~J}LrvhxHot`*%9RQ(uzV{* zpOl#RZ3<#Mf2lq7?!K8_*vyY*iakBeQDTG-8oh@OlvyeQvJOm4F)$F9?*pb*^3pga z*Qo?$f+FIEGzfOi2yb1z<7uY-{e8G5jYSo4oKnQnuZL-KXAh66&|segfy$37y0^-n6}vRfYWg81NaLYO@sa0L4U2b0}2c` zlwLPrxn`3f&E)@EXBse~}T0+K+>HB5jQOgVnd%J&POTwhlG@JAA zX^JGF*N4?v8k5;inRZl73vD-N)D2<7r#&&PIZdoa&aZ>A5k-h-GV4)dHBN)t7A;$h z4*!$cQy16uP4yyC!Bs33`VJ7Y;dDhDN<{p&<>eoKNn6hgcBt5fhQaOWt<-b(vT7gX 
z9mEby@yPp!dPj0r{Ay;w*R79N&ylfpZZCF>)_88Y4|?uqT8H;?ko3lRIEXMYWm)E7Bj+KNkp6>zc8U-2 zvV@izvmWRt8BDQeT33VQcn;43*9_ZJoC{jtM+KopG_cXW$YE$;@H8e?QPiXm(HlYJ zynspG=NDTY%)(pOBUL6mjx+O!ERv?C1gDCaB}ux*^EhYVlCGBvE@s0FQ>{7Um@$ewO#cUQNDs~ z>0w8F?pJ;uoIGo6+s!GYBmCTZ`*%W^xjx(sVSMv#x8Hf=N_*@pa4{V*R|H9{eI{r9~`1foWyE;%y@aK@t@V%5ls z*qs;ThT`wAL-A9zbGy4*u_pp>6|3=RH?~rZx(L(NnQ;HwO6aGh8eVV<@79M0Kn|ys zg{HyCo_NLTG+p)S*Gg>jyV^Mx;__FLiwjfxJJ>^k6(9mR=ewl+%Oxn zu?BoS%_M{JKWLwNC>tJ9)!?^1>QwgSkP7;PVqh@hwDpW%ZNCIHI1i?SME85JicIa8 z8|#gP8R^Ki9x8f7-Y>vDZwzsU{N1k)22Q))P6irkOnFKw0ou_&pwnAPwd9y+=f=#DU}crq-z#Xx7rAC7zGjm5_DZRt zTEGog>IdU^%AXVFt4ahaHm6+eht&|@)INd)FS`adN9^u?r$O{hqr;bOF%dG(r!`X4 z%W_kI<4_daB5E0k(+5aD&IVEV#pr%aw8kc|XXbi80*TK5V5d!RS>+0^5pCD{kW}*# z*l^UhLFagAq_wXU^*J&lWlwm^m(UmagNmRx-xp3j#H&S+9PHmA&E{RAp5P>ZAGIF40$kt@+X2+wkuHjjI> zeICRTW!-!|9c??O%qf?hR4_VatSI2PR7A$GSM&5FXo4&@1Ha;`zMEi!yQMMs>Q835 zAuS?B-^VZ|vX32dl*I@)c=v}4rba`%Yi_A*gn02*ug&2{%S}mO9is0Uy`Kywb&2Rc zALBkGO+xg6@7L+^)Y!hW;ijsOydvJggB{lR#BOfY8c+ZDb#-V79!cv4cFYTnbqsfE z-%1%35?rGsjo$7Uy;wKwwlt;z9s|8L6^$ZXjLNawiW06FvR4|G=GC{FGV_sN9mo3w z-(EQDhz0vO4~(?gYM(^JJ<^p}#eg+AzP%|3q8%GG$&sJ#;I46TS$OByB^h|aoGpE+ z3!4(?Q*~WXuC3d^8303Gp{Aqs@C8@)OZP3TZt1OQt%yG_3O4!Hql;gBF_plgXKF@iu~ra1!Y^laox!kE|;7zQ0b=C}`^e_RR($swxLGCoM9A<+p4mT_>mD zYMZV5cLL@J8<>c4--MY9lPPB*UIHkV3AwW5ldC!va4a!Gh|gk5KCAf&%u#{1IKdLR zCZ{!6ZLbA=*cNJiTl*-tdr0S&X^{od+$2*{A7D}CM<33@7NjKtk$&I(8`zhCZ(JNi z)F}0K(%UQ_^^of6xi>&kmZ6XGOpTp;RC_sZSCy$`SCe?_6YV=VGU2S)`~H*^JG$oJ zTm7@LjT;FwKWY&ES04q};<^-@PhSTaCakxI#=Y&%m)ZW9WXlA--6b8V-q@s4xQNe; z5pEeI9MonH_4RN}zjRrOO;oXRq1Ekov!v>}UYE8ytdU{|Ic=xswGDpwRp@UXq54@U zq5+({^gLn&&Bvix;q8gzu*<~L5Sw1`Yo!qW~^&RE2$(){=^!F1@Hc4g=5 z^JsO;OR=&SjgjtBDCoSV8Xy%tH4JU}m6uU9TgZqqcmBSO8pF_-21sKfHsQuiF#PrQ zLEV$FO+?W_7S#rk)>i{9Qv;WD3yGwqXz6J-}|$0(+#SyKs3tU z6w_uNpm#d+#{U z0hz}539Lsn>DM$=@?9TAMQTN}BYeRtOvEWuSG~TPpvqs&_WsZ2DI2`v@CJPKirp#CSCC|kF=lCN7H0BrK50uH`ZW7BE`LZg}uW^fNaU^^{cKb$Iuoh z&P|jgOR2!0s%-!_Mm?TJYsLwVZ1pyg@o?&WW6Y*!*pe&BY=DoeOc{7X*Y%QURA-e(`JZUBs 
z%0Y7W%PMJZVtJ^M>OZr^u(fk>jxEZP6?!X>_sVw43qVv!}viW6*%{!r4bS(0!EP%4@5xXni3jaAprb2V; z!6P4Bn3PZ*HeO3ni?uA8cj$gZs^`qMLbG&EyC^!OH{vn#Cz`xY?~$CQJk5i2(PJ6+ z&FmBR{v)oB|4A!aJfV!+mDBDJ$H^<|aR&wnUicJhr*T33pU=txYR~sJd3hW2hdGiwuW+a9-bq8R-{8POSJ;ZGj2-wE z#h;dSo3+&8&M`<(Z>8Vaml&|XQ#sdbVhCCVVZ+CYjQq}cBL^}l&<6kK4V6$srx(); z6Nr*(61L-cfXXw3)ZGrPnqvyM^ZaOw7y<5-Xo%C) zAE~>uskTN%!B8BTFRNi9Sblh1xfwhvtxW3zb#W|yW=%7sJ1FVK70rCyAAHGNXg1ht-wFl2T{UE(-7cP=**kK(>_C)o} ziH$7I$))3tREvlg!&k{`$KEk3ZZ`CAsL%!5%a)z*5}NyHnXEH8yTnaF2Df* zVV`c@qbPmxjk-W64G6jFq~{s9;*CxW z9RirEvv$VU%Pd-DW8b{S*F_v{%7`&}{M#juTgmG~R?D4ZX0iU%Ug&ADG#<}l=wDfq zQT`cKnr3iLzhs#5g8Z1;G~^|+bz8f1>|sSDxfeTrA$H!XM(p?v_$bmdcuUqOD2z%q z(!%;tL_6wPx0pf7+1u!@RUpbQr?b}S0u_c0%%dSq5R3nrb+4j>nL6pXP^2?Pf^JTD z(ZUP&)Ad!5DK^J6AoxA6!u91ifzA;<@mAPw*%XGi#f`32;GNRMkVr)}TO!c!=d%t* zK-4XTQ})HBnk{~!-@0yvSKqdI((6o_Tm^*rRF`%Ma@=}T>)=l_(9rO1-QT$*;`GHq zOkD!y*g?7Sd{b-ILIq-{pj2XAe^M21%bt;q0Jcjg7#d@K zyM|FNXzQezBgAae*m1Uyado513+h~1>3RjZy`(K-3L-+GzqGPhC5z=1-60erTHb~3 zJrhdo>~9`>A7J+tZz;Lc2*np%oqaMEd}+Ucs;O1TF@Kqw?HIp6DnohHKhQeOBz@pI zbfw?&8Pz)YyjkVpAplSF)G^z{ZGtL+F6t_MOrRYM$%9hjd z05Uj?-Cd^xSt!P|KLzy8}!074o(YMt3i_14$@qfN!#H3(VpMu*;dt*fBOuG$znuH0_NKI>f0NFQZs7ei(CIl-NfCWlb4c&k zA0&Y?IqQ(sVs13cx-&A?a7Np_$AcYRlv+*AGCI-s!yT6v-^BR(!rMjw7bR4QL)N$E zv*uW>F_W>XG$)zlbx8!LgFsyo`;-7jhZ7Fq#8q2E>x@)Oyp0yE`q!7GOB;Ii`DpEU zHT;IcHP?TL{onrQi}${Im9RXsp9X6LI(S&b%ZfB#8NpCaqT2pIS~GAZx^~Uoy*CPy z>^h|g1q_W-c5(N#*_J=m>hEx(`I;=MSP&#W9 zzzl@^9RxJ7;eONDM76ahpnn3aWS_JTN#1l;t|(N75eMc>JdMks-307X@R?l)Iy7Mm zmu4O>%c<@U2-+%oAgy!qgr|C#?XXBd~ zV#vaFh%oH3CQuJ|_5wF>HqO-$u(lbN%Ec116~H!#NPIL57Y_Z+XxQDOusg!*3lWQ&UaCL^W}5pJDU{S<`vW}ayR>- zZ1^FJ!9LEKhEE{Q$I`t1i{81r=fhW%fMorPs;v%t{iep@S4@84v@=*n3B%QRMk5z3 z0tTGPe9Aox2%l`9Fa;}iW-cSfz?Wo+L~;U3srr_Vk0^nU72A2j`JW~i!5w>4v;nwF zSaPl*P^@GcAU>LH)r->PfV$?tmo3Y@&-A1IX!TOl1-Eeg>0S_f-Nvvh{3>eTFDp4- zeqn5Q=8krp6dB4DW?(D6&{yRv-@sk5J?tSgt!82#>ISsmL!OdL0p`A1uGcW!eiw1a zuP*lCO_{|<-YMWy$82;WV%sx>_d zB4)t7?F3~4&;o>*g;Lvs>1GjPWB5_FFL~U>ZuO|Fw#RTw#|CT4{zpnN+Ghk*-K}|i 
z8&R*58h$ilO19Bkg6GN zxkv_7wy$LuvVKu3R8HB~t899w*=QjgtMR!qPq5=kp~={!II(Jb zh135g=2}J2eP~swk9gmjHs9s4YHPMyMeMi0i9T*LI_R1H(husH^PQL}y|HsE_*Gk|h^M>ezx=$_$1cy~Izf}}u@zsOfaC4K5@<}Xm_(p8 zMvJX|zJStZDwL_n+JT?uk)n`T%pdKG1d;0sFHgG+PuC_{qUAQpawf?}@rfj#(qXox z@W4%Cv$Y@Fz}U!hn6m(;12LlR^84143^-c;Hu6=vH2>~-UdyI!)$szsQ4o#>42TPb z2XB`z{&K=alAZ~`VykFjxArR^r1(!imY!j#(r=Lh5jJrhL6hhtJ-8U# zb=tZL)TX6*_qC*j-Nb#L^r_2M^};Sc9a>3uswG*(oQQpA<;OOqu8Bf*$*v>r5eo0c zhjv{M)dxr@17C4UElWaXzW0308;f(xD@vEuQ1JvGz-FZ&Te`k)7gjd+iWr;jmdLao}!eF6kYI7tC8j{G3kXUQ@-dyJsiYCso#(Xy`{>DBH&s;g_ z(OC#YWV$>d>alIrnRcd{dVZky^RtD9O;DKa{*M`661@+5`bk?Hj-)<@FYf*z*e&8O zh-E3Ckzg_tZ~%)}zldwYu^JA~a(m(a=QYP7tTSszT+XP!X+Qw!vh&Jz!peP}hb7B()#@chv0my5U+*gd9z|F^Ylx$j-S3$zw` z;!`atc{=GyvMmem{&7qQg2jQQ6AN;TH+(b}Nv>BHTff0yHpM0r4JM(3s_w6=crN`e z?gdxkFhm5u%J(~>1MmAI)1JMfWb5RA1B`7hVIw!pW^7)4GKcWH%r?(0p%Er3e3pRWT^AD_mFPj812O=M>61VCn=1I| zP9xA}H!W$Y7(F?#d`kBCs>)cf@{`B^3e){0y;+r2vpL=v8$C8vM*1)_EOF7lxI#`l zdr?Qk3MJ)dQ=T1Hb$~_We%0?{lC|+3#ACVem#%GFgq6y=lO^?qY5RWdRpL|S;vC7pKrdRylX3Wx@wT| ze~e}!R0>>(C6ZfJ@dL$>ovO-|Uw+bnuAv5I%68)6vQoqScB{0UWp9TX49Dv9N~nL9 zh`tv`;d&i*!#D5>6d34sDgnnHNVtrL9uJ_8hj&7~yuS{RWsnrPu@9YW1W zCd!wbbcS8gEN_B$g0SF@1vzW9nw5)TaudeMy5c-7G;6x*+urL4vW*)slV-@;qy2u7 z8yZr!$6oOJpzvJudQZ7PMV=~phI_B?9bA=~((Y>a2Xi{947+{c1*F?8GGsep^gayE z%%dQfs2DO>@HbrmjNZ`wWIoZav~4y{TOZ8a`ju-oHva67isRLSpoZI4)>pyd$h56! 
z)ie!ci0Najlb6Nydl56D3-Ch6pN??{dscGlTnQ{OS&Pb$HnWV{Th~nOTPXvnxj*Eu zuP~;wro1AIpP>)r74Ga_315w^*l`5CD(8xSC5V+`J^lPydFltRyGjwmxw>w@C9mv^ zAK!uCRvEo~dQLl?xJDX?F;Hw!sWek2YiyX!Vn6S$x9HMLBK(IGjC<0*6asDW{0rN*Cx36p_A5U>#%>X;KFK)I0ahJr*_PuOjaG)U}kRXz^qCswIq>r(cgo1n;@Z)c4{8kO{Cf5V*fx70juWOtJUyr zG5&b-NTxdWkaugJS$J`%iH`=7?laxC|%BqU_s?m8nY#o=dWuyR#ux?<&OVHS6h?;7fAt=bMu1tDYNQ zJtUG=_>T*8*gk4SJm%+qWGz$JqAT5z2ou?H0(g2~JBYPzU6bmx0qL}u{4p*LyaYdve5|}T;2kl#pU#I9TXI@_tC|ftkz=?Q&ZPmdj2M5X zhHJ>hNi(eM61kUc@KbSyl11)L!>Ej!1;CuBhD4bf+fjNkoQy5EB@{o zQk2i!jeCIg{P$VU;MG&902VbILf$28iBvy+|C?k_(l?0m4=1Vb&3sR_l5l$~=gGIj zAUAV4#k+v1eX)$kW{0VOGx#x=I1~8QXP(Vw`R}yT)z-5ZexYlIrMa^*#zu*m{ow*% z71!eD*3WVQM-R}0=j%Tw2!s%lU?8EQ`uy2X`*P^=KPT0Vs)x4a9<4-IIZ~ndRvz(i zn!u<7QN=|f<^t1*wzm!>8tO9i-y!eh*?960>_zjc1}klH-(xJhD$Q=GV2+Fxt}M*Q zU_}cVq+@!8#<6x$-{TJ&#kcY~ISW*|Olw~)fdf*sYG0|ao-{%7olk@hOY^S3mURR| z>8&u}CqpR#EE#+bWh?UFOA^r~)5&V)A96NRK%=4FF4%`?#xks)e1-%)mn_MOf40V; zWeN+k)KP+d(Pik|X)uM9?^SUEga2o|F@Ng0MUB;4M@ov7Mg-*=;K4rdldPMp8+*9M zPqV2_8DM}&XzBKo`#iiFI;fL-wujoO-&WN*?au-x_1z|cNA>c@GvsUvTv>HYu(l4M zHFtZVfCma=<0-Pt?VrjjB1{9?ppHo?^S#S}tCq6&%2P{#^Sv;(#DG)%FOu*y3^12^ z(H%kAgV6sRy27TDOO%`}&BfX#FWb&vJ~n8n3q__GkzcL;FJ(ItT}`i`$6XTu;d87Q$8HrO{aYg$h-aM=!Hh1TyU_ogAr;+d*OKvQM1hDaJ@ z(zZMds+xA%ajIL#e>ZXTQ1Nf=1rK|wOktwmBOhgK9N{eF?Q)cb8*-C%?RZO%t#LWz z|D zcUnYBH@MWx)~opT_fz>p>bFVscHTuR`#I5 zmKMv}u$j{QUDGsSW#)K6v|1!R7D-uUNmn@cTQ_;2j+Ki`vi5|a1l}n)-*12<2U~yq^>mp#IPl)2bbt-3Wt0fWY*4E zb;_!60^a7&5`j@Ea4NIgL^o?cm}F?i0MH{jQ_X|sH)w?f_ruk9qis2I7Pf$kWip$m zp82xfyf9Ar{3rQhvm?~9u)h2KeboX4a)a|xx_C56&q)1ZWv!}I+Rf$uz+mcP;Iv?| z#)S?b!om+Kl$NYws3rC7cju3Vyi>?3&C74#{# z5C}kd@wG~AU{)lFY`enc>B4gic{5O$1HC5mzxcfFgA2P}LjQZKu7Ze~;}r{57K?Vv z5MNy#YI_Xn)!ce@_{rqO5eg}k*lnyU%(2mmF9r3UHC%Qi{lCHhn60aO9N=C16VBCHjnDUCfIL zyb1x{(u$H5NA4F*y)6|U!6QER8%uBCng8jG=H|QD`9_S8){a|?2EO=iP<}~*z}}xJ zR*G_mu+L;!0UBONW`7v}tY74aVv1wfAyNkQpyLuK8UZbt)s-O7)|b9zkQyWw^7=jU`Ue$gtVwHUG8o^Vd>bim=+jEh@_(9YLs)E@T0=wL;dxp@+ zqYa#@=5Ebx*Sz7cHfnF4zAOFLaa%C!`Y3R{v8u-!e$<)7r9?G0jUA?!xOxi|km4f` 
zHzBM4QX`P$BN~=)3aq+C+r?+wQ@8e^IkWghl&#fj`JNUTgMnS zW-DJs?UND;N%Myaftk3xb(b<_+$$b2Z1f@s1a_3$xbr{hk;}fePec(of+JAU1C}{8 zmv0NooGUoZ0vH??Z|0&ePtoA_Gkvc2Ao!qS$Jdq&kFblraL0`%T<#mpNnzgLGw7pBLV!8Xrb&s>gGU_|d-!r09O| zEk-w`NJ)K={O^IGCF6YTYMB~}OIvg!`CCJA%`@QD=(Etd`-yhwy&a-C!zZQRHQ}RU zc9ZflG|uf$&*I3AuB8CE43g&M=27VGZ!93CUj;ODMl%gyM~n}(cffaR@aHC=^`|~E5@zi6=$=r2JU6Lkv zsrw5{qXl>01L7V+WO>Vz4wRE?_-sMEG;y?R0g#v8i$XPh*kSr5b&b#Tx~jj502m!- zR+j$0(yo@7t$IqSkUsVqaqF7HVj(c``}$tnn!~rerAoi{`KsHhY^Gkob1#7Ivr+z1 zo1XafG_m-Q`=d$9zA)FQQG&HdX7A(Nm;n9-LG~Gjl(EVD#no>tq&^>ZiV}Dh>h!h3 zip9JB@y@$GS?|H(-?nZAKVD;N>|9*>!cP3bz{$J0V!ZC_;iho(-z$sV4fy3&K@0w< zw^;lNd&Xpzsl|6utJk_8RxSpjSs<#ff!2Lx>Y}V$RV(IxJ&HMIVYyG0qI+hxzJo`? z6gkr<>$`q^ha9+k%D??XK5W3sQ@yMIuFWl=H^eRh3*6Drkdr3>Ey7mESwzp9e{DWz zwFAC&PcRP9P&OhSF>Bn369_D;^jACjub;!`3ELatVRWP0g;qly!9a;}#SH@!tiWmc z2gAu0DhFQ3F0y_Q?B#|Hbj&)C%=nZ(e)FINP*F?Y25Y#1Y@vHI&g>4Kp~(^&Vr1Ht z5GH9?_RxYHe{f#)xR+-vz2tR^u**OH*>?k)o)fERj#(aGRxp}%5eEX)cXz49$VW=& zo!5&b^_I5S1sRP-0!jPg@dui3w@9RAHm@J3Dm(69-f|+pFP8W$Xa4yYTzhYed40wD z=|WcQeD?yR^++n%;YtMUUszGhQ>y-5!=Q-<^M-PwXuRU{@`r4I-nQC?V@R%Iw@MsL z&;+T}&=P;R8*qFCz}v%T)H6qRn5wGLXZGh(A2q4H`<38j#23Z`WcgS5oK3$;P^7Z0 zE^GuyW@`mwblX1sRP_@(G3E-aZ>Mq8&vp(_5XZFI3-^EMElHaWI9@OasM~eNgtvr> zSP9EsU8f=TNPRHe?Yvshag-)$(PGYwLug{2PoA;t3A#*eS^9Lz`?~&mQbn--DHRfY z(4gs6Gh1zF$La&?Ml|rdO$db)2m$UEYx%8nR+f^#!U8+u*nF#4wX;rs&b+16 z+5h<(SN)^P_daPHc_l0kBCD3VBs$eDRY+*Ln8HkuT>uz8y~K&>LnRkL^zC{9GII8K zN9ntBE3oDN;`Rte8)&)V;vvNE7t;DyKnRyW+K!6?A{ZPT3IxG*;&V zQ$6}E)C6A($M6EuYX2Hj}k?2+^xRT%&0 zg8K|5avxUta;3ju$&XsR-+Egq zj-=(c_|3j=Cn!dR!L28m1quYU@DTLz^|~#;N=-C{<=evGWx*la*;D};h!VXN z&JfT0OUOtPxAZMBk*Y!E>I@q4B8;5IMvP)c4==a*W9kRkvaw>I-;t&UmF@^QpK8H*G|&5p2x@Q=0U`(*U`h> zdkVIg6qrFMAC;HFx@)Zo9dJRMQ*(|UF}VH?cLyW0prpeAbHuXGlUI)NS~mD3rrP+eZG@%cDu%0pVy}~FtDAIpaWuA#-`88jH#|s%xN=S2_5-7*0J^XHKEBDS2%C2H zYuO_@(5`gPw)8gyy7d(`WhUPFWq_%o+lPaJ^@s9|nA`-%Z^E?IQtDctqS3VQSO`wI z?Ks2;g@VZreYL8yy_N0+lqJkY>bx-c>DxfZZvw+7oSEF@dyqty=&ru(scqT9O8z2! 
zuz#B`06uZ0iL?r}K?*?o;_8pR2+nnq?7M(AszwJ0>&FB|3?_w*O}!OfM8+w8*-i-v z(hDArlX3z7yDUR1&U#1z<{soL97tz*%{U{GrC~h#n+=qmVWRTr0XqHX6UM&AUo9J! zwezFG04JSe*~)n7)=ezFT3h=7BfzNMBg{Ax=aYph4HLv^Z+aT(qOmqq6;Pz|EvYep zbufog0)l&eRiT&MjFZ`4Eks?x5`T_}ekLZs1jznlbv(pjVp8E87`OLNJwh^Nny)j8 z^vxiuO?n-ww^m;o-H8<6355%syETVT`_Pp&+uC6)g^Q$~217n~t@eFVg^M3-CLPjb z%V~)B&rvmy@^_8&(DH17wni;8doH$PV^q`7Z?e@KG^M>Q+nvw5hh5z7|8-bulMLcC zrxGy6RbE^;K~+H$akuyF`2C$Io~)W9CuCbe}AWr2xdGxK_iKtrZtY zUE2i_EI8^3k``ncD^jqO8Ozq6wV)!X_j&7A$@eg9zO>e*Z0O13!CgDB0OZ+9Gc}y6 ztoaTFt^qa#39-Q=@5|MmU%gifRK^{R#0Zp|S)0YG944Q9BX!cyu)?VS41g5(( zY7dWy;zP>UA2BPBfu;ff3-b)qg;r;X_pQ^>n(UJImROx%8>QfUv1o9w=dPe2-pR;_ ztXR9HDk`ZT@G*>a+;-TZnTab;whwPd;~G3a2h!uGF&-Gd3?(6F4O_?V_Ih;jZZFpO zL6<1LBy5)`CvRX@52{U?(x}!Tch%_NQ%Fs}54g&8fNC?6_)(n1XIPPcU+1$lfCun4 zk?#$_IJ?}VXWFwEs!=N~j8*Z|uVx-?iotan*<$d)oli+)9;+T3of7ZIt;Fd@L_&34 z2O~3zeB|`w^Q#YN10j*2SNAfJ4%YwC{v^z*#IOBn5Sad?zp> zop(PAgP#DXN*q9$C~;^t`-DwEe%5T3m;yIST?7sR1@;+4{(UwbU=hMRC4j|F@!Iw6 z0M>~mdsdXR#zI4R1=eNy&El#_&$#@hXgxScuP2VQec3NYIOciHkn&Uw_>@f4c9X^%8;} zdAAPOct8I#XzkRL<@JAVZkQVZ`p)|!-lP7S(h-QA;4yN0{V+uh9pU*bqSo6sxzphR zKfdz5I<~P92pIS{7I_2!eA)w}49O_^(3>v9)HL$DrVZkguV$z5hGWn`a$#;B199AP8z+>=XI z&6o2?VdEl3PQbQ@0UaJgfZkqs@ZS#vnFVhLc^`j@U+|4dfzrs9W5?m83@SXW)jncp zZ187*O+*3S^bc#hP4#MMi@1Al3tIK%!euI+3F64_Y;$HL0H<8gSlZ}Dxp(EEJ2mF9 zh#KW8Hl&qte3&!6C12eC^*v_r82-eP87$$le`@_m}y0*z*JH8i?+J4Iue`}>UMR)Y-V#lxBP51>}C^YruHhtX>V&)pP z-!$Teh97mZi8~Bp(E8PPTi+hjCY>bibp$8D1w>AHUw~7N04sz^=rd*u>{) zCLvSRG_(bwxgr;-XWGeoWqcqmZfaSNuU{k}(Tg@0^Y|Iw|5s>~ibkSsVxx?6JIErv zaHgk&eA$(rzfJn8= z{Xy`^^k|E!=Ru+;dNBaYru{X#jnFTZ1}*9C{EtT{Xn*qhw?W4hz}=)9-mjT_jgu*z zb~(PZ6TuoS`{NsI95O*IoryOwxpPkK*AXsxhtpt(H7&(y7j`npD|ZuIvPnA|vd&*e zWy99GBBlGmXW~~0D;Rak>yON*#MAq>WAlLoZ@oTap1qln!!vnFx3rw!oVRV6URiyY zO@UT(Od`)V=#&7%imabjd2H~iZ%yd85+#x%BtAW@iJ_!MvIXC8MMo$vNHPo~C)R$l z#4zPUVs#WGaQRCU4X|HT)ac_u$nNG@uu%mZNXr5E-h~V9g)F9`Z~%oaC60r8MmESt z%CD!ZPgK8}_^BwYgKT&LDja!;di4tK4sRlhv?)9WXaI-{Ltaud*F;j+dP&aN@?v0h 
zPXr2xwMfN>kc{g%`W*3Y*ht~?=Q_3b;;#Z?vnn138+pb_M2@L?PM3(=!^B5{s%?W+ZUi1HXmGZ znG??rHNU{?VEz)Wy@E^wdYz>nlbU~ouE-x@wNp$f58;vSK+8Do^gJrjaOf(N)68>) zAH%UQQBk;0@2Ku49?~1Bxh`ne#cecBR8iXI>UPxh9H`G4ewNWs60`&`L@i2Drp{#n zX`F4Pw0dw=2!G|_ufM2Bu}g0p%XEr@3}}4c?x+*QVG#w5QBOd?V^F;GQv$j>8o$AA9MVV{`0+o(g8oeG$@24Vt#%9{~j%3?6c2$s*C;Df5_id)k&A{#7V8@ zhmUl+M6@x>f#s{&!o`S&))9tkBrQAZNgw$k)swDxpdRXng(%Sn- zczqAiT*o7Yws)Bx^gIlGRtqWhowH_DYW0QefOy_>_{SR$4uYKs= zMrMd#!De^at-w9WP?=AeO4`q1ELURamKBCTLyqP8P+5mnhH%#;FT@hp=Algzb1wgE z{l-Be1?9+*f(l4tK6re1R}q^_Y@$Avw{|O8_eWSKwiU=!#dUcnve~#d?QBbs{kPG7 zG;`hp@7gvOnyx+~r;0Wo2=IwS;MLZgoB2sF+}WWjx8997VHuf=z3Hmr8K9(9?Y%(A zgr<(fTb|SAW5rQP-Vf>-&V_KiIpc;oOGrPQEQj3q=76qb%GXndKF6m9=`*RwZ!0Wy z7fW0|iFIcbx%1iF>#@ziS^8!#ubH(A3PAPJiAoig@ibS_PVaN=Hl_`5KAJ)j6yJk! zDmfM&rJY|)cfuId7g2hKbIB-M8)GLA-(E|Ihkx^X;qD7eU+TX7w**5_!EDE;vlYB0 zD0pL3dusKR#s6?X;r)gZ2iLCD>dPPq%)3VA0&VxT5s)rE^ z%qJDFW%Dzt2Owu3UoS7q{*)*@=7#MIya(TQh1~qiw4OnrM`0n2oalqLIKHJT+Iucc z3d5YaIO`@@Jm`b?LR;eKku7T^s8sYf;|N>Vf8EJiQlE^cHuu?{)8tEX5>Ten;&q~- z=Ir9)-PgRDIIrH5O|+Y^>ySFRe54_Nf1H`9aYk5ksUTLy7&vux#v_gw@n^L}U9U0j z#D(XmLS?SQ>LF%iQ494EGIScK_fPn2fnooYA4wl!s(5t#V8xmJuXoy&;()pwf{@TU zR0@K~*q9tiOm?$xCt3L-XdaBDVl?A6v0^D*h4rMwcbtudpZ5LYANonukr7T8Ja~_F z;01XRaLt!gk8*a7SNVE!UHMX8pAvHnY9lpLWRtYIt*PsJM)B*W8sX;=}CP6+ek-cz> zX!pkZ!C_lpQaj#RyKPxN&fPKD!7b|LfB&I@_6%ML*O!4&jmOC`fgovNiZ;ayP-{=D zK|NjYWt#54#dXIoL(+_U8}r0zmk;X48vzk^Y0WH-FGJt)seTq^&Lpy4aDtXZ_-8g1p}U)Z zeCXe^p~|~2)F-BpW=L!g6gdogC;3T9a?au9^xRYt+9U%U`pq`XBP`}6kt>ABCDSVFDPK7Y(0nA+or(1Tp9Yy10>0hR(Ytq{*f8MN=1}K?7;MELf z3iN(De?+yB%$^3TDTEU$`ju{2qPY)yMQr!Gd=v2Zs7A-!WVe!9cXAF%;BIA@!Rc__ z*+Vy{OY=Ut1t?3b|4`F~!)vz~;8XP6hMq+eRR1aXD(06p?d;?cs-f9zGUOR8t|4rj!qdOHD{24CrwOyMu%q2ZMCEV z6!vK3#u${1|6AAcg(6Vt7v!a-bo;)JG|J6c*rveo7x1JH+LU4NQiY!Xk%L=Suee3~ zBm~dWu86AkyLBiUfqsxRa@c>*+*IHx1n??^TYq(Aq!jnyf+{o2=123@q5HX5*>MwK zn{BR?t-|&x!V{qnP!Mg~aL8U3^9Zz88~Ww$d?TIqF>sDM9zpxVD}R0tL;)X?^EmUB zS*`F0)v3xe5IkHPIP3qNFnfj64G24e-aN}#QSURY6^~U$QhZbl;-%$9ywsMXgJOe( 
z@P%-{;RINWg*=M8-9kcugIt6qTkd1-e4+BLakZG-Xr#Y<7?>$sjfs|{E3D$JnQp)a zUNVkT%;$u!MD??`aE#8_yzG6_@_-K9+?A4Ig=h|!3Fu`L-ZaUz^N=%m@^*KnM(Z^& zZ>n-g>4ABU&PzN)Fv)0_mPOs=TdGM&PsvcnR?g#IPDH^Gm05%9UUgHbu9DL-4;lz_ zDM}Eg5-JN&;e5)zf3mPIr$YYEv#6y4BBNBKLB>D`{B1eTx1$>oG#tcA$wq5k&UupI8V<_E2vOjU6Q z+>si$p84=FXDQoNk=J!u;mX$q5f>P^s7jjK9*jLi7^T_0#DcDCDI>>kMpbUn}78^8eIIw!tMhq=I% z)}YMhD6MITJY8aMPnb3D&sWRhDKh-WCRjoH;%0EaN#2S!sst|xa!hZy{J_hPyuI&<`o22^Ir=ci%6q8vU3aBLk#p=^%z75UGXbSo3_bXt)9^t+OAdu^*AwSVOL-3heR@l?-%a78erg9mMU;q68z+B{wXyWoP{E&A_=S z>NsSt^{V$yL&q9*6?C---VyCsd2SCwAD;?nt3lwB? zUKyfEMqelXu9)~hobLM{-BNr$1(%|<#_pD1l6o*%7jDSnP6ppMDL`N-Sp?T6BUibUkHxY*aocLc;Q#Ks2B3W z_Ud$zlP2)-S2-33?C37Dk;t#>O=P7jbC06pUfR%8qT1$z&s`9~AYQlhXT0&{u&w{R zK@xVN5xDwNXBI8A1u5_%Ho#x@5<8D9u~fe`N$O?vB5tQCfU1JE22)4 zoXMa~|J(Lu*@!D#B)~cMVqLlF{g3*0Bnx%~u#-3W^yUQ%iZs3H`A5=-9TcsB`*W_~ zulcHdMtTg!wjs!0QRG0st(0XKC^x}HV#D?KS3{Y!uuzJbjS zeI%#&a|j)VPDt$0rYtpCuQe)`{(jRdl-6?Idrb%+hRTgMY*4E^oI06VIK8Nh)k!PW zy_rIm~|J+)4fm_>y3EPl!pdYEFrsR+w}lY@(M^ z!W;bS19$9@$5x@GIf3Tb2-wjuRLjJKU)e z+jr^H_){DibR-WNFV90m=}>ergr2}lMwOIrSlV(2LVS#T>jR8u>L*NRcF+%UlYO_q)v`t?H-W$?0%-(mj_Cb`%rNy zg^8Duya?CP!#XAPw>5t;Qh-!Jjdt77a}a5a7&qrTkC)l5!0Jq-OfmqPl0{{BX{*3I z7Nq0~5vJJ+#D?p`9n|xUn=1JTZ^7LkH^28oc~{=MH6ZY9R{7lR1k$QM&f>%U7K;=f z>MdW#hyn_uZ!qd)vaIKQx`Ek5E)%$MDc=!@&2=yRE1 z&{A9B*H9<4&D!i%5wjTPBP~ok{@wrhV6loYgKEWh&oN^|aX9K6{pOpyK}C2WT4-+~*Dz%3ZT8Xt z=jQ|i>2^|BoYN6QPPDrRp48_Y#GoStI79LcivhidEbwL=CtivX%dNmuS=(36LclYx zrkWWzVZSPXC3lY+$rWrl@EM|IcuVj9$o*bHv!Pb2F*sTlF_L14`waZmn69N36z|$o_&@X2yI$MEp zwsqLrl&vQno>&~&mu~|aeoZt06M-TQM`R~M(YZ`0ggUI+88*6Sfun zuKqMZ*A(bN0*zy5HN4I9W<4q7(jgt$q;mx9NE~=9rt6&L{--izl1LFOtuNtlv@rK@|C-=d^oR~x^_^b*iZ(d{eR0Mhly$>Nr-*dr6 zK3Jvs$a@U1Gg+Jd?jSzL9#Zq@%RW;ielJpt>WcBZ`r0Uog%fFb_W%E`%>1W^XS&jL zNRRn}0GE3Wc&BZB4;0>(RRHrK?|-}1h(;D3ha*W(tMn+$Z)1nthjRgd5isk`!U*I= zUZzxu8F*v-<@RhBwe=RND#!k%nwSs_OJrPJqcT1gnE5G=5;|?wsLEcn@on&=Are)M zaxzJ7P@Mg$51BXt&hQXzv$sI*_ld&?JOvK~J z`ba8pM*9TwcaYa%GI9fPr&l{NowE6YhCKMBDC6doPi zz-1u9LwUu?! 
z4B2fAcCK)0bhMmca;H*$w0OV|ehb5t?on^cLBkhGl5*LNsz!KkXr~=NMqTHh{yRt| zNp}eTvQkAwLm?#s7uY5|{@N>Xy&aXafB1k00AT|h10EG7oSZ?&u|?y*;xR=?hfck% zbJn4G+vgc5q$Ouf~Sn@-eC*(UQpMZ!h4jY?PH&p!D?msz|=C4MV{mp%VV z9Vm_JZGj1JQZOlTIZwu6hbdGW6ICDv0=%NSBlaq?=fKpB81Ggyy$o5S*TT|*+jM1< z!XGkR?Sjuk+o#U}fu30cR^$2tDeFHgxuHfSjVO4Uw-U;{Aa3%bUWbzakvnG6KJAuV z63C`{j)t>^mn*D0!Ki_~)tf+6cYa?sLTTikUKm@2qX#%L^D~;25`Z@A{Q3oGaYc)z zD~W|yvo+(AtS9^IYvBJY?5pFN{=T@kIY2;aC`dR$LV+)!h=?>y86_bWDFPxOEiEuY z8Yw9WVIoojN_T^xgfbdwq?^(4+zo%v^XK#VW7}){#JTsLd+#}C=e*xZt;qKZ)yNbz z)^(Sa*RugX`qgA7_j+1)pRb2_x!AstKGq7#^TO&Xen8X5Vf3i)JQDuUO-J$9tRsQZ zJaoS^N3tnO_xpDI2B3IHq29doFH}jC<))BsTA`-Lb6``Ryt@%_Th6Q7`l6+-Y#7t( zGY16P4pb%XluSsuk&EUAg!H=mQfaPs>r65KupYYj%S`f4!-N{?Q&?Lc>#~Qsbi;E` zC;DHc798HwkiKL;hQJO{hyO5(Qr^`|Ek_Me+CNSN%)*eE=qm~l!;H{vo!n7*_wvUu`vM)IbPmwwL+tHXP#M1EnIQY;Ri9qiQz*Vr%O{iT+|1*w@qlQ9na`<}t0S9b`e2 zg&kcmNGNhd;i?`)qlq=_9z`-iPL~AJ5T^NZ_~-G(Oh5hC3!VhEsG!1{3m;z2>O8lw zH`4l-YpauvJDlf2tVDnt%_CO<|3)3_946f6qOrqh>jT%bJmJ~G3Y$SoDg~c~=|ras zA6X|bk1y)!<&0~)A3T<4uURtup_%|;%}EH|=!yg!Cg#%V@$5gfu|N8)a#d76{rEZm zb^3{7LjEgpt@AuKZvFj_dU(e?p^yQYvdCJeh$<1T1=S4UL(hn;{yLW>la+fT*Xp(p z{}TM^>2I$`P3 z^r*H`!5Xo~osnS;nI&G)-l>ljNF~abE4Bfq9ciPjQklkcnjv3tSF2tC%+%?G{^!EF zCckYxdi)nJV=v(c`NL?88HrRqd~GSrG)B%TzZHbNlGA22lH%^aW#M+R-`lTz^IS>h zk#P$zeZRM-OO-Unxbk&^nz{qsVtr8+e@D=X-G6>gC*M;U-`%lEL_$^vqlLm}@D~68 zOWU)tE>)VdSBoiebBKE71Qwk^cyyn~nq&yj!xzTdF{of*1KvE)plVn(7V+x=Mm|l<;gea1dDueP zdjH$!o2sdKtR9pLo7KU7dWo+XseTSon(@0*^@~}TNpoU(@KDyAM9%dOEjN;*#moA5 z9)iEC{5QC4nO3Wdcun2nuPIg%vKBwATynJiq$a)vNqRBay}spab}Ke0k3?(MwZg#2 z?bjJOpSB=mq|p$$R-mc;)z4zufQ+p?<@G_Y8AQO1!)Q;>V^N3me&%Mni};h+bro+C z+-iOs#aF_0f`+ zN`pQ}*K{AzHQT@6@DrW9^Pu1~)=)m|E_bdiSfv;&xE_6xoUUe=V}hn~_IG3$jNq zC{AeJ>~usNU7P-N{Mr6>Jj+>DF2jH=r_Eqv^LAFv@b}25FiPp0JFdlT0mnxp;YVzO z54Ppc(Z@?;Lr5Ejac?=^FD8^g++*o9&*&_@fyeuYDhLd6!u{T*m?z(eoKuS)o>A0& zUf1FJvm~@=?4~rrk4e(ZP%JzGXENw_V~e$&XXfRkUS71)D^uqwa-4V@WV zY0attP*wYc5s51N#>Vl8pRSx(npwa>+4{%;3%Q0=2yguf021Resmg1Lrml@{d_x%5 
zn~xcz&?p|ywc7q?=gzMWQ>Q{5)?9MHAl+}A0;AdPSupoAQW9zQbQPO-+p-|KP z_WjX*@9;7y`;7T@M%o)AX3-1=Zau#I%dgS*GAPnFs5A#XUHXq5dE_=F=9j6Y0uGn- zTZ(#r>{luNZ9ATw7d_Y2+abhLFD4zOX9f+t+-M15|MhVZCF@Bc+S(1@cj)w_{Y>c% zHF?Ex_qCU_*M8U?m%&?n!U{L4l0%mE-iOcmAc?CjLX{ItOU7N`Cd9C$ChV=q1HLN| z$?(mo?1{)-fB7TSO}N=@%7mVCR@`(8a#%c~gH2KJ+)q>RXqaV-he5A*4gUBdF_}>+ zwp6*>Jy)Jyr;YxjCpW~gA-_Z0A#{Ely8^xSHU2{_d3!+j!S+nsSr4FDO!aH`(-duldig zOepV;tp`5Ww&t?#x)DV_;5hM}n$`)-ya(0iB;DNsykv;!lS`Jci$H4F?*8D(QJfS=U^D_Jc`>#!p zKNVT=p{5q+LLQ!7(61Br`WUfKvLIm-xh!tN#MF*w9RmC9?!iK{$ny4k`=k;3!c9F> zu4r3Mp-3X*udW|4-za+s^gFWN(}jx@``fekSl9{D9ak^uFqq*Pxl6O%SXo{kes@Nn z^QL0=u_Gr(Z>{GB{=b|d__5|~sH5!{HS445HGfqSEP1DjjD0b$?S(C@rj$9veBYp* zK}u8x`ca&nv_&KGB15-!zJWz`LDJC?dn8wrSb&(}zGCbr@eHz;2sDotq+gXt-fbmH zOqS17J_Dyprb+qp=Yll(Z{n;MFq+&l8vNP!=dDQ%QeMW}%YsxheKo1NS zZVzMS%M6AEodW9xo{e-j##H=~62|v?yQuBGZo5NPcvlhSUd&Sk=JIPT&bD; z8cibS-jv!WAlB$-^rgauPKHK1aMHiKLbmvu+(hn}hWvz$*Cq(qFey%+i~}M(2X!`D zv%$(@%MXpxzdngT*KXpR=^=@LYNX*8cRU=TH|mNHtECyQr5-oy=BB!zFIRAMjl#dE za9S21L^~Gq0wlZt2&!9jDINtx!)BL3?y=b}F(Ra7afl#*Sm z^>+wTv0%(zfj8|K%|?6}OI~=O)AxLX>MwtL=IeZ1e57Tb5Q<={es@Ytqh>BpN0tg~ zwsjqQtv@&(rgaxegel>%duMvwt~PGb3!q<7o#Wq1o0PtGPB)H^G%jrkke_!y`uw}}o)WD8K2#YNDcbr|BB+nbO!u3sGDM`xM&jAN6 z_GRI`V%zJ>6hCH##`HK|6Gm01Rc;tc2s1sj{q<4^P!@I_$f2~`^D9L{FVR|V#c3?> zOq|d`Wd&2`gG}aJr^@j);;Nw$Im83sRZ+_>WW91`%z{|GSQL5c^X3bz-njSK( zS#ZqsbLk;3(dpMK0+u5(4m=_BKd0}=fh%ow#%=H@-bHM_2TnM_} zbYqbHQg`)UAf%h?>N$l$fFzJ0MP;+3&kM7i4*BdgSQbNzQO#ZDM5D<);yyt`t0M|j zecpi-c(yxd6;Fv(N1JFcrey-E4h;-CA_mL+N zmePuF;u@wCaAw@se=&);mLRn@HF_ z{#@=?XFJL78BY8GO+`EsHgREv&@0enJWBw0&b--MGs#A00`5wU)TULs!@OzQmqaqt zd#X2-%HdtumiDh=pg4+7)8XI7b@L(H@(pB|Zf0~~i5??8TH`~Z;<|*%!pTLU4LdgO)q5?*oyG%>9M_i$0|x3s z5Zmc?Hlb;5524?s?rnT>%Cb9{acj8mRdi}c(+|)S^x)xytZ2=pnf!<4;F_G4pumUI z3#*;4=)}XtvIC^-@^)|@KU@R2I=Px6F#_*J<`7@uKO7@WDE!ZV+ymt1zYue2b);ctd?ZR^S%hj(@e`>Jyop)5Eh}&1D`01kf zL=bmKsm2?Dp{lY-3anTyjFT9oB!?swOw_Q?PYuV!$Q;gHFJ~8~Sfw8<4_j$73x`7b zyK1QV<@w4}S>CUlrCpuj;qOa1xPpJZb+E*N5xJ@@XX?HnS>uY4P&a%xl6}oMb9&?| 
zX1WSpua-9Kx;{Y{=f%LY!vG=uTVAlQ6jk=@`uXnRw;1Pi@#Bwnt0X zrtvW~8b`FQgL5w{>9X+1wx{1qCXM=vXe2QU-Zx5f9XHROqh;w6G^>Uu9$js={{yV( z$tn}V%7-fQCJxFA3BXI}esz`L}g6 zz^yL6Brga}dc57T4l!2$#GwuRxoIz)rD>*XQFsAaHl(S(W9H1Gv7a0Uu2QwL5!^sQ z%dTqsSo-X4g1_?Gx3J}YZU&>w6IQ|EIOkek?~gGCaX3$PD=%utd%}jiL{{=4R^nh; z#UZtgCje8$J)b+XtbBI3yEd1jRhI5fvC(OM7nO=&URg%eNyf-7KyUdfC#4HLwOXh@x+Q3yGq#R#Wlem%GYx@e=d6JowNB|P}TS?AnHZba5 zWs~AFHq$Qvzk(ar%o?rCNl0nNhJnmOFhtt`RBxb9?Kmm#k`mq@7@er$na#M$Ka}L+ zN?fLcbF0lOuTtF>O%RF3iFVg; zy__g{y5OW98dVgMG?!uVyMK!n1`#UD5BFL1PBQ91DnGp_`B$EMMWpSci@nimAAgzW z#s$duV7}MeBNpcZXOWor5dZH#!W;9*qJ?~2V^P`};V-+c27`3Z^f2EOGQ_Qti^z4@ z!sGXtgE;Wu7qi55vglIAKqnkgR~`|VDQ{Tyy3pm5V1%4{`efReWY#yZi7?peWRK zJ>!R*CN63b^JXxB$2 z%y}WwS~G(?tV3Q|WM6k2#`pJG?56t0U~B4-Lk{iyxXXXGXXsUf1OgaG%NStpe`gi; zUKcon`9?<b1euY&;P0DO;TacJ1 z_9}hh-QKp_-KVJy<&J&4s+?OEUI`Y456!sh7I$-C- zhW3E$%C@ttS^p4zZCIIDqfJJmWH4bfjaeEzn#iaM~zG! zpm04f@GJx=Cn|uT`(kzc`z{dEN}Vg_o)WjyA@1Z2pR#bWN}x9;X%>Wt-{;%tXr!M> zeX+tD?%Pk4y4bgI&?;^)7AJ>7ZG*hg%zEw;=Rj#ri-E&WVN4BzIN$f5uRQDAcd0*d zfXwDlt1({ZwTNXAfK<`o|A>_4e#!i%YBQwD3X?|B-hJu%wTok!DC0s4kTF3_qczw2 zHi%!Q0EF%fx{!+)DZ`)_&f<_**F@notkPN%D7Et4Kd1KCT1bFia32pkmR6SS!WQC! 
zJyW^m*P9WW`mHF!&PAtL9gi9k^z4}V(61*A22u zl5Qh5Vv%B95Wq zUb~4+QQW-H-Zl`OaIBqYH&ci!c`SwJpkDXEx^3*=hP~Rkl23icc zr#ao_qy|&&Uf9G+(xKv(IG!N`}^GUc>i+Q@=-a@0@ePv1?f^9CilESn6St9 z>+HgCAAad6n^*#IHRjAdA5b_fJw?S2mW94q<6@4GAtz|>(}iLdQaY;;Ds^_KoH_?( z&3TK*mA3QPBTENT^6jo({`<2%XQ-`+@@aE}9`1toc7@rI6Fj8~>%fi#7IWyhLMCa` zlblG}Uwyj^+bv*Q#<9u`PK?D?ZC!fg{-+J_yZ*q!5wz0%FIGN|pEPYg#*EPu8~o)N z!d!w%!T<>zVjF5vfss3vM(5qm#z-;~rHU4^8T0cS*d@Ujce1>Jch@76N>{gRZMp4i zAH+*ih-11{z0Y4BSAdwteHI&FAs8lsnS838R=hM8J{x=^iez5t2)f8v8b{L|U#G_J zk7c>TM|~=UjryJoql)xP(gDldmWgm_*0rx#glgQ}!0a5q%pivKK%y zH6y-$nvSk_JV#eIS{A1205|n__qADQ+j?eW>^WKqq)scu)qh4uHUYe=-!uLzaU()IpbTvld> zX6;4TySUOwkBv1>^x}u=2i#^l#&k?Y#Ma{=><_T2f6h0Jx>FOx9cAFTQI-9S z*i|J+&<@b;t4(LK*SozdI3=w}@s~UDrNF*vF?8dx{L#~S4%Zp?{naAzs4;qj1usx( z_|gbQF3c7Aj65cyA0;g6(9lGOWigK%zhiYeqQ(gG(%q*7`UKpYGlk>ZMa5SF1C{sp z`;3M;aCgzSe)t7n%ZH^Z%x(fyadD&fLD~%hOWx-Vq2Fpu1}j>BC4Uy;3-%+9-2?T@tt zG3^c8B207;P^jeXj*c()*r9t0L4uX@=(^~IlZKhWdQbhYlWWcGlN;|-zcQJ-2*`5+-e?nrTMWe0`!c@F2_Aqk!aaumB@JRECTcs&16jf_MqHg`_d2Pi6Rl>eZbB+{(*urn$9OCwf z=WFh~y7KGSjnXUbAo$gw`#H+ShIPENXO`bew_Xy>!Mmd=>OTgn%gv@Bs@V&v|Me)p zQ_-xZ4@Oe0gUoLx6g|;|uH)80=Z?7KMJY0f3-ZHkMF>3_qXl!Xa*9m41f2{?98S0jz+%VClA=_=peFlycXFT<)<5t zC^2XOEWx{y!(*)?An}#}GM3F(q{anMga>JzIK(^xj10K)7lIeJLq?NJ6bPk7oY_Q!@^^EzR`3+&_su z`@-X2izAEGYrSwzXNMI?s~xcA;jb7n3wX666T1v?4~keuzb7!xBO*X4De;NejenAI zS{TsO*~M$LZOc-uTv)TZz4i`||DsTJX!T9(_m_PH8e$u@t8!NMc}fDWSP|LqhwAO? 
zfYM^Ir14a8)>%TB1|0l_tJOo`=}lqQ83^9`4|2iHIsBM>o8gXd?F%6$@rLfSr|LsN zv`#@M&-JWUQ;^$cS=$LXSDTfOD%VtaoRM9wjtDi`I5Fp!)1Ef0+O{9&ApEC7_uwtj z1LovxZ$_ev0rpC|+mZ|3A3EaMhlb5pJx#9&fIL@&6CmM%RdNe#XM}_4nzwdhkL}Q9 za?v{x&#Ov9ZvRziw=h9sud%B)%4j*r>dTJjCd?A5^HC^M!X`*K%=K&P;aX}*OZuD% z3J3_hq~phxPjvS|9jid>&RJ+C5-o^vz3)wwD^hCvv;(l`cp@xjVAub!D7IR%GaPQr zQP_?uSu?1xnI-&TcWVCYSrce|wNjGi- zC{{)a0ni=HZdziP5jX`S1jg|ag#qrjuAJ#jXAG7jiSpB3IQ1t%M*o6(f+PruAKy3b zCjmKxyB&V?dv{i;J&ljc<}FV5yT*T7GjvVHE4l3>gE+&YK({`jtEuArHdlyk?{ZhM8J$;=Sp~N2WDDkk#kO^A~FrVJ{)DZ;P zHyB2nszIl->7fotU`5KGEh;}8v=kC8;wRocbMgO9fKC*Q2BS)oP6-hlpd_|4@b)w? z!ieVuI{jqQ4o`<-kPxBAr{CPZgT5NlPkh?H7i@CF#M0@Hr>|H3C*!{p+MoXGqkkt| z|KGz;e+RmA+U9>h|G&%kPfE?mU=VQl9DX5toOT0zz8C~2d;zILa}nPmcR8WN55fp? x;yWf1efmQvig+pHDHr1Oi_4Lx@BjZ>kQ)sexUc@jBoHo8x^oYeC;!yv{{V*=21x(_ literal 0 HcmV?d00001 diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 1054f7626..c49b49c73 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -40,7 +40,11 @@ "label": "Resources", "items": [ { "text": "Project Templates", "url": "https://github.com/explosion/projects" }, - { "text": "v2.x Documentation", "url": "https://v2.spacy.io" } + { "text": "v2.x Documentation", "url": "https://v2.spacy.io" }, + { + "text": "Custom Solutions", + "url": "https://explosion.ai/spacy-tailored-pipelines" + } ] } ] diff --git a/website/meta/site.json b/website/meta/site.json index 169680f86..9ecaef74c 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -48,7 +48,11 @@ { "text": "Usage", "url": "/usage" }, { "text": "Models", "url": "/models" }, { "text": "API Reference", "url": "/api" }, - { "text": "Online Course", "url": "https://course.spacy.io" } + { "text": "Online Course", "url": "https://course.spacy.io" }, + { + "text": "Custom Solutions", + "url": "https://explosion.ai/spacy-tailored-pipelines" + } ] }, { diff --git a/website/src/components/list.js b/website/src/components/list.js index e0a3d9b64..d31617487 100644 --- 
a/website/src/components/list.js +++ b/website/src/components/list.js @@ -6,11 +6,14 @@ import { replaceEmoji } from './icon' export const Ol = props =>

    export const Ul = props =>
      -export const Li = ({ children, ...props }) => { +export const Li = ({ children, emoji, ...props }) => { const { hasIcon, content } = replaceEmoji(children) - const liClassNames = classNames(classes.li, { [classes.liIcon]: hasIcon }) + const liClassNames = classNames(classes.li, { + [classes.liIcon]: hasIcon, + [classes.emoji]: emoji, + }) return ( -
    • +
    • {content}
    • ) diff --git a/website/src/styles/list.module.sass b/website/src/styles/list.module.sass index 588b30ba0..1a352d9dd 100644 --- a/website/src/styles/list.module.sass +++ b/website/src/styles/list.module.sass @@ -36,6 +36,16 @@ box-sizing: content-box vertical-align: top +.emoji:before + content: attr(data-emoji) + padding-right: 0.75em + padding-top: 0 + margin-left: -2.5em + width: 1.75em + text-align: right + font-size: 1em + position: static + .li-icon text-indent: calc(-20px - 0.55em) diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 74607fd09..b7ae35f6e 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -15,9 +15,9 @@ import { } from '../components/landing' import { H2 } from '../components/typography' import { InlineCode } from '../components/code' +import { Ul, Li } from '../components/list' import Button from '../components/button' import Link from '../components/link' -import { YouTube } from '../components/embed' import QuickstartTraining from './quickstart-training' import Project from './project' @@ -25,6 +25,7 @@ import Features from './features' import courseImage from '../../docs/images/course.jpg' import prodigyImage from '../../docs/images/prodigy_overview.jpg' import projectsImage from '../../docs/images/projects.png' +import tailoredPipelinesImage from '../../docs/images/spacy-tailored-pipelines_wide.png' import Benchmarks from 'usage/_benchmarks-models.md' @@ -104,23 +105,45 @@ const Landing = ({ data }) => { - spaCy v3.0 features all new transformer-based pipelines that - bring spaCy's accuracy right up to the current state-of-the-art - . You can use any pretrained transformer to train your own pipelines, and even - share one transformer between multiple components with{' '} - multi-task learning. Training is now fully configurable and - extensible, and you can define your own custom models using{' '} - PyTorch, TensorFlow and other frameworks. 
The - new spaCy projects system lets you describe whole{' '} - end-to-end workflows in a single file, giving you an easy path - from prototype to production, and making it easy to clone and adapt - best-practice projects for your own use cases. + + spaCy Tailored Pipelines + + + Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's + core developers. + +
      +
      +
        +
      • + Streamlined. Nobody knows spaCy better than we do. Send + us your pipeline requirements and we'll be ready to start producing your + solution in no time at all. +
      • +
      • + Production ready. spaCy pipelines are robust and easy + to deploy. You'll get a complete spaCy project folder which is ready to{' '} + spacy project run. +
      • +
      • + Predictable. You'll know exactly what you're going to + get and what it's going to cost. We quote fees up-front, let you try + before you buy, and don't charge for over-runs at our end — all the risk + is on us. +
      • +
      • + Maintainable. spaCy is an industry standard, and we'll + deliver your pipeline with full code, data, tests and documentation, so + your team can retrain, update and extend the solution as your + requirements change. +
      • +
      { - - + + spaCy v3.0 features all new transformer-based pipelines that + bring spaCy's accuracy right up to the current state-of-the-art + . You can use any pretrained transformer to train your own pipelines, and even + share one transformer between multiple components with{' '} + multi-task learning. Training is now fully configurable and + extensible, and you can define your own custom models using{' '} + PyTorch, TensorFlow and other frameworks. Date: Tue, 8 Feb 2022 13:37:27 +0100 Subject: [PATCH 047/177] fix(phrasematcher.pyi): change type annotation of `docs` in `add()` to `List[Doc]` (#10235) https://github.com/explosion/spaCy/issues/10234 --- spacy/matcher/phrasematcher.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi index 82a194835..68e3386e4 100644 --- a/spacy/matcher/phrasematcher.pyi +++ b/spacy/matcher/phrasematcher.pyi @@ -14,7 +14,7 @@ class PhraseMatcher: def add( self, key: str, - docs: List[List[Dict[str, Any]]], + docs: List[Doc], *, on_match: Optional[ Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any] From 10c77af83d700c78aaf08be793e78b5d79f8550a Mon Sep 17 00:00:00 2001 From: John Boy <2187261+jboynyc@users.noreply.github.com> Date: Wed, 9 Feb 2022 06:04:26 +0000 Subject: [PATCH 048/177] add textnets to spaCy universe (#10216) https://github.com/jboynyc/textnets/issues/38 --- website/meta/universe.json | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 1a67de67b..4ded8880f 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3769,6 +3769,29 @@ }, "category": ["pipeline"], "tags": ["pipeline", "nlp", "sentiment"] + }, + { + "id": "textnets", + "slogan": "Text analysis with networks", + "description": "textnets represents collections of texts as networks of documents and words. 
This provides novel possibilities for the visualization and analysis of texts.", + "github": "jboynyc/textnets", + "image": "https://user-images.githubusercontent.com/2187261/152641425-6c0fb41c-b8e0-44fb-a52a-7c1ba24eba1e.png", + "code_example": [ + "import textnets as tn", + "", + "corpus = tn.Corpus(tn.examples.moon_landing)", + "t = tn.Textnet(corpus.tokenized(), min_docs=1)", + "t.plot(label_nodes=True,", + " show_clusters=True,", + " scale_nodes_by=\"birank\",", + " scale_edges_by=\"weight\")" + ], + "author": "John Boy", + "author_links": { + "github": "jboynyc", + "twitter": "jboy" + }, + "category": ["visualizers", "standalone"] } ], From 3877f78ff9f406a148e27a16ee60a7778bc5a551 Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Wed, 9 Feb 2022 12:21:20 +0200 Subject: [PATCH 049/177] fix the syntax for the slow/gpu test crons (#10244) --- .github/workflows/gputests.yml | 3 ++- .github/workflows/slowtests.yml | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml index 7c062fe4c..2a5be13ce 100644 --- a/.github/workflows/gputests.yml +++ b/.github/workflows/gputests.yml @@ -1,3 +1,5 @@ +name: Weekly GPU tests + on: schedule: - cron: '0 1 * * MON' @@ -15,5 +17,4 @@ jobs: PIPELINE: explosion-ai/spacy-slow-gpu-tests BRANCH: ${{ matrix.branch }} MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action" - secrets: BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }} diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index 4d4441679..c3a08e5b5 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -1,3 +1,5 @@ +name: Daily slow tests + on: schedule: - cron: '0 0 * * *' @@ -21,12 +23,10 @@ jobs: fi - name: Trigger buildkite build - needs: check_commits - if: needs.check_commits.outputs.run_tests == 'true' + if: steps.check_commits.outputs.run_tests == 'true' 
uses: buildkite/trigger-pipeline-action@v1.2.0 env: PIPELINE: explosion-ai/spacy-slow-tests BRANCH: ${{ matrix.branch }} MESSAGE: ":github: Daily slow tests - triggered from a GitHub Action" - secrets: BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }} From ee662ec38190f2f806ab67309ed9eb160dcaceed Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Thu, 10 Feb 2022 02:15:23 -0500 Subject: [PATCH 050/177] Raise error in spacy package when model name is not a valid python identifier (#10192) * MultiHashEmbed vector docs correction * raise error for invalid identifier as model name * more succinct error message * update success message * permitted package name + double underscore * clarify package name error * clarify underscore run message * tweak language + simplify underscore run * cleanup underscore run warning * spacing correction * Update spacy/tests/test_cli.py Co-authored-by: Adriane Boyd --- spacy/cli/package.py | 37 +++++++++++++++++++++++++++++++++++-- spacy/tests/test_cli.py | 14 +++++++++++++- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index f9d2a9af2..b8c8397b6 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -7,6 +7,7 @@ from collections import defaultdict from catalogue import RegistryError import srsly import sys +import re from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX from ..schemas import validate, ModelMetaSchema @@ -109,6 +110,24 @@ def package( ", ".join(meta["requirements"]), ) if name is not None: + if not name.isidentifier(): + msg.fail( + f"Model name ('{name}') is not a valid module name. " + "This is required so it can be imported as a module.", + "We recommend names that use ASCII A-Z, a-z, _ (underscore), " + "and 0-9. 
" + "For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers", + exits=1, + ) + if not _is_permitted_package_name(name): + msg.fail( + f"Model name ('{name}') is not a permitted package name. " + "This is required to correctly load the model with spacy.load.", + "We recommend names that use ASCII A-Z, a-z, _ (underscore), " + "and 0-9. " + "For specific details see: https://www.python.org/dev/peps/pep-0426/#name", + exits=1, + ) meta["name"] = name if version is not None: meta["version"] = version @@ -162,7 +181,7 @@ def package( imports="\n".join(f"from . import {m}" for m in imports) ) create_file(package_path / "__init__.py", init_py) - msg.good(f"Successfully created package '{model_name_v}'", main_path) + msg.good(f"Successfully created package directory '{model_name_v}'", main_path) if create_sdist: with util.working_dir(main_path): util.run_command([sys.executable, "setup.py", "sdist"], capture=False) @@ -171,8 +190,14 @@ def package( if create_wheel: with util.working_dir(main_path): util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False) - wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}" + wheel_name_squashed = re.sub("_+", "_", model_name_v) + wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}" msg.good(f"Successfully created binary wheel", wheel) + if "__" in model_name: + msg.warn( + f"Model name ('{model_name}') contains a run of underscores. 
" + "Runs of underscores are not significant in installed package names.", + ) def has_wheel() -> bool: @@ -422,6 +447,14 @@ def _format_label_scheme(data: Dict[str, Any]) -> str: return md.text +def _is_permitted_package_name(package_name: str) -> bool: + # regex from: https://www.python.org/dev/peps/pep-0426/#name + permitted_match = re.search( + r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE + ) + return permitted_match is not None + + TEMPLATE_SETUP = """ #!/usr/bin/env python import io diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 9d5bdfab2..9d3f1ee71 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -17,6 +17,7 @@ from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.package import get_third_party_dependencies +from spacy.cli.package import _is_permitted_package_name from spacy.cli.validate import get_model_pkgs from spacy.lang.en import English from spacy.lang.nl import Dutch @@ -695,6 +696,17 @@ def test_get_labels_from_model(factory_name, pipe_name): assert _get_labels_from_model(nlp, factory_name) == set(labels) +def test_permitted_package_names(): + # https://www.python.org/dev/peps/pep-0426/#name + assert _is_permitted_package_name("Meine_Bäume") == False + assert _is_permitted_package_name("_package") == False + assert _is_permitted_package_name("package_") == False + assert _is_permitted_package_name(".package") == False + assert _is_permitted_package_name("package.") == False + assert _is_permitted_package_name("-package") == False + assert _is_permitted_package_name("package-") == False + + def test_debug_data_compile_gold(): nlp = English() pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"]) @@ -707,4 +719,4 @@ def test_debug_data_compile_gold(): ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], 
sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"]) eg = Example(pred, ref) data = _compile_gold([eg], ["ner"], nlp, True) - assert data["boundary_cross_ents"] == 1 + assert data["boundary_cross_ents"] == 1 \ No newline at end of file From 2d6cabb23c1b39995fdd8bdaf78f68ac344f7901 Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Thu, 10 Feb 2022 13:06:30 +0200 Subject: [PATCH 051/177] Fix the date command and the matrix failure mode (#10254) --- .github/workflows/gputests.yml | 1 + .github/workflows/slowtests.yml | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml index 2a5be13ce..14c1552bf 100644 --- a/.github/workflows/gputests.yml +++ b/.github/workflows/gputests.yml @@ -7,6 +7,7 @@ on: jobs: weekly-gputests: strategy: + fail-fast: false matrix: branch: [master, develop, v4] runs-on: ubuntu-latest diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index c3a08e5b5..9490b53bd 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -7,15 +7,18 @@ on: jobs: daily-slowtests: strategy: + fail-fast: false matrix: branch: [master, develop, v4] runs-on: ubuntu-latest steps: + - name: Checkout + uses: actions/checkout@v1 - name: Get commits from past 24 hours id: check_commits run: | today=$(date '+%Y-%m-%d %H:%M:%S') - yesterday=$(date -v-1d '+%Y-%m-%d %H:%M:%S') + yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S') if git log --after=$yesterday --before=$today | grep commit ; then echo "::set-output name=run_tests::true" else From 7961a0a959e6860bc9b2eb9d487a77de84758ae9 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Thu, 10 Feb 2022 13:45:46 +0100 Subject: [PATCH 052/177] Fix typo in errors (#10256) --- spacy/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index 
390612123..b45c4f9db 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -483,7 +483,7 @@ class Errors(metaclass=ErrorsWithCodes): "components, since spans are only views of the Doc. Use Doc and " "Token attributes (or custom extension attributes) only and remove " "the following: {attrs}") - E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. " + E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. " "Only Doc and Token attributes are supported.") E182 = ("Received invalid attribute declaration: {attr}\nDid you forget " "to define the attribute? For example: `{attr}.???`") From bbaf41fb3b1b0123455b93d7b97a9ef5d886f8b1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Feb 2022 11:45:26 +0100 Subject: [PATCH 053/177] Set version to v3.2.2 (#10262) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index c253d5052..d01b278c9 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.2.1" +__version__ = "3.2.2" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 9a06a210ec8ef2a6cd93f4572c3dd18c2532ca71 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Feb 2022 14:22:43 +0100 Subject: [PATCH 054/177] Exclude github workflow edits from CI (#10261) --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 71a793911..8e322f3dd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -17,6 +17,7 @@ pr: - "*.md" - "website/docs/*" - "website/src/*" + - ".github/workflows/*" jobs: # Perform basic checks for most important errors (syntax etc.) 
Uses the config From 5adedb8587818741dcd4ee1364ffb3f7d5074e75 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 11 Feb 2022 14:23:01 +0100 Subject: [PATCH 055/177] Auto-format code with black (#10260) Co-authored-by: explosion-bot --- spacy/tests/test_cli.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 9d3f1ee71..fc35ff86e 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -706,17 +706,27 @@ def test_permitted_package_names(): assert _is_permitted_package_name("-package") == False assert _is_permitted_package_name("package-") == False - + def test_debug_data_compile_gold(): nlp = English() pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"]) - ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"]) + ref = Doc( + nlp.vocab, + words=["Token", ".", "New York City"], + sent_starts=[True, False, True], + ents=["O", "O", "B-ENT"], + ) eg = Example(pred, ref) data = _compile_gold([eg], ["ner"], nlp, True) assert data["boundary_cross_ents"] == 0 pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"]) - ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"]) + ref = Doc( + nlp.vocab, + words=["Token", ".", "New York City"], + sent_starts=[True, False, True], + ents=["O", "B-ENT", "I-ENT"], + ) eg = Example(pred, ref) data = _compile_gold([eg], ["ner"], nlp, True) - assert data["boundary_cross_ents"] == 1 \ No newline at end of file + assert data["boundary_cross_ents"] == 1 From 8818a44a39f6e8f5387680e28984897a60baa830 Mon Sep 17 00:00:00 2001 From: Markus Konrad Date: Mon, 14 Feb 2022 07:16:43 +0100 Subject: [PATCH 056/177] add tmtoolkit package to spaCy universe (#10245) --- website/meta/universe.json | 33 
+++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 4ded8880f..d7eef97e8 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3792,6 +3792,39 @@ "twitter": "jboy" }, "category": ["visualizers", "standalone"] + }, + { + "id": "tmtoolkit", + "slogan": "Text mining and topic modeling toolkit", + "description": "tmtoolkit is a set of tools for text mining and topic modeling with Python developed especially for the use in the social sciences, in journalism or related disciplines. It aims for easy installation, extensive documentation and a clear programming interface while offering good performance on large datasets by the means of vectorized operations (via NumPy) and parallel computation (using Python’s multiprocessing module and the loky package).", + "github": "WZBSocialScienceCenter/tmtoolkit", + "code_example": [ + "from tmtoolkit.corpus import Corpus, tokens_table, lemmatize, to_lowercase, dtm", + "from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table", + "# load built-in sample dataset and use 4 worker processes", + "corp = Corpus.from_builtin_corpus('en-News100', max_workers=4)", + "# investigate corpus as dataframe", + "toktbl = tokens_table(corp)", + "print(toktbl)", + "# apply some text normalization", + "lemmatize(corp)", + "to_lowercase(corp)", + "# build sparse document-token matrix (DTM)", + "# document labels identify rows, vocabulary tokens identify columns", + "mat, doc_labels, vocab = dtm(corp, return_doc_labels=True, return_vocab=True)", + "# apply tf-idf transformation to DTM", + "# operation is applied on sparse matrix and uses few memory", + "tfidf_mat = tfidf(mat)", + "# show top 5 tokens per document ranked by tf-idf", + "top_tokens = sorted_terms_table(tfidf_mat, vocab, doc_labels, top_n=5)", + "print(top_tokens)" + ], + "author": "Markus Konrad / WZB Social Science Center", + "author_links": { + "github": "internaut", + 
"twitter": "_knrd" + }, + "category": ["scientific", "standalone"] } ], From 23bd103d8940c110e2588e7c93f8e33205e1b3be Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 14 Feb 2022 15:17:25 +0900 Subject: [PATCH 057/177] Add tmtoolkit setup steps --- website/meta/universe.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index d7eef97e8..122281583 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3799,6 +3799,9 @@ "description": "tmtoolkit is a set of tools for text mining and topic modeling with Python developed especially for the use in the social sciences, in journalism or related disciplines. It aims for easy installation, extensive documentation and a clear programming interface while offering good performance on large datasets by the means of vectorized operations (via NumPy) and parallel computation (using Python’s multiprocessing module and the loky package).", "github": "WZBSocialScienceCenter/tmtoolkit", "code_example": [ + "# Note: This requires these setup steps:", + "# pip install tmtoolkit[recommended]", + "# python -m tmtoolkit setup en", "from tmtoolkit.corpus import Corpus, tokens_table, lemmatize, to_lowercase, dtm", "from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table", "# load built-in sample dataset and use 4 worker processes", From f6250015ab4693131bde160ba5659151046cdd1d Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Tue, 15 Feb 2022 15:18:36 +0200 Subject: [PATCH 058/177] Fix the datemath for reals (#10294) * add debugging branch and quotes to daily slowtest action * Apparently the quotes fixed it --- .github/workflows/slowtests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index 9490b53bd..3b0f177a7 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -19,7 +19,7 
@@ jobs: run: | today=$(date '+%Y-%m-%d %H:%M:%S') yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S') - if git log --after=$yesterday --before=$today | grep commit ; then + if git log --after="$yesterday" --before="$today" | grep commit ; then echo "::set-output name=run_tests::true" else echo "::set-output name=run_tests::false" From 22066f4e0fd2a0685932b118bbc7501370c17dd9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Feb 2022 13:45:30 +0100 Subject: [PATCH 059/177] Also exclude workflows from non-PR CI runs (#10305) --- azure-pipelines.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 8e322f3dd..4624b2eb2 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -11,8 +11,9 @@ trigger: exclude: - "website/*" - "*.md" + - ".github/workflows/*" pr: - paths: + paths: exclude: - "*.md" - "website/docs/*" From d30ee14ab3959addd726eee4555e5f07fe94f062 Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Wed, 16 Feb 2022 16:39:42 +0200 Subject: [PATCH 060/177] Pass the matrix branch to the checkout action (#10304) --- .github/workflows/slowtests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index 3b0f177a7..74f2b8998 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -14,6 +14,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v1 + with: + ref: ${{ matrix.branch }} - name: Get commits from past 24 hours id: check_commits run: | From fef768ef748d0526c53d147a38243c4dc84e0d28 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 16 Feb 2022 15:43:36 +0100 Subject: [PATCH 061/177] remove develop (not an active branch anymore) --- .github/workflows/slowtests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index 74f2b8998..1a99c751c 
100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -9,7 +9,7 @@ jobs: strategy: fail-fast: false matrix: - branch: [master, develop, v4] + branch: [master, v4] runs-on: ubuntu-latest steps: - name: Checkout From 26eac22d3b46131187c66f4d732603fb54610645 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 16 Feb 2022 15:44:05 +0100 Subject: [PATCH 062/177] remove develop also from GPU tests --- .github/workflows/gputests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml index 14c1552bf..bb7f51d29 100644 --- a/.github/workflows/gputests.yml +++ b/.github/workflows/gputests.yml @@ -9,7 +9,7 @@ jobs: strategy: fail-fast: false matrix: - branch: [master, develop, v4] + branch: [master, v4] runs-on: ubuntu-latest steps: - name: Trigger buildkite build From da7520a83c6ec6ec22f74bcc265b57620f3b64d8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 17 Feb 2022 11:35:34 +0100 Subject: [PATCH 063/177] Delay loading of mecab in Korean tokenizer (#10295) * Delay loading of mecab in Korean tokenizer Delay loading of mecab until the tokenizer is called the first time so that it's possible to initialize a blank `ko` pipeline without having mecab installed, e.g. for use with `spacy init vectors`. * Move mecab import back to __init__ Move mecab import back to __init__ to warn users at the same point as before for missing python dependencies. 
--- spacy/lang/ko/__init__.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 05fc67e79..eb3c2e1f5 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -31,15 +31,24 @@ def create_tokenizer(): class KoreanTokenizer(DummyTokenizer): def __init__(self, vocab: Vocab): self.vocab = vocab - MeCab = try_mecab_import() # type: ignore[func-returns-value] - self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") + self._mecab = try_mecab_import() # type: ignore[func-returns-value] + self._mecab_tokenizer = None + + @property + def mecab_tokenizer(self): + # This is a property so that initializing a pipeline with blank:ko is + # possible without actually requiring mecab-ko, e.g. to run + # `spacy init vectors ko` for a pipeline that will have a different + # tokenizer in the end. The languages need to match for the vectors + # to be imported and there's no way to pass a custom config to + # `init vectors`. 
+ if self._mecab_tokenizer is None: + self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]") + return self._mecab_tokenizer def __reduce__(self): return KoreanTokenizer, (self.vocab,) - def __del__(self): - self.mecab_tokenizer.__del__() - def __call__(self, text: str) -> Doc: dtokens = list(self.detailed_tokens(text)) surfaces = [dt["surface"] for dt in dtokens] @@ -90,7 +99,8 @@ def try_mecab_import() -> None: return MeCab except ImportError: raise ImportError( - "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " + "The Korean tokenizer (\"spacy.ko.KoreanTokenizer\") requires " + "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " "and [natto-py](https://github.com/buruzaemon/natto-py)" ) from None From a9756963e67cff6be5445ae441263f889c629123 Mon Sep 17 00:00:00 2001 From: Grey Murav <65895033+gremur@users.noreply.github.com> Date: Thu, 17 Feb 2022 17:48:50 +0300 Subject: [PATCH 064/177] Extend list of abbreviations for ru language (#10282) * Extend list of abbreviations for ru language Extended list of abbreviations for ru language those may have influence on tokenization. 
* black formatting Co-authored-by: thomashacker --- spacy/lang/ru/tokenizer_exceptions.py | 347 +++++++++++++++++++++++++- 1 file changed, 341 insertions(+), 6 deletions(-) diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py index 1dc363fae..f3756e26c 100644 --- a/spacy/lang/ru/tokenizer_exceptions.py +++ b/spacy/lang/ru/tokenizer_exceptions.py @@ -2,7 +2,6 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, NORM from ...util import update_exc - _exc = {} _abbrev_exc = [ @@ -42,7 +41,6 @@ _abbrev_exc = [ {ORTH: "дек", NORM: "декабрь"}, ] - for abbrev_desc in _abbrev_exc: abbrev = abbrev_desc[ORTH] for orth in (abbrev, abbrev.capitalize(), abbrev.upper()): @@ -50,17 +48,354 @@ for abbrev_desc in _abbrev_exc: _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}] -_slang_exc = [ +for abbr in [ + # Year slang abbreviations {ORTH: "2к15", NORM: "2015"}, {ORTH: "2к16", NORM: "2016"}, {ORTH: "2к17", NORM: "2017"}, {ORTH: "2к18", NORM: "2018"}, {ORTH: "2к19", NORM: "2019"}, {ORTH: "2к20", NORM: "2020"}, -] + {ORTH: "2к21", NORM: "2021"}, + {ORTH: "2к22", NORM: "2022"}, + {ORTH: "2к23", NORM: "2023"}, + {ORTH: "2к24", NORM: "2024"}, + {ORTH: "2к25", NORM: "2025"}, +]: + _exc[abbr[ORTH]] = [abbr] -for slang_desc in _slang_exc: - _exc[slang_desc[ORTH]] = [slang_desc] +for abbr in [ + # Profession and academic titles abbreviations + {ORTH: "ак.", NORM: "академик"}, + {ORTH: "акад.", NORM: "академик"}, + {ORTH: "д-р архитектуры", NORM: "доктор архитектуры"}, + {ORTH: "д-р биол. наук", NORM: "доктор биологических наук"}, + {ORTH: "д-р ветеринар. наук", NORM: "доктор ветеринарных наук"}, + {ORTH: "д-р воен. наук", NORM: "доктор военных наук"}, + {ORTH: "д-р геогр. наук", NORM: "доктор географических наук"}, + {ORTH: "д-р геол.-минерал. наук", NORM: "доктор геолого-минералогических наук"}, + {ORTH: "д-р искусствоведения", NORM: "доктор искусствоведения"}, + {ORTH: "д-р ист. 
наук", NORM: "доктор исторических наук"}, + {ORTH: "д-р культурологии", NORM: "доктор культурологии"}, + {ORTH: "д-р мед. наук", NORM: "доктор медицинских наук"}, + {ORTH: "д-р пед. наук", NORM: "доктор педагогических наук"}, + {ORTH: "д-р полит. наук", NORM: "доктор политических наук"}, + {ORTH: "д-р психол. наук", NORM: "доктор психологических наук"}, + {ORTH: "д-р с.-х. наук", NORM: "доктор сельскохозяйственных наук"}, + {ORTH: "д-р социол. наук", NORM: "доктор социологических наук"}, + {ORTH: "д-р техн. наук", NORM: "доктор технических наук"}, + {ORTH: "д-р фармацевт. наук", NORM: "доктор фармацевтических наук"}, + {ORTH: "д-р физ.-мат. наук", NORM: "доктор физико-математических наук"}, + {ORTH: "д-р филол. наук", NORM: "доктор филологических наук"}, + {ORTH: "д-р филос. наук", NORM: "доктор философских наук"}, + {ORTH: "д-р хим. наук", NORM: "доктор химических наук"}, + {ORTH: "д-р экон. наук", NORM: "доктор экономических наук"}, + {ORTH: "д-р юрид. наук", NORM: "доктор юридических наук"}, + {ORTH: "д-р", NORM: "доктор"}, + {ORTH: "д.б.н.", NORM: "доктор биологических наук"}, + {ORTH: "д.г.-м.н.", NORM: "доктор геолого-минералогических наук"}, + {ORTH: "д.г.н.", NORM: "доктор географических наук"}, + {ORTH: "д.и.н.", NORM: "доктор исторических наук"}, + {ORTH: "д.иск.", NORM: "доктор искусствоведения"}, + {ORTH: "д.м.н.", NORM: "доктор медицинских наук"}, + {ORTH: "д.п.н.", NORM: "доктор психологических наук"}, + {ORTH: "д.пед.н.", NORM: "доктор педагогических наук"}, + {ORTH: "д.полит.н.", NORM: "доктор политических наук"}, + {ORTH: "д.с.-х.н.", NORM: "доктор сельскохозяйственных наук"}, + {ORTH: "д.социол.н.", NORM: "доктор социологических наук"}, + {ORTH: "д.т.н.", NORM: "доктор технических наук"}, + {ORTH: "д.т.н", NORM: "доктор технических наук"}, + {ORTH: "д.ф.-м.н.", NORM: "доктор физико-математических наук"}, + {ORTH: "д.ф.н.", NORM: "доктор филологических наук"}, + {ORTH: "д.филос.н.", NORM: "доктор философских наук"}, + {ORTH: "д.фил.н.", NORM: 
"доктор филологических наук"}, + {ORTH: "д.х.н.", NORM: "доктор химических наук"}, + {ORTH: "д.э.н.", NORM: "доктор экономических наук"}, + {ORTH: "д.э.н", NORM: "доктор экономических наук"}, + {ORTH: "д.ю.н.", NORM: "доктор юридических наук"}, + {ORTH: "доц.", NORM: "доцент"}, + {ORTH: "и.о.", NORM: "исполняющий обязанности"}, + {ORTH: "к.б.н.", NORM: "кандидат биологических наук"}, + {ORTH: "к.воен.н.", NORM: "кандидат военных наук"}, + {ORTH: "к.г.-м.н.", NORM: "кандидат геолого-минералогических наук"}, + {ORTH: "к.г.н.", NORM: "кандидат географических наук"}, + {ORTH: "к.геогр.н", NORM: "кандидат географических наук"}, + {ORTH: "к.геогр.наук", NORM: "кандидат географических наук"}, + {ORTH: "к.и.н.", NORM: "кандидат исторических наук"}, + {ORTH: "к.иск.", NORM: "кандидат искусствоведения"}, + {ORTH: "к.м.н.", NORM: "кандидат медицинских наук"}, + {ORTH: "к.п.н.", NORM: "кандидат психологических наук"}, + {ORTH: "к.псх.н.", NORM: "кандидат психологических наук"}, + {ORTH: "к.пед.н.", NORM: "кандидат педагогических наук"}, + {ORTH: "канд.пед.наук", NORM: "кандидат педагогических наук"}, + {ORTH: "к.полит.н.", NORM: "кандидат политических наук"}, + {ORTH: "к.с.-х.н.", NORM: "кандидат сельскохозяйственных наук"}, + {ORTH: "к.социол.н.", NORM: "кандидат социологических наук"}, + {ORTH: "к.с.н.", NORM: "кандидат социологических наук"}, + {ORTH: "к.т.н.", NORM: "кандидат технических наук"}, + {ORTH: "к.ф.-м.н.", NORM: "кандидат физико-математических наук"}, + {ORTH: "к.ф.н.", NORM: "кандидат филологических наук"}, + {ORTH: "к.фил.н.", NORM: "кандидат филологических наук"}, + {ORTH: "к.филол.н", NORM: "кандидат филологических наук"}, + {ORTH: "к.фарм.наук", NORM: "кандидат фармакологических наук"}, + {ORTH: "к.фарм.н.", NORM: "кандидат фармакологических наук"}, + {ORTH: "к.фарм.н", NORM: "кандидат фармакологических наук"}, + {ORTH: "к.филос.наук", NORM: "кандидат философских наук"}, + {ORTH: "к.филос.н.", NORM: "кандидат философских наук"}, + {ORTH: "к.филос.н", NORM: 
"кандидат философских наук"}, + {ORTH: "к.х.н.", NORM: "кандидат химических наук"}, + {ORTH: "к.х.н", NORM: "кандидат химических наук"}, + {ORTH: "к.э.н.", NORM: "кандидат экономических наук"}, + {ORTH: "к.э.н", NORM: "кандидат экономических наук"}, + {ORTH: "к.ю.н.", NORM: "кандидат юридических наук"}, + {ORTH: "к.ю.н", NORM: "кандидат юридических наук"}, + {ORTH: "канд. архитектуры", NORM: "кандидат архитектуры"}, + {ORTH: "канд. биол. наук", NORM: "кандидат биологических наук"}, + {ORTH: "канд. ветеринар. наук", NORM: "кандидат ветеринарных наук"}, + {ORTH: "канд. воен. наук", NORM: "кандидат военных наук"}, + {ORTH: "канд. геогр. наук", NORM: "кандидат географических наук"}, + {ORTH: "канд. геол.-минерал. наук", NORM: "кандидат геолого-минералогических наук"}, + {ORTH: "канд. искусствоведения", NORM: "кандидат искусствоведения"}, + {ORTH: "канд. ист. наук", NORM: "кандидат исторических наук"}, + {ORTH: "к.ист.н.", NORM: "кандидат исторических наук"}, + {ORTH: "канд. культурологии", NORM: "кандидат культурологии"}, + {ORTH: "канд. мед. наук", NORM: "кандидат медицинских наук"}, + {ORTH: "канд. пед. наук", NORM: "кандидат педагогических наук"}, + {ORTH: "канд. полит. наук", NORM: "кандидат политических наук"}, + {ORTH: "канд. психол. наук", NORM: "кандидат психологических наук"}, + {ORTH: "канд. с.-х. наук", NORM: "кандидат сельскохозяйственных наук"}, + {ORTH: "канд. социол. наук", NORM: "кандидат социологических наук"}, + {ORTH: "к.соц.наук", NORM: "кандидат социологических наук"}, + {ORTH: "к.соц.н.", NORM: "кандидат социологических наук"}, + {ORTH: "к.соц.н", NORM: "кандидат социологических наук"}, + {ORTH: "канд. техн. наук", NORM: "кандидат технических наук"}, + {ORTH: "канд. фармацевт. наук", NORM: "кандидат фармацевтических наук"}, + {ORTH: "канд. физ.-мат. наук", NORM: "кандидат физико-математических наук"}, + {ORTH: "канд. филол. наук", NORM: "кандидат филологических наук"}, + {ORTH: "канд. филос. 
наук", NORM: "кандидат философских наук"}, + {ORTH: "канд. хим. наук", NORM: "кандидат химических наук"}, + {ORTH: "канд. экон. наук", NORM: "кандидат экономических наук"}, + {ORTH: "канд. юрид. наук", NORM: "кандидат юридических наук"}, + {ORTH: "в.н.с.", NORM: "ведущий научный сотрудник"}, + {ORTH: "мл. науч. сотр.", NORM: "младший научный сотрудник"}, + {ORTH: "м.н.с.", NORM: "младший научный сотрудник"}, + {ORTH: "проф.", NORM: "профессор"}, + {ORTH: "профессор.кафедры", NORM: "профессор кафедры"}, + {ORTH: "ст. науч. сотр.", NORM: "старший научный сотрудник"}, + {ORTH: "чл.-к.", NORM: "член корреспондент"}, + {ORTH: "чл.-корр.", NORM: "член-корреспондент"}, + {ORTH: "чл.-кор.", NORM: "член-корреспондент"}, + {ORTH: "дир.", NORM: "директор"}, + {ORTH: "зам. дир.", NORM: "заместитель директора"}, + {ORTH: "зав. каф.", NORM: "заведующий кафедрой"}, + {ORTH: "зав.кафедрой", NORM: "заведующий кафедрой"}, + {ORTH: "зав. кафедрой", NORM: "заведующий кафедрой"}, + {ORTH: "асп.", NORM: "аспирант"}, + {ORTH: "гл. науч. сотр.", NORM: "главный научный сотрудник"}, + {ORTH: "вед. науч. сотр.", NORM: "ведущий научный сотрудник"}, + {ORTH: "науч. 
сотр.", NORM: "научный сотрудник"}, + {ORTH: "к.м.с.", NORM: "кандидат в мастера спорта"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Literary phrases abbreviations + {ORTH: "и т.д.", NORM: "и так далее"}, + {ORTH: "и т.п.", NORM: "и тому подобное"}, + {ORTH: "т.д.", NORM: "так далее"}, + {ORTH: "т.п.", NORM: "тому подобное"}, + {ORTH: "т.е.", NORM: "то есть"}, + {ORTH: "т.к.", NORM: "так как"}, + {ORTH: "в т.ч.", NORM: "в том числе"}, + {ORTH: "и пр.", NORM: "и прочие"}, + {ORTH: "и др.", NORM: "и другие"}, + {ORTH: "т.н.", NORM: "так называемый"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Appeal to a person abbreviations + {ORTH: "г-н", NORM: "господин"}, + {ORTH: "г-да", NORM: "господа"}, + {ORTH: "г-жа", NORM: "госпожа"}, + {ORTH: "тов.", NORM: "товарищ"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Time periods abbreviations + {ORTH: "до н.э.", NORM: "до нашей эры"}, + {ORTH: "по н.в.", NORM: "по настоящее время"}, + {ORTH: "в н.в.", NORM: "в настоящее время"}, + {ORTH: "наст.", NORM: "настоящий"}, + {ORTH: "наст. 
время", NORM: "настоящее время"}, + {ORTH: "г.г.", NORM: "годы"}, + {ORTH: "гг.", NORM: "годы"}, + {ORTH: "т.г.", NORM: "текущий год"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Address forming elements abbreviations + {ORTH: "респ.", NORM: "республика"}, + {ORTH: "обл.", NORM: "область"}, + {ORTH: "г.ф.з.", NORM: "город федерального значения"}, + {ORTH: "а.обл.", NORM: "автономная область"}, + {ORTH: "а.окр.", NORM: "автономный округ"}, + {ORTH: "м.р-н", NORM: "муниципальный район"}, + {ORTH: "г.о.", NORM: "городской округ"}, + {ORTH: "г.п.", NORM: "городское поселение"}, + {ORTH: "с.п.", NORM: "сельское поселение"}, + {ORTH: "вн.р-н", NORM: "внутригородской район"}, + {ORTH: "вн.тер.г.", NORM: "внутригородская территория города"}, + {ORTH: "пос.", NORM: "поселение"}, + {ORTH: "р-н", NORM: "район"}, + {ORTH: "с/с", NORM: "сельсовет"}, + {ORTH: "г.", NORM: "город"}, + {ORTH: "п.г.т.", NORM: "поселок городского типа"}, + {ORTH: "пгт.", NORM: "поселок городского типа"}, + {ORTH: "р.п.", NORM: "рабочий поселок"}, + {ORTH: "рп.", NORM: "рабочий поселок"}, + {ORTH: "кп.", NORM: "курортный поселок"}, + {ORTH: "гп.", NORM: "городской поселок"}, + {ORTH: "п.", NORM: "поселок"}, + {ORTH: "в-ки", NORM: "выселки"}, + {ORTH: "г-к", NORM: "городок"}, + {ORTH: "з-ка", NORM: "заимка"}, + {ORTH: "п-к", NORM: "починок"}, + {ORTH: "киш.", NORM: "кишлак"}, + {ORTH: "п. ст. ", NORM: "поселок станция"}, + {ORTH: "п. ж/д ст. 
", NORM: "поселок при железнодорожной станции"}, + {ORTH: "ж/д бл-ст", NORM: "железнодорожный блокпост"}, + {ORTH: "ж/д б-ка", NORM: "железнодорожная будка"}, + {ORTH: "ж/д в-ка", NORM: "железнодорожная ветка"}, + {ORTH: "ж/д к-ма", NORM: "железнодорожная казарма"}, + {ORTH: "ж/д к-т", NORM: "железнодорожный комбинат"}, + {ORTH: "ж/д пл-ма", NORM: "железнодорожная платформа"}, + {ORTH: "ж/д пл-ка", NORM: "железнодорожная площадка"}, + {ORTH: "ж/д п.п.", NORM: "железнодорожный путевой пост"}, + {ORTH: "ж/д о.п.", NORM: "железнодорожный остановочный пункт"}, + {ORTH: "ж/д рзд.", NORM: "железнодорожный разъезд"}, + {ORTH: "ж/д ст. ", NORM: "железнодорожная станция"}, + {ORTH: "м-ко", NORM: "местечко"}, + {ORTH: "д.", NORM: "деревня"}, + {ORTH: "с.", NORM: "село"}, + {ORTH: "сл.", NORM: "слобода"}, + {ORTH: "ст. ", NORM: "станция"}, + {ORTH: "ст-ца", NORM: "станица"}, + {ORTH: "у.", NORM: "улус"}, + {ORTH: "х.", NORM: "хутор"}, + {ORTH: "рзд.", NORM: "разъезд"}, + {ORTH: "зим.", NORM: "зимовье"}, + {ORTH: "б-г", NORM: "берег"}, + {ORTH: "ж/р", NORM: "жилой район"}, + {ORTH: "кв-л", NORM: "квартал"}, + {ORTH: "мкр.", NORM: "микрорайон"}, + {ORTH: "ост-в", NORM: "остров"}, + {ORTH: "платф.", NORM: "платформа"}, + {ORTH: "п/р", NORM: "промышленный район"}, + {ORTH: "р-н", NORM: "район"}, + {ORTH: "тер.", NORM: "территория"}, + { + ORTH: "тер. СНО", + NORM: "территория садоводческих некоммерческих объединений граждан", + }, + { + ORTH: "тер. ОНО", + NORM: "территория огороднических некоммерческих объединений граждан", + }, + {ORTH: "тер. ДНО", NORM: "территория дачных некоммерческих объединений граждан"}, + {ORTH: "тер. СНТ", NORM: "территория садоводческих некоммерческих товариществ"}, + {ORTH: "тер. ОНТ", NORM: "территория огороднических некоммерческих товариществ"}, + {ORTH: "тер. ДНТ", NORM: "территория дачных некоммерческих товариществ"}, + {ORTH: "тер. СПК", NORM: "территория садоводческих потребительских кооперативов"}, + {ORTH: "тер. 
ОПК", NORM: "территория огороднических потребительских кооперативов"}, + {ORTH: "тер. ДПК", NORM: "территория дачных потребительских кооперативов"}, + {ORTH: "тер. СНП", NORM: "территория садоводческих некоммерческих партнерств"}, + {ORTH: "тер. ОНП", NORM: "территория огороднических некоммерческих партнерств"}, + {ORTH: "тер. ДНП", NORM: "территория дачных некоммерческих партнерств"}, + {ORTH: "тер. ТСН", NORM: "территория товарищества собственников недвижимости"}, + {ORTH: "тер. ГСК", NORM: "территория гаражно-строительного кооператива"}, + {ORTH: "ус.", NORM: "усадьба"}, + {ORTH: "тер.ф.х.", NORM: "территория фермерского хозяйства"}, + {ORTH: "ю.", NORM: "юрты"}, + {ORTH: "ал.", NORM: "аллея"}, + {ORTH: "б-р", NORM: "бульвар"}, + {ORTH: "взв.", NORM: "взвоз"}, + {ORTH: "взд.", NORM: "въезд"}, + {ORTH: "дор.", NORM: "дорога"}, + {ORTH: "ззд.", NORM: "заезд"}, + {ORTH: "км", NORM: "километр"}, + {ORTH: "к-цо", NORM: "кольцо"}, + {ORTH: "лн.", NORM: "линия"}, + {ORTH: "мгстр.", NORM: "магистраль"}, + {ORTH: "наб.", NORM: "набережная"}, + {ORTH: "пер-д", NORM: "переезд"}, + {ORTH: "пер.", NORM: "переулок"}, + {ORTH: "пл-ка", NORM: "площадка"}, + {ORTH: "пл.", NORM: "площадь"}, + {ORTH: "пр-д", NORM: "проезд"}, + {ORTH: "пр-к", NORM: "просек"}, + {ORTH: "пр-ка", NORM: "просека"}, + {ORTH: "пр-лок", NORM: "проселок"}, + {ORTH: "пр-кт", NORM: "проспект"}, + {ORTH: "проул.", NORM: "проулок"}, + {ORTH: "рзд.", NORM: "разъезд"}, + {ORTH: "ряд", NORM: "ряд(ы)"}, + {ORTH: "с-р", NORM: "сквер"}, + {ORTH: "с-к", NORM: "спуск"}, + {ORTH: "сзд.", NORM: "съезд"}, + {ORTH: "туп.", NORM: "тупик"}, + {ORTH: "ул.", NORM: "улица"}, + {ORTH: "ш.", NORM: "шоссе"}, + {ORTH: "влд.", NORM: "владение"}, + {ORTH: "г-ж", NORM: "гараж"}, + {ORTH: "д.", NORM: "дом"}, + {ORTH: "двлд.", NORM: "домовладение"}, + {ORTH: "зд.", NORM: "здание"}, + {ORTH: "з/у", NORM: "земельный участок"}, + {ORTH: "кв.", NORM: "квартира"}, + {ORTH: "ком.", NORM: "комната"}, + {ORTH: "подв.", NORM: "подвал"}, + 
{ORTH: "кот.", NORM: "котельная"}, + {ORTH: "п-б", NORM: "погреб"}, + {ORTH: "к.", NORM: "корпус"}, + {ORTH: "ОНС", NORM: "объект незавершенного строительства"}, + {ORTH: "оф.", NORM: "офис"}, + {ORTH: "пав.", NORM: "павильон"}, + {ORTH: "помещ.", NORM: "помещение"}, + {ORTH: "раб.уч.", NORM: "рабочий участок"}, + {ORTH: "скл.", NORM: "склад"}, + {ORTH: "coop.", NORM: "сооружение"}, + {ORTH: "стр.", NORM: "строение"}, + {ORTH: "торг.зал", NORM: "торговый зал"}, + {ORTH: "а/п", NORM: "аэропорт"}, + {ORTH: "им.", NORM: "имени"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Others abbreviations + {ORTH: "тыс.руб.", NORM: "тысяч рублей"}, + {ORTH: "тыс.", NORM: "тысяч"}, + {ORTH: "руб.", NORM: "рубль"}, + {ORTH: "долл.", NORM: "доллар"}, + {ORTH: "прим.", NORM: "примечание"}, + {ORTH: "прим.ред.", NORM: "примечание редакции"}, + {ORTH: "см. также", NORM: "смотри также"}, + {ORTH: "кв.м.", NORM: "квадрантный метр"}, + {ORTH: "м2", NORM: "квадрантный метр"}, + {ORTH: "б/у", NORM: "бывший в употреблении"}, + {ORTH: "сокр.", NORM: "сокращение"}, + {ORTH: "чел.", NORM: "человек"}, + {ORTH: "б.п.", NORM: "базисный пункт"}, +]: + _exc[abbr[ORTH]] = [abbr] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) From 23f06dc37f8b9e309028d4d8b3ef17f6daaad8e0 Mon Sep 17 00:00:00 2001 From: Grey Murav <65895033+gremur@users.noreply.github.com> Date: Thu, 17 Feb 2022 17:50:08 +0300 Subject: [PATCH 065/177] Extend list of numbers for ru language (#10280) * Extended list of numbers for ru language Extended list of numbers with all forms and cases including short forms, slang variants and roman numerals. * Update lex_attrs.py * Update 'like_num' function with percentages Added support for numbers with percentages like 12%, 1.2% and etc. to the 'like_num' function. 
* black formatting Co-authored-by: thomashacker --- spacy/lang/ru/lex_attrs.py | 153 ++++++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 45 deletions(-) diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py index 7979c7ea6..90802cb9b 100644 --- a/spacy/lang/ru/lex_attrs.py +++ b/spacy/lang/ru/lex_attrs.py @@ -1,56 +1,119 @@ from ...attrs import LIKE_NUM -_num_words = [ - "ноль", - "один", - "два", - "три", - "четыре", - "пять", - "шесть", - "семь", - "восемь", - "девять", - "десять", - "одиннадцать", - "двенадцать", - "тринадцать", - "четырнадцать", - "пятнадцать", - "шестнадцать", - "семнадцать", - "восемнадцать", - "девятнадцать", - "двадцать", - "тридцать", - "сорок", - "пятьдесят", - "шестьдесят", - "семьдесят", - "восемьдесят", - "девяносто", - "сто", - "двести", - "триста", - "четыреста", - "пятьсот", - "шестьсот", - "семьсот", - "восемьсот", - "девятьсот", - "тысяча", - "миллион", - "миллиард", - "триллион", - "квадриллион", - "квинтиллион", -] +_num_words = list( + set( + """ +ноль ноля нолю нолём ноле нулевой нулевого нулевому нулевым нулевом нулевая нулевую нулевое нулевые нулевых нулевыми + +один первого первому единица одного одному первой первом первый первым одним одном во-первых + +два второго второму второй втором вторым двойка двумя двум двух во-вторых двое две двоих оба обе обеим обеими +обеих обоим обоими обоих + +полтора полторы полутора + +три третьего третьему третьем третьим третий тройка трешка трёшка трояк трёха треха тремя трем трех трое троих трёх + +четыре четвертого четвертому четвертом четвертый четвертым четверка четырьмя четырем четырех четверо четырёх четверым +четверых + +пять пятерочка пятерка пятого пятому пятом пятый пятым пятью пяти пятеро пятерых пятерыми + +шесть шестерка шестого шестому шестой шестом шестым шестью шести шестеро шестерых + +семь семерка седьмого седьмому седьмой седьмом седьмым семью семи семеро + +восемь восьмерка восьмого восьмому восемью восьмой восьмом восьмым восеми 
восьмером восьми восьмью + +девять девятого девятому девятка девятом девятый девятым девятью девяти девятером вдевятером девятерых + +десять десятого десятому десятка десятом десятый десятым десятью десяти десятером вдесятером + +одиннадцать одиннадцатого одиннадцатому одиннадцатом одиннадцатый одиннадцатым одиннадцатью одиннадцати + +двенадцать двенадцатого двенадцатому двенадцатом двенадцатый двенадцатым двенадцатью двенадцати + +тринадцать тринадцатого тринадцатому тринадцатом тринадцатый тринадцатым тринадцатью тринадцати + +четырнадцать четырнадцатого четырнадцатому четырнадцатом четырнадцатый четырнадцатым четырнадцатью четырнадцати + +пятнадцать пятнадцатого пятнадцатому пятнадцатом пятнадцатый пятнадцатым пятнадцатью пятнадцати + +шестнадцать шестнадцатого шестнадцатому шестнадцатом шестнадцатый шестнадцатым шестнадцатью шестнадцати + +семнадцать семнадцатого семнадцатому семнадцатом семнадцатый семнадцатым семнадцатью семнадцати + +восемнадцать восемнадцатого восемнадцатому восемнадцатом восемнадцатый восемнадцатым восемнадцатью восемнадцати + +девятнадцать девятнадцатого девятнадцатому девятнадцатом девятнадцатый девятнадцатым девятнадцатью девятнадцати + +двадцать двадцатого двадцатому двадцатом двадцатый двадцатым двадцатью двадцати + +тридцать тридцатого тридцатому тридцатом тридцатый тридцатым тридцатью тридцати + +тридевять + +сорок сорокового сороковому сороковом сороковым сороковой + +пятьдесят пятьдесятого пятьдесятому пятьюдесятью пятьдесятом пятьдесятый пятьдесятым пятидесяти полтинник + +шестьдесят шестьдесятого шестьдесятому шестьюдесятью шестьдесятом шестьдесятый шестьдесятым шестидесятые шестидесяти + +семьдесят семьдесятого семьдесятому семьюдесятью семьдесятом семьдесятый семьдесятым семидесяти + +восемьдесят восемьдесятого восемьдесятому восемьюдесятью восемьдесятом восемьдесятый восемьдесятым восемидесяти +восьмидесяти + +девяносто девяностого девяностому девяностом девяностый девяностым девяноста + +сто сотого сотому сотка сотня сотом 
сотен сотый сотым ста + +двести двумястами двухсотого двухсотому двухсотом двухсотый двухсотым двумстам двухстах двухсот + +триста тремястами трехсотого трехсотому трехсотом трехсотый трехсотым тремстам трехстах трехсот + +четыреста четырехсотого четырехсотому четырьмястами четырехсотом четырехсотый четырехсотым четыремстам четырехстах +четырехсот + +пятьсот пятисотого пятисотому пятьюстами пятисотом пятисотый пятисотым пятистам пятистах пятисот + +шестьсот шестисотого шестисотому шестьюстами шестисотом шестисотый шестисотым шестистам шестистах шестисот + +семьсот семисотого семисотому семьюстами семисотом семисотый семисотым семистам семистах семисот + +восемьсот восемисотого восемисотому восемисотом восемисотый восемисотым восьмистами восьмистам восьмистах восьмисот + +девятьсот девятисотого девятисотому девятьюстами девятисотом девятисотый девятисотым девятистам девятистах девятисот + +тысяча тысячного тысячному тысячном тысячный тысячным тысячам тысячах тысячей тысяч тысячи тыс + +миллион миллионного миллионов миллионному миллионном миллионный миллионным миллионом миллиона миллионе миллиону +миллионов лям млн + +миллиард миллиардного миллиардному миллиардном миллиардный миллиардным миллиардом миллиарда миллиарде миллиарду +миллиардов лярд млрд + +триллион триллионного триллионному триллионном триллионный триллионным триллионом триллиона триллионе триллиону +триллионов трлн + +квадриллион квадриллионного квадриллионному квадриллионный квадриллионным квадриллионом квадриллиона квадриллионе +квадриллиону квадриллионов квадрлн + +квинтиллион квинтиллионного квинтиллионному квинтиллионный квинтиллионным квинтиллионом квинтиллиона квинтиллионе +квинтиллиону квинтиллионов квинтлн + +i ii iii iv vi vii viii ix xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxvii xxix +""".split() + ) +) def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] + if text.endswith("%"): + text = text[:-1] text = text.replace(",", 
"").replace(".", "") if text.isdigit(): return True From aa93b471a1cadb661c063dee4913ad8f2e492d48 Mon Sep 17 00:00:00 2001 From: Grey Murav <65895033+gremur@users.noreply.github.com> Date: Thu, 17 Feb 2022 17:51:15 +0300 Subject: [PATCH 066/177] Extend list of stopwords for ru language (#10313) --- spacy/lang/ru/stop_words.py | 105 ++++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 23 deletions(-) diff --git a/spacy/lang/ru/stop_words.py b/spacy/lang/ru/stop_words.py index 16cb55ef9..d6ea6b42a 100644 --- a/spacy/lang/ru/stop_words.py +++ b/spacy/lang/ru/stop_words.py @@ -1,52 +1,111 @@ STOP_WORDS = set( """ -а +а авось ага агу аж ай али алло ау ах ая -будем будет будете будешь буду будут будучи будь будьте бы был была были было -быть +б будем будет будете будешь буду будут будучи будь будьте бы был была были было +быть бац без безусловно бишь благо благодаря ближайшие близко более больше +будто бывает бывала бывали бываю бывают бытует в вам вами вас весь во вот все всё всего всей всем всём всеми всему всех всею -всея всю вся вы +всея всю вся вы ваш ваша ваше ваши вдали вдобавок вдруг ведь везде вернее +взаимно взаправду видно вишь включая вместо внакладе вначале вне вниз внизу +вновь вовсе возможно воистину вокруг вон вообще вопреки вперекор вплоть +вполне вправду вправе впрочем впрямь вресноту вроде вряд всегда всюду +всякий всякого всякой всячески вчеред -да для до +г го где гораздо гав -его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею +д да для до дабы давайте давно давным даже далее далеко дальше данная +данного данное данной данном данному данные данный данных дану данунах +даром де действительно довольно доколе доколь долго должен должна +должно должны должный дополнительно другая другие другим другими +других другое другой -же +е его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею едва +ежели еле -за +ж же -и из или им ими имъ их +з за затем зато зачем здесь значит зря + +и из или им ими имъ их ибо 
иль имеет имел имела имело именно иметь иначе +иногда иным иными итак ишь + +й к как кем ко когда кого ком кому комья которая которого которое которой котором -которому которою которую которые который которым которыми которых кто +которому которою которую которые который которым которыми которых кто ка кабы +каждая каждое каждые каждый кажется казалась казались казалось казался казаться +какая какие каким какими каков какого какой какому какою касательно кой коли +коль конечно короче кроме кстати ку куда -меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего +л ли либо лишь любая любого любое любой любом любую любыми любых + +м меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего моей моем моём моему моею можем может можете можешь мои мой моим моими моих -мочь мою моя мы +мочь мою моя мы мало меж между менее меньше мимо многие много многого многое +многом многому можно мол му -на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим +н на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим нашими наших нашу не него нее неё ней нем нём нему нет нею ним ними них но +наверняка наверху навряд навыворот над надо назад наиболее наизворот +наизнанку наипаче накануне наконец наоборот наперед наперекор наподобие +например напротив напрямую насилу настоящая настоящее настоящие настоящий +насчет нате находиться начала начале неважно негде недавно недалеко незачем +некем некогда некому некоторая некоторые некоторый некоторых некто некуда +нельзя немногие немногим немного необходимо необходимости необходимые +необходимым неоткуда непрерывно нередко несколько нету неужели нечего +нечем нечему нечто нешто нибудь нигде ниже низко никак никакой никем +никогда никого никому никто никуда ниоткуда нипочем ничего ничем ничему +ничто ну нужная нужно нужного нужные нужный нужных ныне нынешнее нынешней +нынешних нынче о об один одна одни одним одними одних одно одного одной одном одному одною 
-одну он она оне они оно от +одну он она оне они оно от оба общую обычно ого однажды однако ой около оный +оп опять особенно особо особую особые откуда отнелижа отнелиже отовсюду +отсюда оттого оттот оттуда отчего отчему ох очевидно очень ом -по при +п по при паче перед под подавно поди подобная подобно подобного подобные +подобный подобным подобных поелику пожалуй пожалуйста позже поистине +пока покамест поколе поколь покуда покудова помимо понеже поприще пор +пора посему поскольку после посреди посредством потом потому потомушта +похожем почему почти поэтому прежде притом причем про просто прочего +прочее прочему прочими проще прям пусть + +р ради разве ранее рано раньше рядом с сам сама сами самим самими самих само самого самом самому саму свое своё своего своей своем своём своему своею свои свой своим своими своих свою своя -себе себя собой собою +себе себя собой собою самая самое самой самый самых сверх свыше се сего сей +сейчас сие сих сквозь сколько скорее скоро следует слишком смогут сможет +сначала снова со собственно совсем сперва спокону спустя сразу среди сродни +стал стала стали стало стать суть сызнова -та так такая такие таким такими таких такого такое такой таком такому такою -такую те тебе тебя тем теми тех то тобой тобою того той только том томах тому -тот тою ту ты +та то ту ты ти так такая такие таким такими таких такого такое такой таком такому такою +такую те тебе тебя тем теми тех тобой тобою того той только том томах тому +тот тою также таки таков такова там твои твоим твоих твой твоя твоё +теперь тогда тоже тотчас точно туда тут тьфу тая -у уже +у уже увы уж ура ух ую -чего чем чём чему что чтобы +ф фу -эта эти этим этими этих это этого этой этом этому этот этою эту +х ха хе хорошо хотел хотела хотелось хотеть хоть хотя хочешь хочу хуже -я +ч чего чем чём чему что чтобы часто чаще чей через чтоб чуть чхать чьим +чьих чьё чё + +ш ша + +щ ща щас + +ы ых ые ый + +э эта эти этим этими этих это этого этой этом этому этот этою эту эдак эдакий +эй 
эка экий этак этакий эх + +ю + +я явно явных яко якобы якоже """.split() ) From 28ba31e793cf0a59e5ce14bd2e8f5c5d6e785ca2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 17 Feb 2022 15:54:09 +0100 Subject: [PATCH 067/177] Add whitespace and combined augmenters (#10170) Add whitespace augmenter that inserts a single whitespace token into a doc containing annotation used in core trained pipelines. Add a combined augmenter that handles lowercasing, orth variants and whitespace augmentation. --- spacy/tests/training/test_augmenters.py | 87 ++++++++- spacy/training/augment.py | 224 +++++++++++++++++++++--- 2 files changed, 288 insertions(+), 23 deletions(-) diff --git a/spacy/tests/training/test_augmenters.py b/spacy/tests/training/test_augmenters.py index 43a78e4b0..e3639c5da 100644 --- a/spacy/tests/training/test_augmenters.py +++ b/spacy/tests/training/test_augmenters.py @@ -1,9 +1,11 @@ import pytest -from spacy.training import Corpus +from spacy.pipeline._parser_internals.nonproj import contains_cycle +from spacy.training import Corpus, Example from spacy.training.augment import create_orth_variants_augmenter from spacy.training.augment import create_lower_casing_augmenter +from spacy.training.augment import make_whitespace_variant from spacy.lang.en import English -from spacy.tokens import DocBin, Doc +from spacy.tokens import DocBin, Doc, Span from contextlib import contextmanager import random @@ -153,3 +155,84 @@ def test_custom_data_augmentation(nlp, doc): ents = [(e.start, e.end, e.label) for e in doc.ents] assert [(e.start, e.end, e.label) for e in corpus[0].reference.ents] == ents assert [(e.start, e.end, e.label) for e in corpus[1].reference.ents] == ents + + +def test_make_whitespace_variant(nlp): + # fmt: off + text = "They flew to New York City.\nThen they drove to Washington, D.C." 
+ words = ["They", "flew", "to", "New", "York", "City", ".", "\n", "Then", "they", "drove", "to", "Washington", ",", "D.C."] + spaces = [True, True, True, True, True, False, False, False, True, True, True, True, False, True, False] + tags = ["PRP", "VBD", "IN", "NNP", "NNP", "NNP", ".", "_SP", "RB", "PRP", "VBD", "IN", "NNP", ",", "NNP"] + lemmas = ["they", "fly", "to", "New", "York", "City", ".", "\n", "then", "they", "drive", "to", "Washington", ",", "D.C."] + heads = [1, 1, 1, 4, 5, 2, 1, 10, 10, 10, 10, 10, 11, 12, 12] + deps = ["nsubj", "ROOT", "prep", "compound", "compound", "pobj", "punct", "dep", "advmod", "nsubj", "ROOT", "prep", "pobj", "punct", "appos"] + ents = ["O", "O", "O", "B-GPE", "I-GPE", "I-GPE", "O", "O", "O", "O", "O", "O", "B-GPE", "O", "B-GPE"] + # fmt: on + doc = Doc( + nlp.vocab, + words=words, + spaces=spaces, + tags=tags, + lemmas=lemmas, + heads=heads, + deps=deps, + ents=ents, + ) + assert doc.text == text + example = Example(nlp.make_doc(text), doc) + # whitespace is only added internally in entity spans + mod_ex = make_whitespace_variant(nlp, example, " ", 3) + assert mod_ex.reference.ents[0].text == "New York City" + mod_ex = make_whitespace_variant(nlp, example, " ", 4) + assert mod_ex.reference.ents[0].text == "New York City" + mod_ex = make_whitespace_variant(nlp, example, " ", 5) + assert mod_ex.reference.ents[0].text == "New York City" + mod_ex = make_whitespace_variant(nlp, example, " ", 6) + assert mod_ex.reference.ents[0].text == "New York City" + # add a space at every possible position + for i in range(len(doc) + 1): + mod_ex = make_whitespace_variant(nlp, example, " ", i) + assert mod_ex.reference[i].is_space + # adds annotation when the doc contains at least partial annotation + assert [t.tag_ for t in mod_ex.reference] == tags[:i] + ["_SP"] + tags[i:] + assert [t.lemma_ for t in mod_ex.reference] == lemmas[:i] + [" "] + lemmas[i:] + assert [t.dep_ for t in mod_ex.reference] == deps[:i] + ["dep"] + deps[i:] + # does not 
add partial annotation if doc does not contain this feature + assert not mod_ex.reference.has_annotation("POS") + assert not mod_ex.reference.has_annotation("MORPH") + # produces well-formed trees + assert not contains_cycle([t.head.i for t in mod_ex.reference]) + assert len(list(doc.sents)) == 2 + if i == 0: + assert mod_ex.reference[i].head.i == 1 + else: + assert mod_ex.reference[i].head.i == i - 1 + # adding another space also produces well-formed trees + for j in (3, 8, 10): + mod_ex2 = make_whitespace_variant(nlp, mod_ex, "\t\t\n", j) + assert not contains_cycle([t.head.i for t in mod_ex2.reference]) + assert len(list(doc.sents)) == 2 + assert mod_ex2.reference[j].head.i == j - 1 + # entities are well-formed + assert len(doc.ents) == len(mod_ex.reference.ents) + for ent in mod_ex.reference.ents: + assert not ent[0].is_space + assert not ent[-1].is_space + + # no modifications if: + # partial dependencies + example.reference[0].dep_ = "" + mod_ex = make_whitespace_variant(nlp, example, " ", 5) + assert mod_ex.text == example.reference.text + example.reference[0].dep_ = "nsubj" # reset + + # spans + example.reference.spans["spans"] = [example.reference[0:5]] + mod_ex = make_whitespace_variant(nlp, example, " ", 5) + assert mod_ex.text == example.reference.text + del example.reference.spans["spans"] # reset + + # links + example.reference.ents = [Span(doc, 0, 2, label="ENT", kb_id="Q123")] + mod_ex = make_whitespace_variant(nlp, example, " ", 5) + assert mod_ex.text == example.reference.text diff --git a/spacy/training/augment.py b/spacy/training/augment.py index 63b54034c..59a39c7ee 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -1,4 +1,5 @@ from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING +from typing import Optional import random import itertools from functools import partial @@ -11,32 +12,87 @@ if TYPE_CHECKING: from ..language import Language # noqa: F401 -class OrthVariantsSingle(BaseModel): - tags: 
List[StrictStr] - variants: List[StrictStr] +@registry.augmenters("spacy.combined_augmenter.v1") +def create_combined_augmenter( + lower_level: float, + orth_level: float, + orth_variants: Optional[Dict[str, List[Dict]]], + whitespace_level: float, + whitespace_per_token: float, + whitespace_variants: Optional[List[str]], +) -> Callable[["Language", Example], Iterator[Example]]: + """Create a data augmentation callback that uses orth-variant replacement. + The callback can be added to a corpus or other data iterator during training. + + lower_level (float): The percentage of texts that will be lowercased. + orth_level (float): The percentage of texts that will be augmented. + orth_variants (Optional[Dict[str, List[Dict]]]): A dictionary containing the + single and paired orth variants. Typically loaded from a JSON file. + whitespace_level (float): The percentage of texts that will have whitespace + tokens inserted. + whitespace_per_token (float): The number of whitespace tokens to insert in + the modified doc as a percentage of the doc length. + whitespace_variants (Optional[List[str]]): The whitespace token texts. + RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter. 
+ """ + return partial( + combined_augmenter, + lower_level=lower_level, + orth_level=orth_level, + orth_variants=orth_variants, + whitespace_level=whitespace_level, + whitespace_per_token=whitespace_per_token, + whitespace_variants=whitespace_variants, + ) -class OrthVariantsPaired(BaseModel): - tags: List[StrictStr] - variants: List[List[StrictStr]] - - -class OrthVariants(BaseModel): - paired: List[OrthVariantsPaired] = [] - single: List[OrthVariantsSingle] = [] +def combined_augmenter( + nlp: "Language", + example: Example, + *, + lower_level: float = 0.0, + orth_level: float = 0.0, + orth_variants: Optional[Dict[str, List[Dict]]] = None, + whitespace_level: float = 0.0, + whitespace_per_token: float = 0.0, + whitespace_variants: Optional[List[str]] = None, +) -> Iterator[Example]: + if random.random() < lower_level: + example = make_lowercase_variant(nlp, example) + if orth_variants and random.random() < orth_level: + raw_text = example.text + orig_dict = example.to_dict() + variant_text, variant_token_annot = make_orth_variants( + nlp, + raw_text, + orig_dict["token_annotation"], + orth_variants, + lower=False, + ) + orig_dict["token_annotation"] = variant_token_annot + example = example.from_dict(nlp.make_doc(variant_text), orig_dict) + if whitespace_variants and random.random() < whitespace_level: + for _ in range(int(len(example.reference) * whitespace_per_token)): + example = make_whitespace_variant( + nlp, + example, + random.choice(whitespace_variants), + random.randrange(0, len(example.reference)), + ) + yield example @registry.augmenters("spacy.orth_variants.v1") def create_orth_variants_augmenter( - level: float, lower: float, orth_variants: OrthVariants + level: float, lower: float, orth_variants: Dict[str, List[Dict]] ) -> Callable[["Language", Example], Iterator[Example]]: """Create a data augmentation callback that uses orth-variant replacement. The callback can be added to a corpus or other data iterator during training. 
level (float): The percentage of texts that will be augmented. lower (float): The percentage of texts that will be lowercased. - orth_variants (Dict[str, dict]): A dictionary containing the single and - paired orth variants. Typically loaded from a JSON file. + orth_variants (Dict[str, List[Dict]]): A dictionary containing + the single and paired orth variants. Typically loaded from a JSON file. RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter. """ return partial( @@ -67,16 +123,20 @@ def lower_casing_augmenter( if random.random() >= level: yield example else: - example_dict = example.to_dict() - doc = nlp.make_doc(example.text.lower()) - example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in example.reference] - yield example.from_dict(doc, example_dict) + yield make_lowercase_variant(nlp, example) + + +def make_lowercase_variant(nlp: "Language", example: Example): + example_dict = example.to_dict() + doc = nlp.make_doc(example.text.lower()) + example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in example.reference] + return example.from_dict(doc, example_dict) def orth_variants_augmenter( nlp: "Language", example: Example, - orth_variants: Dict, + orth_variants: Dict[str, List[Dict]], *, level: float = 0.0, lower: float = 0.0, @@ -148,10 +208,132 @@ def make_orth_variants( pair_idx = pair.index(words[word_idx]) words[word_idx] = punct_choices[punct_idx][pair_idx] token_dict["ORTH"] = words - # construct modified raw text from words and spaces + raw = construct_modified_raw_text(token_dict) + return raw, token_dict + + +def make_whitespace_variant( + nlp: "Language", + example: Example, + whitespace: str, + position: int, +) -> Example: + """Insert the whitespace token at the specified token offset in the doc. + This is primarily intended for v2-compatible training data that doesn't + include links or spans. If the document includes links, spans, or partial + dependency annotation, it is returned without modifications. 
+ + The augmentation follows the basics of the v2 space attachment policy, but + without a distinction between "real" and other tokens, so space tokens + may be attached to space tokens: + - at the beginning of a sentence attach the space token to the following + token + - otherwise attach the space token to the preceding token + + The augmenter does not attempt to consolidate adjacent whitespace in the + same way that the tokenizer would. + + The following annotation is used for the space token: + TAG: "_SP" + MORPH: "" + POS: "SPACE" + LEMMA: ORTH + DEP: "dep" + SENT_START: False + + The annotation for each attribute is only set for the space token if there + is already at least partial annotation for that attribute in the original + example. + + RETURNS (Example): Example with one additional space token. + """ + example_dict = example.to_dict() + doc_dict = example_dict.get("doc_annotation", {}) + token_dict = example_dict.get("token_annotation", {}) + # returned unmodified if: + # - doc is empty + # - words are not defined + # - links are defined (only character-based offsets, which is more a quirk + # of Example.to_dict than a technical constraint) + # - spans are defined + # - there are partial dependencies + if ( + len(example.reference) == 0 + or "ORTH" not in token_dict + or len(doc_dict.get("links", [])) > 0 + or len(example.reference.spans) > 0 + or ( + example.reference.has_annotation("DEP") + and not example.reference.has_annotation("DEP", require_complete=True) + ) + ): + return example + words = token_dict.get("ORTH", []) + length = len(words) + assert 0 <= position <= length + if example.reference.has_annotation("ENT_TYPE"): + # I-ENTITY if between B/I-ENTITY and I/L-ENTITY otherwise O + entity = "O" + if position > 1 and position < length: + ent_prev = doc_dict["entities"][position - 1] + ent_next = doc_dict["entities"][position] + if "-" in ent_prev and "-" in ent_next: + ent_iob_prev = ent_prev.split("-")[0] + ent_type_prev = ent_prev.split("-", 
1)[1] + ent_iob_next = ent_next.split("-")[0] + ent_type_next = ent_next.split("-", 1)[1] + if ( + ent_iob_prev in ("B", "I") + and ent_iob_next in ("I", "L") + and ent_type_prev == ent_type_next + ): + entity = f"I-{ent_type_prev}" + doc_dict["entities"].insert(position, entity) + else: + del doc_dict["entities"] + token_dict["ORTH"].insert(position, whitespace) + token_dict["SPACY"].insert(position, False) + if example.reference.has_annotation("TAG"): + token_dict["TAG"].insert(position, "_SP") + else: + del token_dict["TAG"] + if example.reference.has_annotation("LEMMA"): + token_dict["LEMMA"].insert(position, whitespace) + else: + del token_dict["LEMMA"] + if example.reference.has_annotation("POS"): + token_dict["POS"].insert(position, "SPACE") + else: + del token_dict["POS"] + if example.reference.has_annotation("MORPH"): + token_dict["MORPH"].insert(position, "") + else: + del token_dict["MORPH"] + if example.reference.has_annotation("DEP", require_complete=True): + if position == 0: + token_dict["HEAD"].insert(position, 0) + else: + token_dict["HEAD"].insert(position, position - 1) + for i in range(len(token_dict["HEAD"])): + if token_dict["HEAD"][i] >= position: + token_dict["HEAD"][i] += 1 + token_dict["DEP"].insert(position, "dep") + else: + del token_dict["HEAD"] + del token_dict["DEP"] + if example.reference.has_annotation("SENT_START"): + token_dict["SENT_START"].insert(position, False) + else: + del token_dict["SENT_START"] + raw = construct_modified_raw_text(token_dict) + return Example.from_dict(nlp.make_doc(raw), example_dict) + + +def construct_modified_raw_text(token_dict): + """Construct modified raw text from words and spaces.""" raw = "" for orth, spacy in zip(token_dict["ORTH"], token_dict["SPACY"]): raw += orth if spacy: raw += " " - return raw, token_dict + return raw From 6de84c8757f0779f3bc90edabc6789f6b24b05a5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 21 Feb 
2022 09:15:42 +0100 Subject: [PATCH 068/177] Auto-format code with black (#10333) Co-authored-by: explosion-bot --- spacy/lang/fr/syntax_iterators.py | 4 +--- spacy/lang/ko/__init__.py | 2 +- spacy/pipeline/textcat.py | 2 +- spacy/tests/lang/fi/test_noun_chunks.py | 19 +++++++++++++++++-- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 5f7ba5c10..5849c40b3 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -64,9 +64,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: prev_end = right_end.i left_index = word.left_edge.i - left_index = ( - left_index + 1 if word.left_edge.pos == adp_pos else left_index - ) + left_index = left_index + 1 if word.left_edge.pos == adp_pos else left_index yield left_index, right_end.i + 1, np_label elif word.dep == conj_label: diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index eb3c2e1f5..a03f7821a 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -99,7 +99,7 @@ def try_mecab_import() -> None: return MeCab except ImportError: raise ImportError( - "The Korean tokenizer (\"spacy.ko.KoreanTokenizer\") requires " + 'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires ' "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " "and [natto-py](https://github.com/buruzaemon/natto-py)" diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 64a452a7a..690c350fa 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -320,7 +320,7 @@ class TextCategorizer(TrainablePipe): self._validate_categories(examples) truths, not_missing = self._examples_to_truth(examples) not_missing = self.model.ops.asarray(not_missing) # type: ignore - d_scores = (scores - truths) + d_scores = scores - truths d_scores *= not_missing mean_square_error 
= (d_scores**2).mean() return float(mean_square_error), d_scores diff --git a/spacy/tests/lang/fi/test_noun_chunks.py b/spacy/tests/lang/fi/test_noun_chunks.py index cc3b5aa36..cab84b311 100644 --- a/spacy/tests/lang/fi/test_noun_chunks.py +++ b/spacy/tests/lang/fi/test_noun_chunks.py @@ -107,7 +107,17 @@ FI_NP_TEST_EXAMPLES = [ ( "New York tunnetaan kaupunkina, joka ei koskaan nuku", ["PROPN", "PROPN", "VERB", "NOUN", "PUNCT", "PRON", "AUX", "ADV", "VERB"], - ["obj", "flat:name", "ROOT", "obl", "punct", "nsubj", "aux", "advmod", "acl:relcl"], + [ + "obj", + "flat:name", + "ROOT", + "obl", + "punct", + "nsubj", + "aux", + "advmod", + "acl:relcl", + ], [2, -1, 0, -1, 4, 3, 2, 1, -5], ["New York", "kaupunkina"], ), @@ -130,7 +140,12 @@ FI_NP_TEST_EXAMPLES = [ ["NOUN", "VERB", "NOUN", "NOUN", "ADJ", "NOUN"], ["nsubj", "ROOT", "obj", "obl", "amod", "obl"], [1, 0, -1, -1, 1, -3], - ["sairaanhoitopiirit", "leikkaustoimintaa", "alueellaan", "useammassa sairaalassa"], + [ + "sairaanhoitopiirit", + "leikkaustoimintaa", + "alueellaan", + "useammassa sairaalassa", + ], ), ( "Lain mukaan varhaiskasvatus on suunnitelmallista toimintaa", From f4c74764b84c5b5e7628392875b8d2def8bb07d5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Feb 2022 10:22:36 +0100 Subject: [PATCH 069/177] Fix Tok2Vec for empty batches (#10324) * Add test for tok2vec with vectors and empty docs * Add shortcut for empty batch in Tok2Vec.predict * Avoid types --- spacy/pipeline/tok2vec.py | 4 ++++ spacy/tests/pipeline/test_tok2vec.py | 23 +++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index cb601e5dc..2e3dde3cb 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe): DOCS: https://spacy.io/api/tok2vec#predict """ + if not any(len(doc) for doc in docs): + # Handle cases where there are no tokens in any docs. 
+ width = self.model.get_dim("nO") + return [self.model.ops.alloc((0, width)) for doc in docs] tokvecs = self.model.predict(docs) batch_id = Tok2VecListener.get_batch_id(docs) for listener in self.listeners: diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index eeea906bb..a5ac85e1e 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -11,7 +11,7 @@ from spacy.lang.en import English from thinc.api import Config, get_current_ops from numpy.testing import assert_array_equal -from ..util import get_batch, make_tempdir +from ..util import get_batch, make_tempdir, add_vecs_to_vocab def test_empty_doc(): @@ -140,9 +140,25 @@ TRAIN_DATA = [ ] -def test_tok2vec_listener(): +@pytest.mark.parametrize("with_vectors", (False, True)) +def test_tok2vec_listener(with_vectors): orig_config = Config().from_str(cfg_string) + orig_config["components"]["tok2vec"]["model"]["embed"][ + "include_static_vectors" + ] = with_vectors nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + + if with_vectors: + ops = get_current_ops() + vectors = [ + ("apple", ops.asarray([1, 2, 3])), + ("orange", ops.asarray([-1, -2, -3])), + ("and", ops.asarray([-1, -1, -1])), + ("juice", ops.asarray([5, 5, 10])), + ("pie", ops.asarray([7, 6.3, 8.9])), + ] + add_vecs_to_vocab(nlp.vocab, vectors) + assert nlp.pipe_names == ["tok2vec", "tagger"] tagger = nlp.get_pipe("tagger") tok2vec = nlp.get_pipe("tok2vec") @@ -169,6 +185,9 @@ def test_tok2vec_listener(): ops = get_current_ops() assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor)) + # test with empty doc + doc = nlp("") + # TODO: should this warn or error? 
nlp.select_pipes(disable="tok2vec") assert nlp.pipe_names == ["tagger"] From 3358fb9bdd96792725461e346eb0c1a986322e15 Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Mon, 21 Feb 2022 04:24:15 -0500 Subject: [PATCH 070/177] Miscellaneous Minor SpanGroups/DocBin Improvements (#10250) * MultiHashEmbed vector docs correction * doc copy span test * ignore empty lists in DocBin.span_groups * serialized empty list const + SpanGroups.is_empty * add conditional deserial on from_bytes * clean up + reorganize * rm test * add constant as class attribute * rename to _EMPTY_BYTES * Update spacy/tests/doc/test_span.py Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- spacy/tests/doc/test_span.py | 13 +++++++++++++ spacy/tokens/_dict_proxies.py | 7 ++++++- spacy/tokens/_serialize.py | 3 ++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index bdf34c1c1..c0496cabf 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -655,3 +655,16 @@ def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with def test_span_sents_not_parsed(doc_not_parsed): with pytest.raises(ValueError): list(Span(doc_not_parsed, 0, 3).sents) + + +def test_span_group_copy(doc): + doc.spans["test"] = [doc[0:1], doc[2:4]] + assert len(doc.spans["test"]) == 2 + doc_copy = doc.copy() + # check that the spans were indeed copied + assert len(doc_copy.spans["test"]) == 2 + # add a new span to the original doc + doc.spans["test"].append(doc[3:4]) + assert len(doc.spans["test"]) == 3 + # check that the copy spans were not modified and this is an isolated doc + assert len(doc_copy.spans["test"]) == 2 diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py index 470d3430f..8643243fa 100644 --- a/spacy/tokens/_dict_proxies.py +++ b/spacy/tokens/_dict_proxies.py @@ -6,6 +6,7 @@ import srsly from .span_group 
import SpanGroup from ..errors import Errors + if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports from .doc import Doc # noqa: F401 @@ -19,6 +20,8 @@ if TYPE_CHECKING: class SpanGroups(UserDict): """A dict-like proxy held by the Doc, to control access to span groups.""" + _EMPTY_BYTES = srsly.msgpack_dumps([]) + def __init__( self, doc: "Doc", items: Iterable[Tuple[str, SpanGroup]] = tuple() ) -> None: @@ -43,11 +46,13 @@ class SpanGroups(UserDict): def to_bytes(self) -> bytes: # We don't need to serialize this as a dict, because the groups # know their names. + if len(self) == 0: + return self._EMPTY_BYTES msg = [value.to_bytes() for value in self.values()] return srsly.msgpack_dumps(msg) def from_bytes(self, bytes_data: bytes) -> "SpanGroups": - msg = srsly.msgpack_loads(bytes_data) + msg = [] if bytes_data == self._EMPTY_BYTES else srsly.msgpack_loads(bytes_data) self.clear() doc = self._ensure_doc() for value_bytes in msg: diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index bd2bdb811..2b72adb4d 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -12,6 +12,7 @@ from ..compat import copy_reg from ..attrs import SPACY, ORTH, intify_attr, IDS from ..errors import Errors from ..util import ensure_path, SimpleFrozenList +from ._dict_proxies import SpanGroups # fmt: off ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START") @@ -146,7 +147,7 @@ class DocBin: doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) # type: ignore doc = doc.from_array(self.attrs, tokens) # type: ignore doc.cats = self.cats[i] - if self.span_groups[i]: + if self.span_groups[i] != SpanGroups._EMPTY_BYTES: doc.spans.from_bytes(self.span_groups[i]) else: doc.spans.clear() From f32ee2e533c709c8f2cc00b9cce28b779f4a0304 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Feb 2022 10:24:52 +0100 Subject: [PATCH 
071/177] Fix NER check in CoNLL-U converter (#10302) * Fix NER check in CoNLL-U converter Leave ents unset if no NER annotation is found in the MISC column. * Revert to global rather than per-sentence NER check * Update spacy/training/converters/conllu_to_docs.py Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- spacy/tests/test_cli.py | 8 ++++++-- spacy/training/converters/conllu_to_docs.py | 12 +++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index fc35ff86e..ec512b839 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -34,7 +34,7 @@ from .util import make_tempdir @pytest.mark.issue(4665) -def test_issue4665(): +def test_cli_converters_conllu_empty_heads_ner(): """ conllu_to_docs should not raise an exception if the HEAD column contains an underscore @@ -59,7 +59,11 @@ def test_issue4665(): 17 . _ PUNCT . _ _ punct _ _ 18 ] _ PUNCT -RRB- _ _ punct _ _ """ - conllu_to_docs(input_data) + docs = list(conllu_to_docs(input_data)) + # heads are all 0 + assert not all([t.head.i for t in docs[0]]) + # NER is unset + assert not docs[0].has_annotation("ENT_IOB") @pytest.mark.issue(4924) diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index 7a4f44d3b..a4e70b01f 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -71,6 +71,7 @@ def read_conllx( ): """Yield docs, one for each sentence""" vocab = Vocab() # need vocab to make a minimal Doc + set_ents = has_ner(input_data, ner_tag_pattern) for sent in input_data.strip().split("\n\n"): lines = sent.strip().split("\n") if lines: @@ -83,6 +84,7 @@ def read_conllx( merge_subtokens=merge_subtokens, append_morphology=append_morphology, ner_map=ner_map, + set_ents=set_ents, ) yield doc @@ -133,6 +135,7 @@ def conllu_sentence_to_doc( merge_subtokens=False, append_morphology=False, ner_map=None, + 
set_ents=False, ): """Create an Example from the lines for one CoNLL-U sentence, merging subtokens and appending morphology to tags if required. @@ -214,8 +217,10 @@ def conllu_sentence_to_doc( doc[i]._.merged_morph = morphs[i] doc[i]._.merged_lemma = lemmas[i] doc[i]._.merged_spaceafter = spaces[i] - ents = get_entities(lines, ner_tag_pattern, ner_map) - doc.ents = biluo_tags_to_spans(doc, ents) + ents = None + if set_ents: + ents = get_entities(lines, ner_tag_pattern, ner_map) + doc.ents = biluo_tags_to_spans(doc, ents) if merge_subtokens: doc = merge_conllu_subtokens(lines, doc) @@ -247,7 +252,8 @@ def conllu_sentence_to_doc( deps=deps, heads=heads, ) - doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] + if set_ents: + doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] return doc_x From 30030176ee066e2de92238802d7af9d6120d689f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Feb 2022 10:26:19 +0100 Subject: [PATCH 072/177] Update Korean defaults for Tokenizer (#10322) Update Korean defaults for `Tokenizer` for tokenization following UD Korean Kaist. 
--- spacy/lang/ko/__init__.py | 2 ++ spacy/lang/ko/punctuation.py | 12 ++++++++++++ spacy/tests/conftest.py | 13 +++++++++++++ spacy/tests/lang/ko/test_tokenizer.py | 20 ++++++++++++++++++++ 4 files changed, 47 insertions(+) create mode 100644 spacy/lang/ko/punctuation.py diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index a03f7821a..63bc06665 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -1,5 +1,6 @@ from typing import Iterator, Any, Dict +from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from .lex_attrs import LEX_ATTRS @@ -85,6 +86,7 @@ class KoreanDefaults(BaseDefaults): lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + infixes = TOKENIZER_INFIXES class Korean(Language): diff --git a/spacy/lang/ko/punctuation.py b/spacy/lang/ko/punctuation.py new file mode 100644 index 000000000..7f7b40c5b --- /dev/null +++ b/spacy/lang/ko/punctuation.py @@ -0,0 +1,12 @@ +from ..char_classes import LIST_QUOTES +from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES + + +_infixes = ( + ["·", "ㆍ", "\(", "\)"] + + [r"(?<=[0-9])~(?=[0-9-])"] + + LIST_QUOTES + + BASE_TOKENIZER_INFIXES +) + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index ee90a9f38..f9266cb94 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -227,6 +227,19 @@ def ko_tokenizer(): return get_lang_class("ko")().tokenizer +@pytest.fixture(scope="session") +def ko_tokenizer_tokenizer(): + config = { + "nlp": { + "tokenizer": { + "@tokenizers": "spacy.Tokenizer.v1", + } + } + } + nlp = get_lang_class("ko").from_config(config) + return nlp.tokenizer + + @pytest.fixture(scope="session") def lb_tokenizer(): return get_lang_class("lb")().tokenizer diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py index 
eac309857..e6b65dee9 100644 --- a/spacy/tests/lang/ko/test_tokenizer.py +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -47,3 +47,23 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos): def test_ko_empty_doc(ko_tokenizer): tokens = ko_tokenizer("") assert len(tokens) == 0 + + +# fmt: off +SPACY_TOKENIZER_TESTS = [ + ("있다.", "있다 ."), + ("'예'는", "' 예 ' 는"), + ("부 (富) 는", "부 ( 富 ) 는"), + ("부(富)는", "부 ( 富 ) 는"), + ("1982~1983.", "1982 ~ 1983 ."), + ("사과·배·복숭아·수박은 모두 과일이다.", "사과 · 배 · 복숭아 · 수박은 모두 과일이다 ."), + ("그렇구나~", "그렇구나~"), + ("『9시 반의 당구』,", "『 9시 반의 당구 』 ,"), +] +# fmt: on + + +@pytest.mark.parametrize("text,expected_tokens", SPACY_TOKENIZER_TESTS) +def test_ko_spacy_tokenizer(ko_tokenizer_tokenizer, text, expected_tokens): + tokens = [token.text for token in ko_tokenizer_tokenizer(text)] + assert tokens == expected_tokens.split() From cf5b46b63e91b9a2881c3a7d52bb9d2856c809f2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Feb 2022 10:22:36 +0100 Subject: [PATCH 073/177] Fix Tok2Vec for empty batches (#10324) * Add test for tok2vec with vectors and empty docs * Add shortcut for empty batch in Tok2Vec.predict * Avoid types --- spacy/pipeline/tok2vec.py | 4 ++++ spacy/tests/pipeline/test_tok2vec.py | 23 +++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index cb601e5dc..2e3dde3cb 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe): DOCS: https://spacy.io/api/tok2vec#predict """ + if not any(len(doc) for doc in docs): + # Handle cases where there are no tokens in any docs. 
+ width = self.model.get_dim("nO") + return [self.model.ops.alloc((0, width)) for doc in docs] tokvecs = self.model.predict(docs) batch_id = Tok2VecListener.get_batch_id(docs) for listener in self.listeners: diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index eeea906bb..a5ac85e1e 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -11,7 +11,7 @@ from spacy.lang.en import English from thinc.api import Config, get_current_ops from numpy.testing import assert_array_equal -from ..util import get_batch, make_tempdir +from ..util import get_batch, make_tempdir, add_vecs_to_vocab def test_empty_doc(): @@ -140,9 +140,25 @@ TRAIN_DATA = [ ] -def test_tok2vec_listener(): +@pytest.mark.parametrize("with_vectors", (False, True)) +def test_tok2vec_listener(with_vectors): orig_config = Config().from_str(cfg_string) + orig_config["components"]["tok2vec"]["model"]["embed"][ + "include_static_vectors" + ] = with_vectors nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + + if with_vectors: + ops = get_current_ops() + vectors = [ + ("apple", ops.asarray([1, 2, 3])), + ("orange", ops.asarray([-1, -2, -3])), + ("and", ops.asarray([-1, -1, -1])), + ("juice", ops.asarray([5, 5, 10])), + ("pie", ops.asarray([7, 6.3, 8.9])), + ] + add_vecs_to_vocab(nlp.vocab, vectors) + assert nlp.pipe_names == ["tok2vec", "tagger"] tagger = nlp.get_pipe("tagger") tok2vec = nlp.get_pipe("tok2vec") @@ -169,6 +185,9 @@ def test_tok2vec_listener(): ops = get_current_ops() assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor)) + # test with empty doc + doc = nlp("") + # TODO: should this warn or error? 
nlp.select_pipes(disable="tok2vec") assert nlp.pipe_names == ["tagger"] From 78a8bec4d0a0e607acd3f9a2c6eaafe54c7ca4ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 21 Feb 2022 15:02:21 +0100 Subject: [PATCH 074/177] Make core projectivization functions cdef nogil (#10241) * Make core projectivization methods cdef nogil While profiling the parser, I noticed that relatively a lot of time is spent in projectivization. This change rewrites the functions in the core loops as cdef nogil for efficiency. In C++-land, we use vector in place of Python lists and absent heads are represented as -1 in place of None. * _heads_to_c: add assertion Validation should be performed by the caller, but this assertion ensures that we are not reading/writing out of bounds with incorrect input. --- spacy/pipeline/_parser_internals/nonproj.pyx | 83 +++++++++++++++----- spacy/tests/parser/test_nonproj.py | 4 +- 2 files changed, 66 insertions(+), 21 deletions(-) diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx index 82070cd27..36163fcc3 100644 --- a/spacy/pipeline/_parser_internals/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -4,6 +4,10 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. """ from copy import copy +from libc.limits cimport INT_MAX +from libc.stdlib cimport abs +from libcpp cimport bool +from libcpp.vector cimport vector from ...tokens.doc cimport Doc, set_children_from_heads @@ -41,13 +45,18 @@ def contains_cycle(heads): def is_nonproj_arc(tokenid, heads): + cdef vector[int] c_heads = _heads_to_c(heads) + return _is_nonproj_arc(tokenid, c_heads) + + +cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil: # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective # if there is a token k, h < k < d such that h is not # an ancestor of k. 
Same for h -> d, h > d head = heads[tokenid] if head == tokenid: # root arcs cannot be non-projective return False - elif head is None: # unattached tokens cannot be non-projective + elif head < 0: # unattached tokens cannot be non-projective return False cdef int start, end @@ -56,19 +65,29 @@ def is_nonproj_arc(tokenid, heads): else: start, end = (tokenid+1, head) for k in range(start, end): - for ancestor in ancestors(k, heads): - if ancestor is None: # for unattached tokens/subtrees - break - elif ancestor == head: # normal case: k dominated by h - break + if _has_head_as_ancestor(k, head, heads): + continue else: # head not in ancestors: d -> h is non-projective return True return False +cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil: + ancestor = tokenid + cnt = 0 + while cnt < heads.size(): + if heads[ancestor] == head or heads[ancestor] < 0: + return True + ancestor = heads[ancestor] + cnt += 1 + + return False + + def is_nonproj_tree(heads): + cdef vector[int] c_heads = _heads_to_c(heads) # a tree is non-projective if at least one arc is non-projective - return any(is_nonproj_arc(word, heads) for word in range(len(heads))) + return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads))) def decompose(label): @@ -98,16 +117,31 @@ def projectivize(heads, labels): # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels) # which encode a projective and decorated tree. 
proj_heads = copy(heads) - smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) - if smallest_np_arc is None: # this sentence is already projective + + cdef int new_head + cdef vector[int] c_proj_heads = _heads_to_c(proj_heads) + cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads) + if smallest_np_arc == -1: # this sentence is already projective return proj_heads, copy(labels) - while smallest_np_arc is not None: - _lift(smallest_np_arc, proj_heads) - smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) + while smallest_np_arc != -1: + new_head = _lift(smallest_np_arc, proj_heads) + c_proj_heads[smallest_np_arc] = new_head + smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads) deco_labels = _decorate(heads, proj_heads, labels) return proj_heads, deco_labels +cdef vector[int] _heads_to_c(heads): + cdef vector[int] c_heads; + for head in heads: + if head == None: + c_heads.push_back(-1) + else: + assert head < len(heads) + c_heads.push_back(head) + return c_heads + + cpdef deprojectivize(Doc doc): # Reattach arcs with decorated labels (following HEAD scheme). 
For each # decorated arc X||Y, search top-down, left-to-right, breadth-first until @@ -137,27 +171,38 @@ def _decorate(heads, proj_heads, labels): deco_labels.append(labels[tokenid]) return deco_labels +def get_smallest_nonproj_arc_slow(heads): + cdef vector[int] c_heads = _heads_to_c(heads) + return _get_smallest_nonproj_arc(c_heads) -def _get_smallest_nonproj_arc(heads): + +cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil: # return the smallest non-proj arc or None # where size is defined as the distance between dep and head # and ties are broken left to right - smallest_size = float('inf') - smallest_np_arc = None - for tokenid, head in enumerate(heads): + cdef int smallest_size = INT_MAX + cdef int smallest_np_arc = -1 + cdef int size + cdef int tokenid + cdef int head + + for tokenid in range(heads.size()): + head = heads[tokenid] size = abs(tokenid-head) - if size < smallest_size and is_nonproj_arc(tokenid, heads): + if size < smallest_size and _is_nonproj_arc(tokenid, heads): smallest_size = size smallest_np_arc = tokenid return smallest_np_arc -def _lift(tokenid, heads): +cpdef int _lift(tokenid, heads): # reattaches a word to it's grandfather head = heads[tokenid] ghead = heads[head] + cdef int new_head = ghead if head != ghead else tokenid # attach to ghead if head isn't attached to root else attach to root - heads[tokenid] = ghead if head != ghead else tokenid + heads[tokenid] = new_head + return new_head def _find_new_head(token, headlabel): diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 3957e4d77..60d000c44 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -93,8 +93,8 @@ def test_parser_pseudoprojectivity(en_vocab): assert nonproj.is_decorated("X") is False nonproj._lift(0, tree) assert tree == [2, 2, 2] - assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7 - assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10 + assert 
nonproj.get_smallest_nonproj_arc_slow(nonproj_tree) == 7 + assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree2) == 10 # fmt: off proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels) assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2] From 249b97184d12664dde53a3c5b8c658ad7b8cf0ca Mon Sep 17 00:00:00 2001 From: kadarakos Date: Wed, 23 Feb 2022 16:10:05 +0100 Subject: [PATCH 075/177] Bugfixes and test for rehearse (#10347) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fixing argument order for rehearse * rehearse test for ner and tagger * rehearse bugfix * added test for parser * test for multilabel textcat * rehearse fix * remove debug line * Update spacy/tests/training/test_rehearse.py Co-authored-by: Sofie Van Landeghem * Update spacy/tests/training/test_rehearse.py Co-authored-by: Sofie Van Landeghem Co-authored-by: Kádár Ákos Co-authored-by: Sofie Van Landeghem --- spacy/language.py | 5 +- spacy/pipeline/tagger.pyx | 11 +- spacy/pipeline/textcat.py | 2 +- spacy/tests/training/test_rehearse.py | 168 ++++++++++++++++++++++++++ 4 files changed, 178 insertions(+), 8 deletions(-) create mode 100644 spacy/tests/training/test_rehearse.py diff --git a/spacy/language.py b/spacy/language.py index e8fd2720c..bab403f0e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1222,8 +1222,9 @@ class Language: component_cfg = {} grads = {} - def get_grads(W, dW, key=None): + def get_grads(key, W, dW): grads[key] = (W, dW) + return W, dW get_grads.learn_rate = sgd.learn_rate # type: ignore[attr-defined, union-attr] get_grads.b1 = sgd.b1 # type: ignore[attr-defined, union-attr] @@ -1236,7 +1237,7 @@ class Language: examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {}) ) for key, (W, dW) in grads.items(): - sgd(W, dW, key=key) # type: ignore[call-arg, misc] + sgd(key, W, dW) # type: ignore[call-arg, misc] return losses def begin_training( diff --git a/spacy/pipeline/tagger.pyx 
b/spacy/pipeline/tagger.pyx index a2bec888e..e21a9096e 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -225,6 +225,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#rehearse """ + loss_func = SequenceCategoricalCrossentropy() if losses is None: losses = {} losses.setdefault(self.name, 0.0) @@ -236,12 +237,12 @@ class Tagger(TrainablePipe): # Handle cases where there are no tokens in any docs. return losses set_dropout_rate(self.model, drop) - guesses, backprop = self.model.begin_update(docs) - target = self._rehearsal_model(examples) - gradient = guesses - target - backprop(gradient) + tag_scores, bp_tag_scores = self.model.begin_update(docs) + tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs) + grads, loss = loss_func(tag_scores, tutor_tag_scores) + bp_tag_scores(grads) self.finish_update(sgd) - losses[self.name] += (gradient**2).sum() + losses[self.name] += loss return losses def get_loss(self, examples, scores): diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 690c350fa..bc3f127fc 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -283,7 +283,7 @@ class TextCategorizer(TrainablePipe): return losses set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update(docs) - target = self._rehearsal_model(examples) + target, _ = self._rehearsal_model.begin_update(docs) gradient = scores - target bp_scores(gradient) if sgd is not None: diff --git a/spacy/tests/training/test_rehearse.py b/spacy/tests/training/test_rehearse.py new file mode 100644 index 000000000..1bb8fac86 --- /dev/null +++ b/spacy/tests/training/test_rehearse.py @@ -0,0 +1,168 @@ +import pytest +import spacy + +from typing import List +from spacy.training import Example + + +TRAIN_DATA = [ + ( + 'Who is Kofi Annan?', + { + 'entities': [(7, 18, 'PERSON')], + 'tags': ['PRON', 'AUX', 'PROPN', 'PRON', 'PUNCT'], + 'heads': [1, 1, 3, 1, 1], + 'deps': ['attr', 'ROOT', 'compound', 'nsubj', 
'punct'], + 'morphs': ['', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 'Number=Sing', 'Number=Sing', 'PunctType=Peri'], + 'cats': {'question': 1.0} + } + ), + ( + 'Who is Steve Jobs?', + { + 'entities': [(7, 17, 'PERSON')], + 'tags': ['PRON', 'AUX', 'PROPN', 'PRON', 'PUNCT'], + 'heads': [1, 1, 3, 1, 1], + 'deps': ['attr', 'ROOT', 'compound', 'nsubj', 'punct'], + 'morphs': ['', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 'Number=Sing', 'Number=Sing', 'PunctType=Peri'], + 'cats': {'question': 1.0} + } + ), + ( + 'Bob is a nice person.', + { + 'entities': [(0, 3, 'PERSON')], + 'tags': ['PROPN', 'AUX', 'DET', 'ADJ', 'NOUN', 'PUNCT'], + 'heads': [1, 1, 4, 4, 1, 1], + 'deps': ['nsubj', 'ROOT', 'det', 'amod', 'attr', 'punct'], + 'morphs': ['Number=Sing', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 'Definite=Ind|PronType=Art', 'Degree=Pos', 'Number=Sing', 'PunctType=Peri'], + 'cats': {'statement': 1.0} + }, + ), + ( + 'Hi Anil, how are you?', + { + 'entities': [(3, 7, 'PERSON')], + 'tags': ['INTJ', 'PROPN', 'PUNCT', 'ADV', 'AUX', 'PRON', 'PUNCT'], + 'deps': ['intj', 'npadvmod', 'punct', 'advmod', 'ROOT', 'nsubj', 'punct'], + 'heads': [4, 0, 4, 4, 4, 4, 4], + 'morphs': ['', 'Number=Sing', 'PunctType=Comm', '', 'Mood=Ind|Tense=Pres|VerbForm=Fin', 'Case=Nom|Person=2|PronType=Prs', 'PunctType=Peri'], + 'cats': {'greeting': 1.0, 'question': 1.0} + } + ), + ( + 'I like London and Berlin.', + { + 'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')], + 'tags': ['PROPN', 'VERB', 'PROPN', 'CCONJ', 'PROPN', 'PUNCT'], + 'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct'], + 'heads': [1, 1, 1, 2, 2, 1], + 'morphs': ['Case=Nom|Number=Sing|Person=1|PronType=Prs', 'Tense=Pres|VerbForm=Fin', 'Number=Sing', 'ConjType=Cmp', 'Number=Sing', 'PunctType=Peri'], + 'cats': {'statement': 1.0} + } + ) +] + +REHEARSE_DATA = [ + ( + 'Hi Anil', + { + 'entities': [(3, 7, 'PERSON')], + 'tags': ['INTJ', 'PROPN'], + 'deps': ['ROOT', 'npadvmod'], + 'heads': [0, 0], + 
'morphs': ['', 'Number=Sing'], + 'cats': {'greeting': 1.0} + } + ), + ( + 'Hi Ravish, how you doing?', + { + 'entities': [(3, 9, 'PERSON')], + 'tags': ['INTJ', 'PROPN', 'PUNCT', 'ADV', 'AUX', 'PRON', 'PUNCT'], + 'deps': ['intj', 'ROOT', 'punct', 'advmod', 'nsubj', 'advcl', 'punct'], + 'heads': [1, 1, 1, 5, 5, 1, 1], + 'morphs': ['', 'VerbForm=Inf', 'PunctType=Comm', '', 'Case=Nom|Person=2|PronType=Prs', 'Aspect=Prog|Tense=Pres|VerbForm=Part', 'PunctType=Peri'], + 'cats': {'greeting': 1.0, 'question': 1.0} + } + ), + # UTENSIL new label + ( + 'Natasha bought new forks.', + { + 'entities': [(0, 7, 'PERSON'), (19, 24, 'UTENSIL')], + 'tags': ['PROPN', 'VERB', 'ADJ', 'NOUN', 'PUNCT'], + 'deps': ['nsubj', 'ROOT', 'amod', 'dobj', 'punct'], + 'heads': [1, 1, 3, 1, 1], + 'morphs': ['Number=Sing', 'Tense=Past|VerbForm=Fin', 'Degree=Pos', 'Number=Plur', 'PunctType=Peri'], + 'cats': {'statement': 1.0} + } + ) +] + + +def _add_ner_label(ner, data): + for _, annotations in data: + for ent in annotations['entities']: + ner.add_label(ent[2]) + + +def _add_tagger_label(tagger, data): + for _, annotations in data: + for tag in annotations['tags']: + tagger.add_label(tag) + + +def _add_parser_label(parser, data): + for _, annotations in data: + for dep in annotations['deps']: + parser.add_label(dep) + + +def _add_textcat_label(textcat, data): + for _, annotations in data: + for cat in annotations['cats']: + textcat.add_label(cat) + + +def _optimize( + nlp, + component: str, + data: List, + rehearse: bool +): + """Run either train or rehearse.""" + pipe = nlp.get_pipe(component) + if component == 'ner': + _add_ner_label(pipe, data) + elif component == 'tagger': + _add_tagger_label(pipe, data) + elif component == 'parser': + _add_tagger_label(pipe, data) + elif component == 'textcat_multilabel': + _add_textcat_label(pipe, data) + else: + raise NotImplementedError + + if rehearse: + optimizer = nlp.resume_training() + else: + optimizer = nlp.initialize() + + for _ in range(5): + for 
text, annotation in data: + doc = nlp.make_doc(text) + example = Example.from_dict(doc, annotation) + if rehearse: + nlp.rehearse([example], sgd=optimizer) + else: + nlp.update([example], sgd=optimizer) + return nlp + + +@pytest.mark.parametrize("component", ['ner', 'tagger', 'parser', 'textcat_multilabel']) +def test_rehearse(component): + nlp = spacy.blank("en") + nlp.add_pipe(component) + nlp = _optimize(nlp, component, TRAIN_DATA, False) + _optimize(nlp, component, REHEARSE_DATA, True) From b16da378bb584c4b2a12a4b944cb3141a4ec7789 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 23 Feb 2022 21:08:56 +0100 Subject: [PATCH 076/177] Re-remove universe tests from test suite (#10357) --- .gitignore | 1 - setup.py | 1 - spacy/tests/universe/test_universe_json.py | 17 ----------------- 3 files changed, 19 deletions(-) delete mode 100644 spacy/tests/universe/test_universe_json.py diff --git a/.gitignore b/.gitignore index 60036a475..ac72f2bbf 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,6 @@ keys/ spacy/tests/package/setup.cfg spacy/tests/package/pyproject.toml spacy/tests/package/requirements.txt -spacy/tests/universe/universe.json # Website website/.cache/ diff --git a/setup.py b/setup.py index 03a1e01dd..fcc124a43 100755 --- a/setup.py +++ b/setup.py @@ -81,7 +81,6 @@ COPY_FILES = { ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package", ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package", ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package", - ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe", } diff --git a/spacy/tests/universe/test_universe_json.py b/spacy/tests/universe/test_universe_json.py deleted file mode 100644 index 295889186..000000000 --- a/spacy/tests/universe/test_universe_json.py +++ /dev/null @@ -1,17 +0,0 @@ -import json -import re -from pathlib import Path - - -def test_universe_json(): - - root_dir = Path(__file__).parent - universe_file = root_dir / "universe.json" - - with 
universe_file.open() as f: - universe_data = json.load(f) - for entry in universe_data["resources"]: - if "github" in entry: - assert not re.match( - r"^(http:)|^(https:)", entry["github"] - ), "Github field should be user/repo, not a url" From 5f568f7e41f5bba85ac7f135d3a2dfee3cb2e2b1 Mon Sep 17 00:00:00 2001 From: Sam Edwardes Date: Wed, 23 Feb 2022 21:18:10 -0800 Subject: [PATCH 077/177] Updated spaCy universe for spacytextblob (#10335) * Updated spacytextblob in universe.json * Fixed json * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Added spacy_version tag to spacytextblob Co-authored-by: Sofie Van Landeghem --- website/meta/universe.json | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 122281583..6374600f2 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -227,11 +227,11 @@ }, { "id": "spacy-textblob", - "title": "spaCyTextBlob", - "slogan": "Easy sentiment analysis for spaCy using TextBlob. Now supports spaCy 3.0!", - "thumb": "https://github.com/SamEdwardes/spaCyTextBlob/raw/main/website/static/img/logo-thumb-square-250x250.png", - "description": "spaCyTextBlob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extensions `._.polarity`, `._.subjectivity`, and `._.assessments` to `Doc`, `Span`, and `Token` objects. For spaCy 2 please use `pip install pip install spacytextblob==0.1.7`", - "github": "SamEdwardes/spaCyTextBlob", + "title": "spacytextblob", + "slogan": "A TextBlob sentiment analysis pipeline component for spaCy.", + "thumb": "https://github.com/SamEdwardes/spacytextblob/raw/main/docs/static/img/logo-thumb-square-250x250.png", + "description": "spacytextblob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. 
It will add the additional extension `._.blob` to `Doc`, `Span`, and `Token` objects.", + "github": "SamEdwardes/spacytextblob", "pip": "spacytextblob", "code_example": [ "import spacy", @@ -241,9 +241,10 @@ "nlp.add_pipe('spacytextblob')", "text = 'I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.'", "doc = nlp(text)", - "doc._.polarity # Polarity: -0.125", - "doc._.subjectivity # Sujectivity: 0.9", - "doc._.assessments # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]" + "doc._.blob.polarity # Polarity: -0.125", + "doc._.blob.subjectivity # Subjectivity: 0.9", + "doc._.blob.sentiment_assessments.assessments # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]", + "doc._.blob.ngrams() # [WordList(['I', 'had', 'a']), WordList(['had', 'a', 'really']), WordList(['a', 'really', 'horrible']), WordList(['really', 'horrible', 'day']), WordList(['horrible', 'day', 'It']), WordList(['day', 'It', 'was']), WordList(['It', 'was', 'the']), WordList(['was', 'the', 'worst']), WordList(['the', 'worst', 'day']), WordList(['worst', 'day', 'ever']), WordList(['day', 'ever', 'But']), WordList(['ever', 'But', 'every']), WordList(['But', 'every', 'now']), WordList(['every', 'now', 'and']), WordList(['now', 'and', 'then']), WordList(['and', 'then', 'I']), WordList(['then', 'I', 'have']), WordList(['I', 'have', 'a']), WordList(['have', 'a', 'really']), WordList(['a', 'really', 'good']), WordList(['really', 'good', 'day']), WordList(['good', 'day', 'that']), WordList(['day', 'that', 'makes']), WordList(['that', 'makes', 'me']), WordList(['makes', 'me', 'happy'])]" ], "code_language": "python", "url": "https://spacytextblob.netlify.app/", @@ -254,7 +255,8 @@ "website": 
"https://samedwardes.com" }, "category": ["pipeline"], - "tags": ["sentiment", "textblob"] + "tags": ["sentiment", "textblob"], + "spacy_version": 3 }, { "id": "spacy-ray", From d637b34e2f58199eb4cbb58634981334a5a17185 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 25 Feb 2022 10:00:21 +0100 Subject: [PATCH 078/177] Auto-format code with black (#10377) Co-authored-by: explosion-bot --- spacy/tests/training/test_rehearse.py | 215 ++++++++++++-------- spacy/training/converters/conllu_to_docs.py | 4 +- 2 files changed, 132 insertions(+), 87 deletions(-) diff --git a/spacy/tests/training/test_rehearse.py b/spacy/tests/training/test_rehearse.py index 1bb8fac86..84c507702 100644 --- a/spacy/tests/training/test_rehearse.py +++ b/spacy/tests/training/test_rehearse.py @@ -7,139 +7,182 @@ from spacy.training import Example TRAIN_DATA = [ ( - 'Who is Kofi Annan?', + "Who is Kofi Annan?", { - 'entities': [(7, 18, 'PERSON')], - 'tags': ['PRON', 'AUX', 'PROPN', 'PRON', 'PUNCT'], - 'heads': [1, 1, 3, 1, 1], - 'deps': ['attr', 'ROOT', 'compound', 'nsubj', 'punct'], - 'morphs': ['', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 'Number=Sing', 'Number=Sing', 'PunctType=Peri'], - 'cats': {'question': 1.0} - } - ), - ( - 'Who is Steve Jobs?', - { - 'entities': [(7, 17, 'PERSON')], - 'tags': ['PRON', 'AUX', 'PROPN', 'PRON', 'PUNCT'], - 'heads': [1, 1, 3, 1, 1], - 'deps': ['attr', 'ROOT', 'compound', 'nsubj', 'punct'], - 'morphs': ['', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 'Number=Sing', 'Number=Sing', 'PunctType=Peri'], - 'cats': {'question': 1.0} - } - ), - ( - 'Bob is a nice person.', - { - 'entities': [(0, 3, 'PERSON')], - 'tags': ['PROPN', 'AUX', 'DET', 'ADJ', 'NOUN', 'PUNCT'], - 'heads': [1, 1, 4, 4, 1, 1], - 'deps': ['nsubj', 'ROOT', 'det', 'amod', 'attr', 'punct'], - 'morphs': ['Number=Sing', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 'Definite=Ind|PronType=Art', 
'Degree=Pos', 'Number=Sing', 'PunctType=Peri'], - 'cats': {'statement': 1.0} + "entities": [(7, 18, "PERSON")], + "tags": ["PRON", "AUX", "PROPN", "PRON", "PUNCT"], + "heads": [1, 1, 3, 1, 1], + "deps": ["attr", "ROOT", "compound", "nsubj", "punct"], + "morphs": [ + "", + "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "Number=Sing", + "Number=Sing", + "PunctType=Peri", + ], + "cats": {"question": 1.0}, }, ), ( - 'Hi Anil, how are you?', + "Who is Steve Jobs?", { - 'entities': [(3, 7, 'PERSON')], - 'tags': ['INTJ', 'PROPN', 'PUNCT', 'ADV', 'AUX', 'PRON', 'PUNCT'], - 'deps': ['intj', 'npadvmod', 'punct', 'advmod', 'ROOT', 'nsubj', 'punct'], - 'heads': [4, 0, 4, 4, 4, 4, 4], - 'morphs': ['', 'Number=Sing', 'PunctType=Comm', '', 'Mood=Ind|Tense=Pres|VerbForm=Fin', 'Case=Nom|Person=2|PronType=Prs', 'PunctType=Peri'], - 'cats': {'greeting': 1.0, 'question': 1.0} - } + "entities": [(7, 17, "PERSON")], + "tags": ["PRON", "AUX", "PROPN", "PRON", "PUNCT"], + "heads": [1, 1, 3, 1, 1], + "deps": ["attr", "ROOT", "compound", "nsubj", "punct"], + "morphs": [ + "", + "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "Number=Sing", + "Number=Sing", + "PunctType=Peri", + ], + "cats": {"question": 1.0}, + }, ), ( - 'I like London and Berlin.', + "Bob is a nice person.", { - 'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')], - 'tags': ['PROPN', 'VERB', 'PROPN', 'CCONJ', 'PROPN', 'PUNCT'], - 'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct'], - 'heads': [1, 1, 1, 2, 2, 1], - 'morphs': ['Case=Nom|Number=Sing|Person=1|PronType=Prs', 'Tense=Pres|VerbForm=Fin', 'Number=Sing', 'ConjType=Cmp', 'Number=Sing', 'PunctType=Peri'], - 'cats': {'statement': 1.0} - } - ) + "entities": [(0, 3, "PERSON")], + "tags": ["PROPN", "AUX", "DET", "ADJ", "NOUN", "PUNCT"], + "heads": [1, 1, 4, 4, 1, 1], + "deps": ["nsubj", "ROOT", "det", "amod", "attr", "punct"], + "morphs": [ + "Number=Sing", + "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "Definite=Ind|PronType=Art", + 
"Degree=Pos", + "Number=Sing", + "PunctType=Peri", + ], + "cats": {"statement": 1.0}, + }, + ), + ( + "Hi Anil, how are you?", + { + "entities": [(3, 7, "PERSON")], + "tags": ["INTJ", "PROPN", "PUNCT", "ADV", "AUX", "PRON", "PUNCT"], + "deps": ["intj", "npadvmod", "punct", "advmod", "ROOT", "nsubj", "punct"], + "heads": [4, 0, 4, 4, 4, 4, 4], + "morphs": [ + "", + "Number=Sing", + "PunctType=Comm", + "", + "Mood=Ind|Tense=Pres|VerbForm=Fin", + "Case=Nom|Person=2|PronType=Prs", + "PunctType=Peri", + ], + "cats": {"greeting": 1.0, "question": 1.0}, + }, + ), + ( + "I like London and Berlin.", + { + "entities": [(7, 13, "LOC"), (18, 24, "LOC")], + "tags": ["PROPN", "VERB", "PROPN", "CCONJ", "PROPN", "PUNCT"], + "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], + "heads": [1, 1, 1, 2, 2, 1], + "morphs": [ + "Case=Nom|Number=Sing|Person=1|PronType=Prs", + "Tense=Pres|VerbForm=Fin", + "Number=Sing", + "ConjType=Cmp", + "Number=Sing", + "PunctType=Peri", + ], + "cats": {"statement": 1.0}, + }, + ), ] REHEARSE_DATA = [ ( - 'Hi Anil', + "Hi Anil", { - 'entities': [(3, 7, 'PERSON')], - 'tags': ['INTJ', 'PROPN'], - 'deps': ['ROOT', 'npadvmod'], - 'heads': [0, 0], - 'morphs': ['', 'Number=Sing'], - 'cats': {'greeting': 1.0} - } + "entities": [(3, 7, "PERSON")], + "tags": ["INTJ", "PROPN"], + "deps": ["ROOT", "npadvmod"], + "heads": [0, 0], + "morphs": ["", "Number=Sing"], + "cats": {"greeting": 1.0}, + }, ), ( - 'Hi Ravish, how you doing?', + "Hi Ravish, how you doing?", { - 'entities': [(3, 9, 'PERSON')], - 'tags': ['INTJ', 'PROPN', 'PUNCT', 'ADV', 'AUX', 'PRON', 'PUNCT'], - 'deps': ['intj', 'ROOT', 'punct', 'advmod', 'nsubj', 'advcl', 'punct'], - 'heads': [1, 1, 1, 5, 5, 1, 1], - 'morphs': ['', 'VerbForm=Inf', 'PunctType=Comm', '', 'Case=Nom|Person=2|PronType=Prs', 'Aspect=Prog|Tense=Pres|VerbForm=Part', 'PunctType=Peri'], - 'cats': {'greeting': 1.0, 'question': 1.0} - } + "entities": [(3, 9, "PERSON")], + "tags": ["INTJ", "PROPN", "PUNCT", "ADV", "AUX", "PRON", 
"PUNCT"], + "deps": ["intj", "ROOT", "punct", "advmod", "nsubj", "advcl", "punct"], + "heads": [1, 1, 1, 5, 5, 1, 1], + "morphs": [ + "", + "VerbForm=Inf", + "PunctType=Comm", + "", + "Case=Nom|Person=2|PronType=Prs", + "Aspect=Prog|Tense=Pres|VerbForm=Part", + "PunctType=Peri", + ], + "cats": {"greeting": 1.0, "question": 1.0}, + }, ), # UTENSIL new label ( - 'Natasha bought new forks.', + "Natasha bought new forks.", { - 'entities': [(0, 7, 'PERSON'), (19, 24, 'UTENSIL')], - 'tags': ['PROPN', 'VERB', 'ADJ', 'NOUN', 'PUNCT'], - 'deps': ['nsubj', 'ROOT', 'amod', 'dobj', 'punct'], - 'heads': [1, 1, 3, 1, 1], - 'morphs': ['Number=Sing', 'Tense=Past|VerbForm=Fin', 'Degree=Pos', 'Number=Plur', 'PunctType=Peri'], - 'cats': {'statement': 1.0} - } - ) + "entities": [(0, 7, "PERSON"), (19, 24, "UTENSIL")], + "tags": ["PROPN", "VERB", "ADJ", "NOUN", "PUNCT"], + "deps": ["nsubj", "ROOT", "amod", "dobj", "punct"], + "heads": [1, 1, 3, 1, 1], + "morphs": [ + "Number=Sing", + "Tense=Past|VerbForm=Fin", + "Degree=Pos", + "Number=Plur", + "PunctType=Peri", + ], + "cats": {"statement": 1.0}, + }, + ), ] def _add_ner_label(ner, data): for _, annotations in data: - for ent in annotations['entities']: + for ent in annotations["entities"]: ner.add_label(ent[2]) def _add_tagger_label(tagger, data): for _, annotations in data: - for tag in annotations['tags']: + for tag in annotations["tags"]: tagger.add_label(tag) def _add_parser_label(parser, data): for _, annotations in data: - for dep in annotations['deps']: + for dep in annotations["deps"]: parser.add_label(dep) def _add_textcat_label(textcat, data): for _, annotations in data: - for cat in annotations['cats']: + for cat in annotations["cats"]: textcat.add_label(cat) -def _optimize( - nlp, - component: str, - data: List, - rehearse: bool -): +def _optimize(nlp, component: str, data: List, rehearse: bool): """Run either train or rehearse.""" pipe = nlp.get_pipe(component) - if component == 'ner': + if component == "ner": 
_add_ner_label(pipe, data) - elif component == 'tagger': + elif component == "tagger": _add_tagger_label(pipe, data) - elif component == 'parser': + elif component == "parser": _add_tagger_label(pipe, data) - elif component == 'textcat_multilabel': + elif component == "textcat_multilabel": _add_textcat_label(pipe, data) else: raise NotImplementedError @@ -160,7 +203,7 @@ def _optimize( return nlp -@pytest.mark.parametrize("component", ['ner', 'tagger', 'parser', 'textcat_multilabel']) +@pytest.mark.parametrize("component", ["ner", "tagger", "parser", "textcat_multilabel"]) def test_rehearse(component): nlp = spacy.blank("en") nlp.add_pipe(component) diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index a4e70b01f..7052504cc 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -253,7 +253,9 @@ def conllu_sentence_to_doc( heads=heads, ) if set_ents: - doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] + doc_x.ents = [ + Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents + ] return doc_x From 3f68bbcfec44ef55d101e6db742d353b72652129 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 25 Feb 2022 16:29:12 +0100 Subject: [PATCH 079/177] Clean up loggers docs (#10351) * update docs to point to spacy-loggers docs * remove unused error code --- spacy/errors.py | 3 --- website/docs/api/legacy.md | 21 ++------------------- website/docs/api/top-level.md | 2 +- 3 files changed, 3 insertions(+), 23 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index b45c4f9db..5399e489b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -566,9 +566,6 @@ class Errors(metaclass=ErrorsWithCodes): E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to " "a list of spans, with each span represented by a tuple (start_char, end_char). 
" "The tuple can be optionally extended with a label and a KB ID.") - E880 = ("The 'wandb' library could not be found - did you install it? " - "Alternatively, specify the 'ConsoleLogger' in the 'training.logger' " - "config section, instead of the 'WandbLogger'.") E884 = ("The pipeline could not be initialized because the vectors " "could not be found at '{vectors}'. If your pipeline was already " "initialized/trained before, call 'resume_training' instead of 'initialize', " diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md index 916a5bf7f..e24c37d77 100644 --- a/website/docs/api/legacy.md +++ b/website/docs/api/legacy.md @@ -248,23 +248,6 @@ the others, but may not be as accurate, especially if texts are short. ## Loggers {#loggers} -These functions are available from `@spacy.registry.loggers`. +Logging utilities for spaCy are implemented in the [`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the functions are typically available from `@spacy.registry.loggers`. -### spacy.WandbLogger.v1 {#WandbLogger_v1} - -The first version of the [`WandbLogger`](/api/top-level#WandbLogger) did not yet -support the `log_dataset_dir` and `model_log_interval` arguments. - -> #### Example config -> -> ```ini -> [training.logger] -> @loggers = "spacy.WandbLogger.v1" -> project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] -> ``` -> -> | Name | Description | -> | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -> | `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | -> | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). 
~~List[str]~~ | +More documentation can be found in that repo's [readme](https://github.com/explosion/spacy-loggers/blob/main/README.md) file. diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index be19f9c3a..1a3e9da46 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -423,7 +423,7 @@ and the accuracy scores on the development set. The built-in, default logger is the ConsoleLogger, which prints results to the console in tabular format. The [spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as -a dependency of spaCy, enables other loggers: currently it provides one that +a dependency of spaCy, enables other loggers, such as one that sends results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers, you can From 8e93fa850748c884c71505b4f26c46d0c98d3ba1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Mar 2022 09:21:25 +0100 Subject: [PATCH 080/177] Fix Vectors.n_keys for floret vectors (#10394) Fix `Vectors.n_keys` for floret vectors to match docstring description and avoid W007 warnings in similarity methods. 
--- spacy/tests/vocab_vectors/test_vectors.py | 4 ++++ spacy/vectors.pyx | 2 ++ website/docs/api/vectors.md | 10 +++++----- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 0650a7487..ffd7489b2 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -535,6 +535,10 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): # every word has a vector assert nlp.vocab[word * 5].has_vector + # n_keys is -1 for floret + assert nlp_plain.vocab.vectors.n_keys > 0 + assert nlp.vocab.vectors.n_keys == -1 + # check that single and batched vector lookups are identical words = [s for s in nlp_plain.vocab.vectors] single_vecs = OPS.to_numpy(OPS.asarray([nlp.vocab[word].vector for word in words])) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index bc4863703..2b1ea764b 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -170,6 +170,8 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#n_keys """ + if self.mode == Mode.floret: + return -1 return len(self.key2row) def __reduce__(self): diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index b3bee822c..a651c23b0 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -327,9 +327,9 @@ will be counted individually. In `floret` mode, the keys table is not used. > assert vectors.n_keys == 0 > ``` -| Name | Description | -| ----------- | -------------------------------------------- | -| **RETURNS** | The number of all keys in the table. ~~int~~ | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------- | +| **RETURNS** | The number of all keys in the table. Returns `-1` for floret vectors. ~~int~~ | ## Vectors.most_similar {#most_similar tag="method"} @@ -348,7 +348,7 @@ supported for `floret` mode. 
> ``` | Name | Description | -| -------------- | --------------------------------------------------------------------------- | +| -------------- | --------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | | `queries` | An array with one or more vectors. ~~numpy.ndarray~~ | | _keyword-only_ | | | `batch_size` | The batch size to use. Default to `1024`. ~~int~~ | @@ -385,7 +385,7 @@ Change the embedding matrix to use different Thinc ops. > ``` | Name | Description | -|-------|----------------------------------------------------------| +| ----- | -------------------------------------------------------- | | `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ | ## Vectors.to_disk {#to_disk tag="method"} From 91acc3ea75d219ad07ed2b106e7b8bdcb01516dd Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 4 Mar 2022 17:17:36 +0900 Subject: [PATCH 081/177] Fix entity linker batching (#9669) * Partial fix of entity linker batching * Add import * Better name * Add `use_gold_ents` option, docs * Change to v2, create stub v1, update docs etc. * Fix error type Honestly no idea what the right type to use here is. ConfigValidationError seems wrong. Maybe a NotImplementedError? * Make mypy happy * Add hacky fix for init issue * Add legacy pipeline entity linker * Fix references to class name * Add __init__.py for legacy * Attempted fix for loss issue * Remove placeholder V1 * formatting * slightly more interesting train data * Handle batches with no usable examples This adds a test for batches that have docs but not entities, and a check in the component that detects such cases and skips the update step as thought the batch were empty. * Remove todo about data verification Check for empty data was moved further up so this should be OK now - the case in question shouldn't be possible. 
* Fix gradient calculation The model doesn't know which entities are not in the kb, so it generates embeddings for the context of all of them. However, the loss does know which entities aren't in the kb, and it ignores them, as there's no sensible gradient. This has the issue that the gradient will not be calculated for some of the input embeddings, which causes a dimension mismatch in backprop. That should have caused a clear error, but with numpyops it was causing nans to happen, which is another problem that should be addressed separately. This commit changes the loss to give a zero gradient for entities not in the kb. * add failing test for v1 EL legacy architecture * Add nasty but simple working check for legacy arch * Clarify why init hack works the way it does * Clarify use_gold_ents use case * Fix use gold ents related handling * Add tests for no gold ents and fix other tests * Use aligned ents function (not working) This doesn't actually work because the "aligned" ents are gold-only. But if I have a different function that returns the intersection, *then* this will work as desired. * Use proper matching ent check This changes the process when gold ents are not used so that the intersection of ents in the pred and gold is used. 
* Move get_matching_ents to Example * Use model attribute to check for legacy arch * Rename flag * bump spacy-legacy to lower 3.0.9 Co-authored-by: svlandeg --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/cli/templates/quickstart_training.jinja | 4 +- spacy/ml/extract_spans.py | 2 +- spacy/ml/models/entity_linker.py | 60 ++- spacy/pipeline/entity_linker.py | 135 ++++-- spacy/pipeline/legacy/__init__.py | 3 + spacy/pipeline/legacy/entity_linker.py | 427 ++++++++++++++++++ spacy/tests/pipeline/test_entity_linker.py | 151 ++++++- spacy/training/example.pyx | 23 + website/docs/api/architectures.md | 4 +- website/docs/api/entitylinker.md | 1 + 12 files changed, 765 insertions(+), 49 deletions(-) create mode 100644 spacy/pipeline/legacy/__init__.py create mode 100644 spacy/pipeline/legacy/entity_linker.py diff --git a/requirements.txt b/requirements.txt index ca4099be5..b8970f686 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Our libraries -spacy-legacy>=3.0.8,<3.1.0 +spacy-legacy>=3.0.9,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/setup.cfg b/setup.cfg index 586a044ff..ed3bf63ce 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ setup_requires = thinc>=8.0.12,<8.1.0 install_requires = # Our libraries - spacy-legacy>=3.0.8,<3.1.0 + spacy-legacy>=3.0.9,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index fb79a4f60..da533b767 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -131,7 +131,7 @@ incl_context = true incl_prior = true [components.entity_linker.model] -@architectures = "spacy.EntityLinker.v1" +@architectures = "spacy.EntityLinker.v2" nO = null [components.entity_linker.model.tok2vec] @@ -303,7 +303,7 @@ incl_context = true incl_prior = true [components.entity_linker.model] 
-@architectures = "spacy.EntityLinker.v1" +@architectures = "spacy.EntityLinker.v2" nO = null [components.entity_linker.model.tok2vec] diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py index edc86ff9c..d5e9bc07c 100644 --- a/spacy/ml/extract_spans.py +++ b/spacy/ml/extract_spans.py @@ -63,4 +63,4 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d: def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]: - return (Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths)) + return Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 831fee90f..0149bea89 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,34 +1,82 @@ from pathlib import Path -from typing import Optional, Callable, Iterable, List +from typing import Optional, Callable, Iterable, List, Tuple from thinc.types import Floats2d from thinc.api import chain, clone, list2ragged, reduce_mean, residual -from thinc.api import Model, Maxout, Linear +from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged from ...util import registry from ...kb import KnowledgeBase, Candidate, get_candidates from ...vocab import Vocab from ...tokens import Span, Doc +from ..extract_spans import extract_spans +from ...errors import Errors -@registry.architectures("spacy.EntityLinker.v1") +@registry.architectures("spacy.EntityLinker.v2") def build_nel_encoder( tok2vec: Model, nO: Optional[int] = None ) -> Model[List[Doc], Floats2d]: - with Model.define_operators({">>": chain, "**": clone}): + with Model.define_operators({">>": chain, "&": tuplify}): token_width = tok2vec.maybe_get_dim("nO") output_layer = Linear(nO=nO, nI=token_width) model = ( - tok2vec - >> list2ragged() + ((tok2vec >> list2ragged()) & build_span_maker()) + >> extract_spans() >> reduce_mean() >> residual(Maxout(nO=token_width, 
nI=token_width, nP=2, dropout=0.0)) # type: ignore[arg-type] >> output_layer ) model.set_ref("output_layer", output_layer) model.set_ref("tok2vec", tok2vec) + # flag to show this isn't legacy + model.attrs["include_span_maker"] = True return model +def build_span_maker(n_sents: int = 0) -> Model: + model: Model = Model("span_maker", forward=span_maker_forward) + model.attrs["n_sents"] = n_sents + return model + + +def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callable]: + ops = model.ops + n_sents = model.attrs["n_sents"] + candidates = [] + for doc in docs: + cands = [] + try: + sentences = [s for s in doc.sents] + except ValueError: + # no sentence info, normal in initialization + for tok in doc: + tok.is_sent_start = tok.i == 0 + sentences = [doc[:]] + for ent in doc.ents: + try: + # find the sentence in the list of sentences. + sent_index = sentences.index(ent.sent) + except AttributeError: + # Catch the exception when ent.sent is None and provide a user-friendly warning + raise RuntimeError(Errors.E030) from None + # get n previous sentences, if there are any + start_sentence = max(0, sent_index - n_sents) + # get n posterior sentences, or as many < n as there are + end_sentence = min(len(sentences) - 1, sent_index + n_sents) + # get token positions + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + # save positions for extraction + cands.append((start_token, end_token)) + + candidates.append(ops.asarray2i(cands)) + candlens = ops.asarray1i([len(cands) for cands in candidates]) + candidates = ops.xp.concatenate(candidates) + outputs = Ragged(candidates, candlens) + # because this is just rearranging docs, the backprop does nothing + return outputs, lambda x: [] + + @registry.misc("spacy.KBFromFile.v1") def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]: def kb_from_file(vocab): diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 
1169e898d..89e7576bf 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -6,17 +6,17 @@ import srsly import random from thinc.api import CosineDistance, Model, Optimizer, Config from thinc.api import set_dropout_rate -import warnings from ..kb import KnowledgeBase, Candidate from ..ml import empty_kb from ..tokens import Doc, Span from .pipe import deserialize_config +from .legacy.entity_linker import EntityLinker_v1 from .trainable_pipe import TrainablePipe from ..language import Language from ..vocab import Vocab from ..training import Example, validate_examples, validate_get_examples -from ..errors import Errors, Warnings +from ..errors import Errors from ..util import SimpleFrozenList, registry from .. import util from ..scorer import Scorer @@ -26,7 +26,7 @@ BACKWARD_OVERWRITE = True default_model_config = """ [model] -@architectures = "spacy.EntityLinker.v1" +@architectures = "spacy.EntityLinker.v2" [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v2" @@ -55,6 +55,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "overwrite": True, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, + "use_gold_ents": True, }, default_score_weights={ "nel_micro_f": 1.0, @@ -75,6 +76,7 @@ def make_entity_linker( get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], overwrite: bool, scorer: Optional[Callable], + use_gold_ents: bool, ): """Construct an EntityLinker component. @@ -90,6 +92,22 @@ def make_entity_linker( produces a list of candidates, given a certain knowledge base and a textual mention. scorer (Optional[Callable]): The scoring method. 
""" + + if not model.attrs.get("include_span_maker", False): + # The only difference in arguments here is that use_gold_ents is not available + return EntityLinker_v1( + nlp.vocab, + model, + name, + labels_discard=labels_discard, + n_sents=n_sents, + incl_prior=incl_prior, + incl_context=incl_context, + entity_vector_length=entity_vector_length, + get_candidates=get_candidates, + overwrite=overwrite, + scorer=scorer, + ) return EntityLinker( nlp.vocab, model, @@ -102,6 +120,7 @@ def make_entity_linker( get_candidates=get_candidates, overwrite=overwrite, scorer=scorer, + use_gold_ents=use_gold_ents, ) @@ -136,6 +155,7 @@ class EntityLinker(TrainablePipe): get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], overwrite: bool = BACKWARD_OVERWRITE, scorer: Optional[Callable] = entity_linker_score, + use_gold_ents: bool, ) -> None: """Initialize an entity linker. @@ -152,6 +172,8 @@ class EntityLinker(TrainablePipe): produces a list of candidates, given a certain knowledge base and a textual mention. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. + use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another + component must provide entity annotations. DOCS: https://spacy.io/api/entitylinker#init """ @@ -169,6 +191,7 @@ class EntityLinker(TrainablePipe): # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. 
self.kb = empty_kb(entity_vector_length)(self.vocab) self.scorer = scorer + self.use_gold_ents = use_gold_ents def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will @@ -212,14 +235,48 @@ class EntityLinker(TrainablePipe): doc_sample = [] vector_sample = [] for example in islice(get_examples(), 10): - doc_sample.append(example.x) + doc = example.x + if self.use_gold_ents: + doc.ents = example.y.ents + doc_sample.append(doc) vector_sample.append(self.model.ops.alloc1f(nO)) assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(vector_sample) > 0, Errors.E923.format(name=self.name) + + # XXX In order for size estimation to work, there has to be at least + # one entity. It's not used for training so it doesn't have to be real, + # so we add a fake one if none are present. + # We can't use Doc.has_annotation here because it can be True for docs + # that have been through an NER component but got no entities. + has_annotations = any([doc.ents for doc in doc_sample]) + if not has_annotations: + doc = doc_sample[0] + ent = doc[0:1] + ent.label_ = "XXX" + doc.ents = (ent,) + self.model.initialize( X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32") ) + if not has_annotations: + # Clean up dummy annotation + doc.ents = [] + + def batch_has_learnable_example(self, examples): + """Check if a batch contains a learnable example. + + If one isn't present, then the update step needs to be skipped. 
+ """ + + for eg in examples: + for ent in eg.predicted.ents: + candidates = list(self.get_candidates(self.kb, ent)) + if candidates: + return True + + return False + def update( self, examples: Iterable[Example], @@ -247,35 +304,29 @@ class EntityLinker(TrainablePipe): if not examples: return losses validate_examples(examples, "EntityLinker.update") - sentence_docs = [] - for eg in examples: - sentences = [s for s in eg.reference.sents] - kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) - for ent in eg.reference.ents: - # KB ID of the first token is the same as the whole span - kb_id = kb_ids[ent.start] - if kb_id: - try: - # find the sentence in the list of sentences. - sent_index = sentences.index(ent.sent) - except AttributeError: - # Catch the exception when ent.sent is None and provide a user-friendly warning - raise RuntimeError(Errors.E030) from None - # get n previous sentences, if there are any - start_sentence = max(0, sent_index - self.n_sents) - # get n posterior sentences, or as many < n as there are - end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) - # get token positions - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end - # append that span as a doc to training - sent_doc = eg.predicted[start_token:end_token].as_doc() - sentence_docs.append(sent_doc) + set_dropout_rate(self.model, drop) - if not sentence_docs: - warnings.warn(Warnings.W093.format(name="Entity Linker")) + docs = [eg.predicted for eg in examples] + # save to restore later + old_ents = [doc.ents for doc in docs] + + for doc, ex in zip(docs, examples): + if self.use_gold_ents: + doc.ents = ex.reference.ents + else: + # only keep matching ents + doc.ents = ex.get_matching_ents() + + # make sure we have something to learn from, if not, short-circuit + if not self.batch_has_learnable_example(examples): return losses - sentence_encodings, bp_context = self.model.begin_update(sentence_docs) + + sentence_encodings, bp_context = 
self.model.begin_update(docs) + + # now restore the ents + for doc, old in zip(docs, old_ents): + doc.ents = old + loss, d_scores = self.get_loss( sentence_encodings=sentence_encodings, examples=examples ) @@ -288,24 +339,38 @@ class EntityLinker(TrainablePipe): def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): validate_examples(examples, "EntityLinker.get_loss") entity_encodings = [] + eidx = 0 # indices in gold entities to keep + keep_ents = [] # indices in sentence_encodings to keep + for eg in examples: kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) + for ent in eg.reference.ents: kb_id = kb_ids[ent.start] if kb_id: entity_encoding = self.kb.get_vector(kb_id) entity_encodings.append(entity_encoding) + keep_ents.append(eidx) + + eidx += 1 entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") - if sentence_encodings.shape != entity_encodings.shape: + selected_encodings = sentence_encodings[keep_ents] + + # If the entity encodings list is empty, then + if selected_encodings.shape != entity_encodings.shape: err = Errors.E147.format( method="get_loss", msg="gold entities do not match up" ) raise RuntimeError(err) # TODO: fix typing issue here - gradients = self.distance.get_grad(sentence_encodings, entity_encodings) # type: ignore - loss = self.distance.get_loss(sentence_encodings, entity_encodings) # type: ignore + gradients = self.distance.get_grad(selected_encodings, entity_encodings) # type: ignore + # to match the input size, we need to give a zero gradient for items not in the kb + out = self.model.ops.alloc2f(*sentence_encodings.shape) + out[keep_ents] = gradients + + loss = self.distance.get_loss(selected_encodings, entity_encodings) # type: ignore loss = loss / len(entity_encodings) - return float(loss), gradients + return float(loss), out def predict(self, docs: Iterable[Doc]) -> List[str]: """Apply the pipeline's model to a batch of docs, without modifying them. 
diff --git a/spacy/pipeline/legacy/__init__.py b/spacy/pipeline/legacy/__init__.py new file mode 100644 index 000000000..f216840dc --- /dev/null +++ b/spacy/pipeline/legacy/__init__.py @@ -0,0 +1,3 @@ +from .entity_linker import EntityLinker_v1 + +__all__ = ["EntityLinker_v1"] diff --git a/spacy/pipeline/legacy/entity_linker.py b/spacy/pipeline/legacy/entity_linker.py new file mode 100644 index 000000000..6440c18e5 --- /dev/null +++ b/spacy/pipeline/legacy/entity_linker.py @@ -0,0 +1,427 @@ +# This file is present to provide a prior version of the EntityLinker component +# for backwards compatability. For details see #9669. + +from typing import Optional, Iterable, Callable, Dict, Union, List, Any +from thinc.types import Floats2d +from pathlib import Path +from itertools import islice +import srsly +import random +from thinc.api import CosineDistance, Model, Optimizer, Config +from thinc.api import set_dropout_rate +import warnings + +from ...kb import KnowledgeBase, Candidate +from ...ml import empty_kb +from ...tokens import Doc, Span +from ..pipe import deserialize_config +from ..trainable_pipe import TrainablePipe +from ...language import Language +from ...vocab import Vocab +from ...training import Example, validate_examples, validate_get_examples +from ...errors import Errors, Warnings +from ...util import SimpleFrozenList, registry +from ... import util +from ...scorer import Scorer + +# See #9050 +BACKWARD_OVERWRITE = True + + +def entity_linker_score(examples, **kwargs): + return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs) + + +class EntityLinker_v1(TrainablePipe): + """Pipeline component for named entity linking. 
+ + DOCS: https://spacy.io/api/entitylinker + """ + + NIL = "NIL" # string used to refer to a non-existing link + + def __init__( + self, + vocab: Vocab, + model: Model, + name: str = "entity_linker", + *, + labels_discard: Iterable[str], + n_sents: int, + incl_prior: bool, + incl_context: bool, + entity_vector_length: int, + get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], + overwrite: bool = BACKWARD_OVERWRITE, + scorer: Optional[Callable] = entity_linker_score, + ) -> None: + """Initialize an entity linker. + + vocab (Vocab): The shared vocabulary. + model (thinc.api.Model): The Thinc Model powering the pipeline component. + name (str): The component instance name, used to add entries to the + losses during training. + labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction. + n_sents (int): The number of neighbouring sentences to take into account. + incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. + incl_context (bool): Whether or not to include the local context in the model. + entity_vector_length (int): Size of encoding vectors in the KB. + get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that + produces a list of candidates, given a certain knowledge base and a textual mention. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_links. + + DOCS: https://spacy.io/api/entitylinker#init + """ + self.vocab = vocab + self.model = model + self.name = name + self.labels_discard = list(labels_discard) + self.n_sents = n_sents + self.incl_prior = incl_prior + self.incl_context = incl_context + self.get_candidates = get_candidates + self.cfg: Dict[str, Any] = {"overwrite": overwrite} + self.distance = CosineDistance(normalize=False) + # how many neighbour sentences to take into account + # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. 
+ self.kb = empty_kb(entity_vector_length)(self.vocab) + self.scorer = scorer + + def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): + """Define the KB of this pipe by providing a function that will + create it using this object's vocab.""" + if not callable(kb_loader): + raise ValueError(Errors.E885.format(arg_type=type(kb_loader))) + + self.kb = kb_loader(self.vocab) + + def validate_kb(self) -> None: + # Raise an error if the knowledge base is not initialized. + if self.kb is None: + raise ValueError(Errors.E1018.format(name=self.name)) + if len(self.kb) == 0: + raise ValueError(Errors.E139.format(name=self.name)) + + def initialize( + self, + get_examples: Callable[[], Iterable[Example]], + *, + nlp: Optional[Language] = None, + kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None, + ): + """Initialize the pipe for training, using a representative set + of data examples. + + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. + nlp (Language): The current nlp object the component is part of. + kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance. + Note that providing this argument, will overwrite all data accumulated in the current KB. + Use this only when loading a KB as-such from file. 
+ + DOCS: https://spacy.io/api/entitylinker#initialize + """ + validate_get_examples(get_examples, "EntityLinker_v1.initialize") + if kb_loader is not None: + self.set_kb(kb_loader) + self.validate_kb() + nO = self.kb.entity_vector_length + doc_sample = [] + vector_sample = [] + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + vector_sample.append(self.model.ops.alloc1f(nO)) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(vector_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize( + X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32") + ) + + def update( + self, + examples: Iterable[Example], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + ) -> Dict[str, float]: + """Learn from a batch of documents and gold-standard information, + updating the pipe's model. Delegates to predict and get_loss. + + examples (Iterable[Example]): A batch of Example objects. + drop (float): The dropout rate. + sgd (thinc.api.Optimizer): The optimizer. + losses (Dict[str, float]): Optional record of the loss during training. + Updated using the component name as the key. + RETURNS (Dict[str, float]): The updated losses dictionary. + + DOCS: https://spacy.io/api/entitylinker#update + """ + self.validate_kb() + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + if not examples: + return losses + validate_examples(examples, "EntityLinker_v1.update") + sentence_docs = [] + for eg in examples: + sentences = [s for s in eg.reference.sents] + kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) + for ent in eg.reference.ents: + # KB ID of the first token is the same as the whole span + kb_id = kb_ids[ent.start] + if kb_id: + try: + # find the sentence in the list of sentences. 
+ sent_index = sentences.index(ent.sent) + except AttributeError: + # Catch the exception when ent.sent is None and provide a user-friendly warning + raise RuntimeError(Errors.E030) from None + # get n previous sentences, if there are any + start_sentence = max(0, sent_index - self.n_sents) + # get n posterior sentences, or as many < n as there are + end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) + # get token positions + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + # append that span as a doc to training + sent_doc = eg.predicted[start_token:end_token].as_doc() + sentence_docs.append(sent_doc) + set_dropout_rate(self.model, drop) + if not sentence_docs: + warnings.warn(Warnings.W093.format(name="Entity Linker")) + return losses + sentence_encodings, bp_context = self.model.begin_update(sentence_docs) + loss, d_scores = self.get_loss( + sentence_encodings=sentence_encodings, examples=examples + ) + bp_context(d_scores) + if sgd is not None: + self.finish_update(sgd) + losses[self.name] += loss + return losses + + def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): + validate_examples(examples, "EntityLinker_v1.get_loss") + entity_encodings = [] + for eg in examples: + kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) + for ent in eg.reference.ents: + kb_id = kb_ids[ent.start] + if kb_id: + entity_encoding = self.kb.get_vector(kb_id) + entity_encodings.append(entity_encoding) + entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") + if sentence_encodings.shape != entity_encodings.shape: + err = Errors.E147.format( + method="get_loss", msg="gold entities do not match up" + ) + raise RuntimeError(err) + # TODO: fix typing issue here + gradients = self.distance.get_grad(sentence_encodings, entity_encodings) # type: ignore + loss = self.distance.get_loss(sentence_encodings, entity_encodings) # type: ignore + loss = loss / len(entity_encodings) + return 
float(loss), gradients + + def predict(self, docs: Iterable[Doc]) -> List[str]: + """Apply the pipeline's model to a batch of docs, without modifying them. + Returns the KB IDs for each entity in each doc, including NIL if there is + no prediction. + + docs (Iterable[Doc]): The documents to predict. + RETURNS (List[str]): The models prediction for each document. + + DOCS: https://spacy.io/api/entitylinker#predict + """ + self.validate_kb() + entity_count = 0 + final_kb_ids: List[str] = [] + if not docs: + return final_kb_ids + if isinstance(docs, Doc): + docs = [docs] + for i, doc in enumerate(docs): + sentences = [s for s in doc.sents] + if len(doc) > 0: + # Looping through each entity (TODO: rewrite) + for ent in doc.ents: + sent = ent.sent + sent_index = sentences.index(sent) + assert sent_index >= 0 + # get n_neighbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) + xp = self.model.ops.xp + if self.incl_context: + sentence_encoding = self.model.predict([sent_doc])[0] + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + entity_count += 1 + if ent.label_ in self.labels_discard: + # ignoring this entity - setting to NIL + final_kb_ids.append(self.NIL) + else: + candidates = list(self.get_candidates(self.kb, ent)) + if not candidates: + # no prediction possible for this entity - setting to NIL + final_kb_ids.append(self.NIL) + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) + else: + random.shuffle(candidates) + # set all prior probabilities to 0 if incl_prior=False + 
prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.incl_prior: + prior_probs = xp.asarray([0.0 for _ in candidates]) + scores = prior_probs + # add in similarity from the context + if self.incl_context: + entity_encodings = xp.asarray( + [c.entity_vector for c in candidates] + ) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + if len(entity_encodings) != len(prior_probs): + raise RuntimeError( + Errors.E147.format( + method="predict", + msg="vectors not of equal length", + ) + ) + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / ( + sentence_norm * entity_norm + ) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs * sims) + # TODO: thresholding + best_index = scores.argmax().item() + best_candidate = candidates[best_index] + final_kb_ids.append(best_candidate.entity_) + if not (len(final_kb_ids) == entity_count): + err = Errors.E147.format( + method="predict", msg="result variables not of equal length" + ) + raise RuntimeError(err) + return final_kb_ids + + def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None: + """Modify a batch of documents, using pre-computed scores. + + docs (Iterable[Doc]): The documents to modify. + kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict. + + DOCS: https://spacy.io/api/entitylinker#set_annotations + """ + count_ents = len([ent for doc in docs for ent in doc.ents]) + if count_ents != len(kb_ids): + raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) + i = 0 + overwrite = self.cfg["overwrite"] + for doc in docs: + for ent in doc.ents: + kb_id = kb_ids[i] + i += 1 + for token in ent: + if token.ent_kb_id == 0 or overwrite: + token.ent_kb_id_ = kb_id + + def to_bytes(self, *, exclude=tuple()): + """Serialize the pipe to a bytestring. + + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (bytes): The serialized object. 
+ + DOCS: https://spacy.io/api/entitylinker#to_bytes + """ + self._validate_serialization_attrs() + serialize = {} + if hasattr(self, "cfg") and self.cfg is not None: + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude) + serialize["kb"] = self.kb.to_bytes + serialize["model"] = self.model.to_bytes + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, *, exclude=tuple()): + """Load the pipe from a bytestring. + + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (TrainablePipe): The loaded object. + + DOCS: https://spacy.io/api/entitylinker#from_bytes + """ + self._validate_serialization_attrs() + + def load_model(b): + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) from None + + deserialize = {} + if hasattr(self, "cfg") and self.cfg is not None: + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude) + deserialize["kb"] = lambda b: self.kb.from_bytes(b) + deserialize["model"] = load_model + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() + ) -> None: + """Serialize the pipe to disk. + + path (str / Path): Path to a directory. + exclude (Iterable[str]): String names of serialization fields to exclude. 
+ + DOCS: https://spacy.io/api/entitylinker#to_disk + """ + serialize = {} + serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude) + serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) + serialize["kb"] = lambda p: self.kb.to_disk(p) + serialize["model"] = lambda p: self.model.to_disk(p) + util.to_disk(path, serialize, exclude) + + def from_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() + ) -> "EntityLinker_v1": + """Load the pipe from disk. Modifies the object in place and returns it. + + path (str / Path): Path to a directory. + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (EntityLinker): The modified EntityLinker object. + + DOCS: https://spacy.io/api/entitylinker#from_disk + """ + + def load_model(p): + try: + with p.open("rb") as infile: + self.model.from_bytes(infile.read()) + except AttributeError: + raise ValueError(Errors.E149) from None + + deserialize: Dict[str, Callable[[Any], Any]] = {} + deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p)) + deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude) + deserialize["kb"] = lambda p: self.kb.from_disk(p) + deserialize["model"] = load_model + util.from_disk(path, deserialize, exclude) + return self + + def rehearse(self, examples, *, sgd=None, losses=None, **config): + raise NotImplementedError + + def add_label(self, label): + raise NotImplementedError diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 3740e430e..7d1382741 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -9,6 +9,9 @@ from spacy.compat import pickle from spacy.kb import Candidate, KnowledgeBase, get_candidates from spacy.lang.en import English from spacy.ml import load_kb +from spacy.pipeline import EntityLinker +from spacy.pipeline.legacy import EntityLinker_v1 +from spacy.pipeline.tok2vec 
import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer from spacy.tests.util import make_tempdir from spacy.tokens import Span @@ -168,6 +171,45 @@ def test_issue7065_b(): assert doc +def test_no_entities(): + # Test that having no entities doesn't crash the model + TRAIN_DATA = [ + ( + "The sky is blue.", + { + "sent_starts": [1, 0, 0, 0, 0], + }, + ) + ] + nlp = English() + vector_length = 3 + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) + return mykb + + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + + # this will run the pipeline on the examples and shouldn't crash + results = nlp.evaluate(train_examples) + + def test_partial_links(): # Test that having some entities on the doc without gold links, doesn't crash TRAIN_DATA = [ @@ -650,7 +692,7 @@ TRAIN_DATA = [ "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}), ("Russ Cochran his reprints include EC Comics.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, - "entities": [(0, 12, "PERSON")], + "entities": [(0, 12, "PERSON"), (34, 43, "ART")], "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}), ("Russ Cochran has been publishing comic art.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, @@ -693,6 +735,7 @@ def test_overfitting_IO(): # Create the Entity Linker component and add it to the pipeline 
entity_linker = nlp.add_pipe("entity_linker", last=True) + assert isinstance(entity_linker, EntityLinker) entity_linker.set_kb(create_kb) assert "Q2146908" in entity_linker.vocab.strings assert "Q2146908" in entity_linker.kb.vocab.strings @@ -922,3 +965,109 @@ def test_scorer_links(): assert scores["nel_micro_p"] == 2 / 3 assert scores["nel_micro_r"] == 2 / 4 + + +# fmt: off +@pytest.mark.parametrize( + "name,config", + [ + ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}), + ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}), + ], +) +# fmt: on +def test_legacy_architectures(name, config): + # Ensure that the legacy architectures still work + vector_length = 3 + nlp = English() + + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp.make_doc(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.5, 0.5], + ) + return mykb + + entity_linker = nlp.add_pipe(name, config={"model": config}) + if config["@architectures"] == "spacy.EntityLinker.v1": + assert isinstance(entity_linker, EntityLinker_v1) + else: + assert isinstance(entity_linker, EntityLinker) + entity_linker.set_kb(create_kb) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + @pytest.mark.parametrize("patterns", [ + # perfect case + [{"label": "CHARACTER", "pattern": "Kirby"}], + # typo for false negative + [{"label": "PERSON", "pattern": "Korby"}], + # random stuff for false positive + [{"label": "IS", "pattern": "is"}, {"label": "COLOR", 
"pattern": "pink"}], + ] + ) + def test_no_gold_ents(patterns): + # test that annotating components work + TRAIN_DATA = [ + ( + "Kirby is pink", + { + "links": {(0, 5): {"Q613241": 1.0}}, + "entities": [(0, 5, "CHARACTER")], + "sent_starts": [1, 0, 0], + }, + ) + ] + nlp = English() + vector_length = 3 + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + # Create a ruler to mark entities + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + + # Apply ruler to examples. In a real pipeline this would be an annotating component. + for eg in train_examples: + eg.predicted = ruler(eg.predicted) + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias("Kirby", ["Q613241"], [0.9]) + # Placeholder + mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5]) + mykb.add_alias("pink", ["pink"], [0.9]) + return mykb + + + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True) + entity_linker.set_kb(create_kb) + assert entity_linker.use_gold_ents == False + + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + + # this will run the pipeline on the examples and shouldn't crash + results = nlp.evaluate(train_examples) diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index d792c9bbf..a2c5e08e9 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -256,6 +256,29 @@ cdef class Example: x_ents, x_tags = self.get_aligned_ents_and_ner() return x_tags + def get_matching_ents(self, 
check_label=True): + """Return entities that are shared between predicted and reference docs. + + If `check_label` is True, entities must have matching labels to be + kept. Otherwise only the character indices need to match. + """ + gold = {} + for ent in self.reference: + gold[(ent.start_char, ent.end_char)] = ent.label + + keep = [] + for ent in self.predicted: + key = (ent.start_char, ent.end_char) + if key not in gold: + continue + + if check_label and ent.label != gold[key]: + continue + + keep.append(ent) + + return keep + def to_dict(self): return { "doc_annotation": { diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 07b76393f..5fb3546a7 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -858,13 +858,13 @@ into the "real world". This requires 3 main components: - A machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the most plausible ID from the set of candidates. -### spacy.EntityLinker.v1 {#EntityLinker} +### spacy.EntityLinker.v2 {#EntityLinker} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.EntityLinker.v1" +> @architectures = "spacy.EntityLinker.v2" > nO = null > > [model.tok2vec] diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 3d3372679..8e0d6087a 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -59,6 +59,7 @@ architectures and their arguments and hyperparameters. | `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | +| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. 
If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | | `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | From d89dac4066b3a245adb3982709bb7bb6eb9b9d63 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 4 Mar 2022 11:07:45 +0100 Subject: [PATCH 082/177] hook up meta in load_model_from_config (#10400) --- spacy/util.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 2a8b9f5cc..66e257dd8 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -485,13 +485,16 @@ def load_model_from_path( config_path = model_path / "config.cfg" overrides = dict_to_dot(config) config = load_config(config_path, overrides=overrides) - nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude) + nlp = load_model_from_config( + config, vocab=vocab, disable=disable, exclude=exclude, meta=meta + ) return nlp.from_disk(model_path, exclude=exclude, overrides=overrides) def load_model_from_config( config: Union[Dict[str, Any], Config], *, + meta: Dict[str, Any] = SimpleFrozenDict(), vocab: Union["Vocab", bool] = True, disable: Iterable[str] = SimpleFrozenList(), exclude: Iterable[str] = SimpleFrozenList(), @@ -529,6 +532,7 @@ def load_model_from_config( exclude=exclude, auto_fill=auto_fill, validate=validate, + meta=meta, ) return nlp From 6f4f57f3172112eb34336b0d6c0f0a0c930a5d1c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 7 Mar 2022 18:41:03 +0900 Subject: [PATCH 083/177] Update Issue
Templates (#10446) * Remove mention of python 3.10 wheels These were released a while ago, just forgot to remove this notice. * Add note about Discussions --- .github/ISSUE_TEMPLATE/01_bugs.md | 2 ++ .github/ISSUE_TEMPLATE/config.yml | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/01_bugs.md b/.github/ISSUE_TEMPLATE/01_bugs.md index 768832c24..255a5241e 100644 --- a/.github/ISSUE_TEMPLATE/01_bugs.md +++ b/.github/ISSUE_TEMPLATE/01_bugs.md @@ -4,6 +4,8 @@ about: Use this template if you came across a bug or unexpected behaviour differ --- + + ## How to reproduce the behaviour diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index fce1a1064..31f89f917 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,5 @@ blank_issues_enabled: false contact_links: - - name: ⚠️ Python 3.10 Support - url: https://github.com/explosion/spaCy/discussions/9418 - about: Python 3.10 wheels haven't been released yet, see the link for details. - name: 🗯 Discussions Forum url: https://github.com/explosion/spaCy/discussions about: Install issues, usage questions, general discussion and anything else that isn't a bug report. 
From a6d5824e5f8361078f4075541e7fd41b304cf379 Mon Sep 17 00:00:00 2001 From: David Berenstein Date: Mon, 7 Mar 2022 12:47:26 +0100 Subject: [PATCH 084/177] added classy-classification package to spacy universe (#10393) * Update universe.json added classy-classification to Spacy universe * Update universe.json added classy-classification to the spacy universe resources * Update universe.json corrected a small typo in json * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Update universe.json processed merge feedback * Update universe.json Co-authored-by: Sofie Van Landeghem --- website/meta/universe.json | 47 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 6374600f2..0179830d0 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2599,6 +2599,53 @@ }, "category": ["pipeline"] }, + { + "id": "classyclassification", + "slogan": "A Python library for classy few-shot and zero-shot classification within spaCy.", + "description": "Huggingface does offer some nice models for few/zero-shot classification, but these are not tailored to multi-lingual approaches. Rasa NLU has a nice approach for this, but it's too embedded in their codebase for easy usage outside of Rasa/chatbots. Additionally, it made sense to integrate sentence-transformers and Huggingface zero-shot, instead of default word embeddings.
Finally, I decided to integrate with spaCy, since training a custom spaCy TextCategorizer seems like a lot of hassle if you want something quick and dirty.", + "github": "davidberenstein1957/classy-classification", + "pip": "classy-classification", + "code_example": [ + "import spacy", + "import classy_classification", + "", + "data = {", + " \"furniture\": [\"This text is about chairs.\",", + " \"Couches, benches and televisions.\",", + " \"I really need to get a new sofa.\"],", + " \"kitchen\": [\"There also exist things like fridges.\",", + " \"I hope to be getting a new stove today.\",", + " \"Do you also have some ovens.\"]", + "}", + "", + "nlp = spacy.load('en_core_web_md')", + "", + "classification_type = \"spacy_few_shot\"", + "if classification_type == \"spacy_few_shot\":", + " nlp.add_pipe(\"text_categorizer\", ", + " config={\"data\": data, \"model\": \"spacy\"}", + " )", + "elif classification_type == \"sentence_transformer_few_shot\":", + " nlp.add_pipe(\"text_categorizer\", ", + " config={\"data\": data, \"model\": \"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2\"}", + " )", + "elif classification_type == \"huggingface_zero_shot\":", + " nlp.add_pipe(\"text_categorizer\", ", + " config={\"data\": list(data.keys()), \"cat_type\": \"zero\", \"model\": \"facebook/bart-large-mnli\"}", + " )", + "", + "print(nlp(\"I am looking for kitchen appliances.\")._.cats)", + "print([doc._.cats for doc in nlp.pipe([\"I am looking for kitchen appliances.\"])])" + ], + "author": "David Berenstein", + "author_links": { + "github": "davidberenstein1957", + "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/" + }, + "category": ["pipeline", "standalone"], + "tags": ["classification", "zero-shot", "few-shot", "sentence-transformers", "huggingface"], + "spacy_version": 3 + }, { "id": "blackstone", "title": "Blackstone", From 7ed7908716094ff41e4d1b2f60479f6b8356d700 Mon Sep 17 00:00:00 2001 From: jnphilipp Date: Mon, 7 Mar 2022 16:20:39 +0100 
Subject: [PATCH 085/177] Add Upper Sorbian support. (#10432) * Add basic support for Upper Sorbian. * Add tokenizer exceptions and tests. * Update spacy/lang/hsb/examples.py Co-authored-by: Sofie Van Landeghem --- spacy/lang/hsb/__init__.py | 18 ++++++ spacy/lang/hsb/examples.py | 15 +++++ spacy/lang/hsb/lex_attrs.py | 77 ++++++++++++++++++++++++++ spacy/lang/hsb/stop_words.py | 19 +++++++ spacy/lang/hsb/tokenizer_exceptions.py | 18 ++++++ spacy/tests/conftest.py | 5 ++ spacy/tests/lang/hsb/__init__.py | 0 spacy/tests/lang/hsb/test_text.py | 25 +++++++++ spacy/tests/lang/hsb/test_tokenizer.py | 32 +++++++++++ 9 files changed, 209 insertions(+) create mode 100644 spacy/lang/hsb/__init__.py create mode 100644 spacy/lang/hsb/examples.py create mode 100644 spacy/lang/hsb/lex_attrs.py create mode 100644 spacy/lang/hsb/stop_words.py create mode 100644 spacy/lang/hsb/tokenizer_exceptions.py create mode 100644 spacy/tests/lang/hsb/__init__.py create mode 100644 spacy/tests/lang/hsb/test_text.py create mode 100644 spacy/tests/lang/hsb/test_tokenizer.py diff --git a/spacy/lang/hsb/__init__.py b/spacy/lang/hsb/__init__.py new file mode 100644 index 000000000..034d82319 --- /dev/null +++ b/spacy/lang/hsb/__init__.py @@ -0,0 +1,18 @@ +from .lex_attrs import LEX_ATTRS +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from ...language import Language, BaseDefaults + + +class UpperSorbianDefaults(BaseDefaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + + +class UpperSorbian(Language): + lang = "hsb" + Defaults = UpperSorbianDefaults + + +__all__ = ["UpperSorbian"] diff --git a/spacy/lang/hsb/examples.py b/spacy/lang/hsb/examples.py new file mode 100644 index 000000000..0aafd5cee --- /dev/null +++ b/spacy/lang/hsb/examples.py @@ -0,0 +1,15 @@ +""" +Example sentences to test spaCy and its language models.
+ +>>> from spacy.lang.hsb.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin", + "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.", + "A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!", + "Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.", + "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej." +] diff --git a/spacy/lang/hsb/lex_attrs.py b/spacy/lang/hsb/lex_attrs.py new file mode 100644 index 000000000..dfda3e2db --- /dev/null +++ b/spacy/lang/hsb/lex_attrs.py @@ -0,0 +1,77 @@ +from ...attrs import LIKE_NUM + +_num_words = [ + "nul", + "jedyn", "jedna", "jedne", + "dwaj", "dwě", + "tři", "třo", + "štyri", "štyrjo", + "pjeć", + "šěsć", + "sydom", + "wosom", + "dźewjeć", + "dźesać", + "jědnaće", + "dwanaće", + "třinaće", + "štyrnaće", + "pjatnaće", + "šěsnaće", + "sydomnaće", + "wosomnaće", + "dźewjatnaće", + "dwaceći" + "třiceći", + "štyrceći", + "pjećdźesat", + "šěsćdźesat", + "sydomdźesat", + "wosomdźesat", + "dźewjećdźesat", + "sto", + "tysac", + "milion", + "miliarda", + "bilion", + "biliarda", + "trilion", + "triliarda", +] + +_ordinal_words = [ + "prěni", "prěnja", "prěnje", + "druhi", "druha", "druhe", + "třeći", "třeća", "třeće", + "štwórty", "štwórta", "štwórte", + "pjaty", "pjata", "pjate", + "šěsty", "šěsta", "šěste", + "sydmy", "sydma", "sydme", + "wosmy", "wosma", "wosme", + "dźewjaty", "dźewjata", "dźewjate", + "dźesaty", "dźesata", "dźesate", + "jědnaty", "jědnata", "jědnate", + "dwanaty", "dwanata", "dwanate" +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + text_lower = 
text.lower() + if text_lower in _num_words: + return True + # Check ordinal number + if text_lower in _ordinal_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/hsb/stop_words.py b/spacy/lang/hsb/stop_words.py new file mode 100644 index 000000000..e6fedaf4c --- /dev/null +++ b/spacy/lang/hsb/stop_words.py @@ -0,0 +1,19 @@ +STOP_WORDS = set( + """ +a abo ale ani + +dokelž + +hdyž + +jeli jelizo + +kaž + +pak potom + +tež tohodla + +zo zoby +""".split() +) diff --git a/spacy/lang/hsb/tokenizer_exceptions.py b/spacy/lang/hsb/tokenizer_exceptions.py new file mode 100644 index 000000000..4b9a4f98a --- /dev/null +++ b/spacy/lang/hsb/tokenizer_exceptions.py @@ -0,0 +1,18 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...symbols import ORTH, NORM +from ...util import update_exc + +_exc = dict() +for exc_data in [ + {ORTH: "mil.", NORM: "milion"}, + {ORTH: "wob.", NORM: "wobydler"}, +]: + _exc[exc_data[ORTH]] = [exc_data] + +for orth in [ + "resp.", +]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index f9266cb94..7083fd817 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -221,6 +221,11 @@ def ja_tokenizer(): return get_lang_class("ja")().tokenizer +@pytest.fixture(scope="session") +def hsb_tokenizer(): + return get_lang_class("hsb")().tokenizer + + @pytest.fixture(scope="session") def ko_tokenizer(): pytest.importorskip("natto") diff --git a/spacy/tests/lang/hsb/__init__.py b/spacy/tests/lang/hsb/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/hsb/test_text.py b/spacy/tests/lang/hsb/test_text.py new file mode 100644 index 000000000..aaa4984eb --- /dev/null +++ b/spacy/tests/lang/hsb/test_text.py @@ -0,0 +1,25 @@ +import pytest + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", 
True), + ("jedne", True), + ("dwanaće", True), + ("milion", True), + ("sto", True), + ("załožene", False), + ("wona", False), + ("powšitkownej", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(hsb_tokenizer, text, match): + tokens = hsb_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/hsb/test_tokenizer.py b/spacy/tests/lang/hsb/test_tokenizer.py new file mode 100644 index 000000000..a3ec89ba0 --- /dev/null +++ b/spacy/tests/lang/hsb/test_tokenizer.py @@ -0,0 +1,32 @@ +import pytest + +HSB_BASIC_TOKENIZATION_TESTS = [ + ( + "Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow, kotrež so zdźěla chětro wot so rozeznawachu.", + [ + "Hornjoserbšćina", + "wobsteji", + "resp.", + "wobsteješe", + "z", + "wjacorych", + "dialektow", + ",", + "kotrež", + "so", + "zdźěla", + "chětro", + "wot", + "so", + "rozeznawachu", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", HSB_BASIC_TOKENIZATION_TESTS) +def test_hsb_tokenizer_basic(hsb_tokenizer, text, expected_tokens): + tokens = hsb_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list From 61ba5450ff5de3c1bbbca21169772d1239ee822f Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 8 Mar 2022 00:56:57 +0900 Subject: [PATCH 086/177] Fix get_matching_ents (#10451) * Fix get_matching_ents Not sure what happened here - the code prior to this commit simply does not work. It's already covered by entity linker tests, which were succeeding in the NEL PR, but couldn't possibly succeed on master. * Fix test Test was indented inside another test and so doesn't seem to have been running properly. 
--- spacy/tests/pipeline/test_entity_linker.py | 108 ++++++++++----------- spacy/training/example.pyx | 4 +- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 7d1382741..af2132d73 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1009,65 +1009,65 @@ def test_legacy_architectures(name, config): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) - @pytest.mark.parametrize("patterns", [ - # perfect case - [{"label": "CHARACTER", "pattern": "Kirby"}], - # typo for false negative - [{"label": "PERSON", "pattern": "Korby"}], - # random stuff for false positive - [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}], - ] - ) - def test_no_gold_ents(patterns): - # test that annotating components work - TRAIN_DATA = [ - ( - "Kirby is pink", - { - "links": {(0, 5): {"Q613241": 1.0}}, - "entities": [(0, 5, "CHARACTER")], - "sent_starts": [1, 0, 0], - }, - ) - ] - nlp = English() - vector_length = 3 - train_examples = [] - for text, annotation in TRAIN_DATA: - doc = nlp(text) - train_examples.append(Example.from_dict(doc, annotation)) +@pytest.mark.parametrize("patterns", [ + # perfect case + [{"label": "CHARACTER", "pattern": "Kirby"}], + # typo for false negative + [{"label": "PERSON", "pattern": "Korby"}], + # random stuff for false positive + [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}], + ] +) +def test_no_gold_ents(patterns): + # test that annotating components work + TRAIN_DATA = [ + ( + "Kirby is pink", + { + "links": {(0, 5): {"Q613241": 1.0}}, + "entities": [(0, 5, "CHARACTER")], + "sent_starts": [1, 0, 0], + }, + ) + ] + nlp = English() + vector_length = 3 + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) - # Create a ruler to mark entities - ruler = 
nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) + # Create a ruler to mark entities + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) - # Apply ruler to examples. In a real pipeline this would be an annotating component. - for eg in train_examples: - eg.predicted = ruler(eg.predicted) + # Apply ruler to examples. In a real pipeline this would be an annotating component. + for eg in train_examples: + eg.predicted = ruler(eg.predicted) - def create_kb(vocab): - # create artificial KB - mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) - mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias("Kirby", ["Q613241"], [0.9]) - # Placeholder - mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5]) - mykb.add_alias("pink", ["pink"], [0.9]) - return mykb + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias("Kirby", ["Q613241"], [0.9]) + # Placeholder + mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5]) + mykb.add_alias("pink", ["pink"], [0.9]) + return mykb - # Create and train the Entity Linker - entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True) - entity_linker.set_kb(create_kb) - assert entity_linker.use_gold_ents == False + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True) + entity_linker.set_kb(create_kb) + assert entity_linker.use_gold_ents == False - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) - # adding additional components that are 
required for the entity_linker - nlp.add_pipe("sentencizer", first=True) + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) - # this will run the pipeline on the examples and shouldn't crash - results = nlp.evaluate(train_examples) + # this will run the pipeline on the examples and shouldn't crash + results = nlp.evaluate(train_examples) diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index a2c5e08e9..778dfd12a 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -263,11 +263,11 @@ cdef class Example: kept. Otherwise only the character indices need to match. """ gold = {} - for ent in self.reference: + for ent in self.reference.ents: gold[(ent.start_char, ent.end_char)] = ent.label keep = [] - for ent in self.predicted: + for ent in self.predicted.ents: key = (ent.start_char, ent.end_char) if key not in gold: continue From 5ca0dbae765c405f3aa74e32ab9e93d5ce752179 Mon Sep 17 00:00:00 2001 From: jnphilipp Date: Mon, 7 Mar 2022 16:57:14 +0100 Subject: [PATCH 087/177] Add Lower Sorbian support. (#10431) * Add basic support for Lower Sorbian. * Add some tests for dsb.
* Update spacy/lang/dsb/examples.py Co-authored-by: Sofie Van Landeghem --- spacy/lang/dsb/__init__.py | 16 ++++++ spacy/lang/dsb/examples.py | 15 +++++ spacy/lang/dsb/lex_attrs.py | 77 ++++++++++++++++++++++++++ spacy/lang/dsb/stop_words.py | 15 +++++ spacy/tests/conftest.py | 5 ++ spacy/tests/lang/dsb/__init__.py | 0 spacy/tests/lang/dsb/test_text.py | 25 +++++++++ spacy/tests/lang/dsb/test_tokenizer.py | 29 ++++++++++ 8 files changed, 182 insertions(+) create mode 100644 spacy/lang/dsb/__init__.py create mode 100644 spacy/lang/dsb/examples.py create mode 100644 spacy/lang/dsb/lex_attrs.py create mode 100644 spacy/lang/dsb/stop_words.py create mode 100644 spacy/tests/lang/dsb/__init__.py create mode 100644 spacy/tests/lang/dsb/test_text.py create mode 100644 spacy/tests/lang/dsb/test_tokenizer.py diff --git a/spacy/lang/dsb/__init__.py b/spacy/lang/dsb/__init__.py new file mode 100644 index 000000000..c66092a0c --- /dev/null +++ b/spacy/lang/dsb/__init__.py @@ -0,0 +1,16 @@ +from .lex_attrs import LEX_ATTRS +from .stop_words import STOP_WORDS +from ...language import Language, BaseDefaults + + +class LowerSorbianDefaults(BaseDefaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + + +class LowerSorbian(Language): + lang = "dsb" + Defaults = LowerSorbianDefaults + + +__all__ = ["LowerSorbian"] diff --git a/spacy/lang/dsb/examples.py b/spacy/lang/dsb/examples.py new file mode 100644 index 000000000..28b8c41f1 --- /dev/null +++ b/spacy/lang/dsb/examples.py @@ -0,0 +1,15 @@ +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.dsb.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.", + "Mi so tu jara derje spodoba.", + "Kotre nowniny chceće měć?", + "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.", + "Zwóstanjo pótakem hyšći wjele źěła." +] diff --git a/spacy/lang/dsb/lex_attrs.py b/spacy/lang/dsb/lex_attrs.py new file mode 100644 index 000000000..75fb2e590 --- /dev/null +++ b/spacy/lang/dsb/lex_attrs.py @@ -0,0 +1,77 @@ +from ...attrs import LIKE_NUM + +_num_words = [ + "nul", + "jaden", "jadna", "jadno", + "dwa", "dwě", + "tśi", "tśo", + "styri", "styrjo", + "pěś", "pěśo", + "šesć", "šesćo", + "sedym", "sedymjo", + "wósym", "wósymjo", + "źewjeś", "źewjeśo", + "źaseś", "źaseśo", + "jadnassćo", + "dwanassćo", + "tśinasćo", + "styrnasćo", + "pěśnasćo", + "šesnasćo", + "sedymnasćo", + "wósymnasćo", + "źewjeśnasćo", + "dwanasćo", "dwaźasća", + "tśiźasća", + "styrźasća", + "pěśźaset", + "šesćźaset", + "sedymźaset", + "wósymźaset", + "źewjeśźaset", + "sto", + "tysac", + "milion", + "miliarda", + "bilion", + "biliarda", + "trilion", + "triliarda", +] + +_ordinal_words = [ + "prědny", "prědna", "prědne", + "drugi", "druga", "druge", + "tśeśi", "tśeśa", "tśeśe", + "stwórty", "stwórta", "stwórte", + "pêty", "pěta", "pête", + "šesty", "šesta", "šeste", + "sedymy", "sedyma", "sedyme", + "wósymy", "wósyma", "wósyme", + "źewjety", "źewjeta", "źewjete", + "źasety", "źaseta", "źasete", + "jadnasty", "jadnasta", "jadnaste", + "dwanasty", "dwanasta", "dwanaste" +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + text_lower = 
text.lower() + if text_lower in _num_words: + return True + # Check ordinal number + if text_lower in _ordinal_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/dsb/stop_words.py b/spacy/lang/dsb/stop_words.py new file mode 100644 index 000000000..376e04aa6 --- /dev/null +++ b/spacy/lang/dsb/stop_words.py @@ -0,0 +1,15 @@ +STOP_WORDS = set( + """ +a abo aby ako ale až + +daniž dokulaž + +gaž + +jolic + +pak pótom + +teke togodla +""".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 7083fd817..24474c71e 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -99,6 +99,11 @@ def de_vocab(): return get_lang_class("de")().vocab +@pytest.fixture(scope="session") +def dsb_tokenizer(): + return get_lang_class("dsb")().tokenizer + + @pytest.fixture(scope="session") def el_tokenizer(): return get_lang_class("el")().tokenizer diff --git a/spacy/tests/lang/dsb/__init__.py b/spacy/tests/lang/dsb/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/dsb/test_text.py b/spacy/tests/lang/dsb/test_text.py new file mode 100644 index 000000000..40f2c15e0 --- /dev/null +++ b/spacy/tests/lang/dsb/test_text.py @@ -0,0 +1,25 @@ +import pytest + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("jadno", True), + ("dwanassćo", True), + ("milion", True), + ("sto", True), + ("ceła", False), + ("kopica", False), + ("narěcow", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(dsb_tokenizer, text, match): + tokens = dsb_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/dsb/test_tokenizer.py b/spacy/tests/lang/dsb/test_tokenizer.py new file mode 100644 index 000000000..135974fb8 --- /dev/null +++ b/spacy/tests/lang/dsb/test_tokenizer.py @@ -0,0 +1,29 @@ +import pytest + +DSB_BASIC_TOKENIZATION_TESTS = [ + ( + 
"Ale eksistěrujo mimo togo ceła kopica narěcow, ako na pśikład slěpjańska.", + [ + "Ale", + "eksistěrujo", + "mimo", + "togo", + "ceła", + "kopica", + "narěcow", + ",", + "ako", + "na", + "pśikład", + "slěpjańska", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS) +def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens): + tokens = dsb_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list From b2bbefd0b542fcad527b9badf97fd1c3c69a7bbf Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Mar 2022 17:03:45 +0100 Subject: [PATCH 088/177] Add Finnish, Korean, and Swedish models and Korean support notes (#10355) * Add Finnish, Korean, and Swedish models to website * Add Korean language support notes --- website/docs/usage/models.md | 47 +++++++++++++++++++++++++++++++++--- website/meta/languages.json | 21 +++++++++++++--- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 3b79c4d0d..f82da44d9 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -259,6 +259,45 @@ used for training the current [Japanese pipelines](/models/ja). +### Korean language support {#korean} + +> #### mecab-ko tokenizer +> +> ```python +> nlp = spacy.blank("ko") +> ``` + +The default MeCab-based Korean tokenizer requires: + +- [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) +- [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic) +- [natto-py](https://github.com/buruzaemon/natto-py) + +For some Korean datasets and tasks, the +[rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited +than MeCab. 
To configure a Korean pipeline with the rule-based tokenizer: + +> #### Rule-based tokenizer +> +> ```python +> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}} +> nlp = spacy.blank("ko", config=config) +> ``` + +```ini +### config.cfg +[nlp] +lang = "ko" +tokenizer = {"@tokenizers" = "spacy.Tokenizer.v1"} +``` + + + +The [Korean trained pipelines](/models/ko) use the rule-based tokenizer, so no +additional dependencies are required. + + + ## Installing and using trained pipelines {#download} The easiest way to download a trained pipeline is via spaCy's @@ -417,10 +456,10 @@ doc = nlp("This is a sentence.") You can use the [`info`](/api/cli#info) command or -[`spacy.info()`](/api/top-level#spacy.info) method to print a pipeline -package's meta data before loading it. Each `Language` object with a loaded -pipeline also exposes the pipeline's meta data as the attribute `meta`. For -example, `nlp.meta['version']` will return the package version. +[`spacy.info()`](/api/top-level#spacy.info) method to print a pipeline package's +meta data before loading it. Each `Language` object with a loaded pipeline also +exposes the pipeline's meta data as the attribute `meta`. For example, +`nlp.meta['version']` will return the package version. 
diff --git a/website/meta/languages.json b/website/meta/languages.json index a7dda6482..1c4379b6d 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -114,7 +114,12 @@ { "code": "fi", "name": "Finnish", - "has_examples": true + "has_examples": true, + "models": [ + "fi_core_news_sm", + "fi_core_news_md", + "fi_core_news_lg" + ] }, { "code": "fr", @@ -227,7 +232,12 @@ } ], "example": "이것은 문장입니다.", - "has_examples": true + "has_examples": true, + "models": [ + "ko_core_news_sm", + "ko_core_news_md", + "ko_core_news_lg" + ] }, { "code": "ky", @@ -388,7 +398,12 @@ { "code": "sv", "name": "Swedish", - "has_examples": true + "has_examples": true, + "models": [ + "sv_core_news_sm", + "sv_core_news_md", + "sv_core_news_lg" + ] }, { "code": "ta", From 60520d86693699c1221a4414a133f76ffb9601b0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 8 Mar 2022 13:51:11 +0100 Subject: [PATCH 089/177] Fix types in API docs for moves in parser and ner (#10464) --- website/docs/api/dependencyparser.md | 2 +- website/docs/api/entityrecognizer.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 118cdc611..103e0826e 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -100,7 +100,7 @@ shortcut for this and instantiate the component using its string name and | `vocab` | The shared vocabulary. ~~Vocab~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ | +| `moves` | A list of transition names. Inferred from the data if not provided. 
~~Optional[TransitionSystem]~~ | | _keyword-only_ | | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ | diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 14b6fece4..7c153f064 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -62,7 +62,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ | +| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[TransitionSystem]~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | | `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER will learn not to predict (exactly) those spans. 
Defaults to `None`. ~~Optional[str]~~ | @@ -98,7 +98,7 @@ shortcut for this and instantiate the component using its string name and | `vocab` | The shared vocabulary. ~~Vocab~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| `moves` | A list of transition names. Inferred from the data if set to `None`, which is the default. ~~Optional[List[str]]~~ | +| `moves` | A list of transition names. Inferred from the data if set to `None`, which is the default. ~~Optional[TransitionSystem]~~ | | _keyword-only_ | | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `incorrect_spans_key` | Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group in [`Doc.spans`](/api/doc#spans), under this key. Defaults to `None`. ~~Optional[str]~~ | From 191e8b31fa75f60b32f9e4779fe629b3c31e7c5e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 8 Mar 2022 14:28:46 +0100 Subject: [PATCH 090/177] Remove English tokenizer exception May. 
(#10463) --- spacy/lang/en/tokenizer_exceptions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 55b544e42..2c20b8c27 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -447,7 +447,6 @@ for exc_data in [ {ORTH: "La.", NORM: "Louisiana"}, {ORTH: "Mar.", NORM: "March"}, {ORTH: "Mass.", NORM: "Massachusetts"}, - {ORTH: "May.", NORM: "May"}, {ORTH: "Mich.", NORM: "Michigan"}, {ORTH: "Minn.", NORM: "Minnesota"}, {ORTH: "Miss.", NORM: "Mississippi"}, From 01ec6349eab7fd1d426a29bd6b9546826fb38bfa Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:04:10 -0500 Subject: [PATCH 091/177] Add `path.mkdir` to custom component examples of `to_disk` (#10348) * add `path.mkdir` to examples * add ensure_path + mkdir * update highlights --- website/docs/usage/processing-pipelines.md | 6 +++++- website/docs/usage/saving-loading.md | 12 +++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 11fd1459d..9e6ee54df 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1081,13 +1081,17 @@ on [serialization methods](/usage/saving-loading/#serialization-methods). > directory. ```python -### Custom serialization methods {highlight="6-7,9-11"} +### Custom serialization methods {highlight="7-11,13-15"} import srsly +from spacy.util import ensure_path class AcronymComponent: # other methods here... 
def to_disk(self, path, exclude=tuple()): + path = ensure_path(path) + if not path.exists(): + path.mkdir() srsly.write_json(path / "data.json", self.data) def from_disk(self, path, exclude=tuple()): diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 9dad077e7..af140e7a7 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -202,7 +202,9 @@ the data to and from a JSON file. > rules _with_ the component data. ```python -### {highlight="14-18,20-25"} +### {highlight="16-23,25-30"} +from spacy.util import ensure_path + @Language.factory("my_component") class CustomComponent: def __init__(self): @@ -218,6 +220,9 @@ class CustomComponent: def to_disk(self, path, exclude=tuple()): # This will receive the directory path + /my_component + path = ensure_path(path) + if not path.exists(): + path.mkdir() data_path = path / "data.json" with data_path.open("w", encoding="utf8") as f: f.write(json.dumps(self.data)) @@ -467,7 +472,12 @@ pipeline package. When you save out a pipeline using `nlp.to_disk` and the component exposes a `to_disk` method, it will be called with the disk path. ```python +from spacy.util import ensure_path + def to_disk(self, path, exclude=tuple()): + path = ensure_path(path) + if not path.exists(): + path.mkdir() snek_path = path / "snek.txt" with snek_path.open("w", encoding="utf8") as snek_file: snek_file.write(self.snek) From 297dd82c86372c7aa0a181e55dc72512718aafe8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Mar 2022 10:50:47 +0100 Subject: [PATCH 092/177] Fix initial special cases for Tokenizer.explain (#10460) Add the missing initial check for special cases to `Tokenizer.explain` to align with `Tokenizer._tokenize_affixes`. 
--- spacy/tests/tokenizer/test_tokenizer.py | 13 +++++++++++ spacy/tokenizer.pyx | 4 ++++ website/docs/usage/linguistic-features.md | 28 ++++++++++++++--------- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index a7270cb1e..ed11508b4 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -521,3 +521,16 @@ def test_tokenizer_infix_prefix(en_vocab): assert tokens == ["±10", "%"] explain_tokens = [t[1] for t in tokenizer.explain("±10%")] assert tokens == explain_tokens + + +def test_tokenizer_initial_special_case_explain(en_vocab): + tokenizer = Tokenizer( + en_vocab, + token_match=re.compile("^id$").match, + rules={ + "id": [{"ORTH": "i"}, {"ORTH": "d"}], + } + ) + tokens = [t.text for t in tokenizer("id")] + explain_tokens = [t[1] for t in tokenizer.explain("id")] + assert tokens == explain_tokens diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 91f228032..ac55a61f3 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -643,6 +643,10 @@ cdef class Tokenizer: for substring in text.split(): suffixes = [] while substring: + if substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' + continue while prefix_search(substring) or suffix_search(substring): if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index f8baf5588..c3f25565a 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -799,6 +799,10 @@ def tokenizer_pseudo_code( for substring in text.split(): suffixes = [] while substring: + if substring in special_cases: + tokens.extend(special_cases[substring]) + substring = "" + continue while prefix_search(substring) or 
suffix_search(substring): if token_match(substring): tokens.append(substring) @@ -851,20 +855,22 @@ def tokenizer_pseudo_code( The algorithm can be summarized as follows: 1. Iterate over space-separated substrings. -2. Look for a token match. If there is a match, stop processing and keep this - token. -3. Check whether we have an explicitly defined special case for this substring. +2. Check whether we have an explicitly defined special case for this substring. If we do, use it. -4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, +3. Look for a token match. If there is a match, stop processing and keep this + token. +4. Check whether we have an explicitly defined special case for this substring. + If we do, use it. +5. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #3, so that the token match and special cases always get priority. -5. If we didn't consume a prefix, try to consume a suffix and then go back to - #2. -6. If we can't consume a prefix or a suffix, look for a URL match. -7. If there's no URL match, then look for a special case. -8. Look for "infixes" – stuff like hyphens etc. and split the substring into +6. If we didn't consume a prefix, try to consume a suffix and then go back to + #3. +7. If we can't consume a prefix or a suffix, look for a URL match. +8. If there's no URL match, then look for a special case. +9. Look for "infixes" – stuff like hyphens etc. and split the substring into tokens on all infixes. -9. Once we can't consume any more of the string, handle it as a single token. -10. Make a final pass over the text to check for special cases that include +10. Once we can't consume any more of the string, handle it as a single token. +11. Make a final pass over the text to check for special cases that include spaces or that were missed due to the incremental processing of affixes. 
From 1bbf23207487da4463e8de96efdb2145b408823e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 11 Mar 2022 12:20:23 +0100 Subject: [PATCH 093/177] Auto-format code with black (#10479) * Auto-format code with black * Update spacy/lang/hsb/lex_attrs.py Co-authored-by: explosion-bot Co-authored-by: Adriane Boyd --- spacy/lang/dsb/examples.py | 2 +- spacy/lang/dsb/lex_attrs.py | 82 ++++++++++++++++------ spacy/lang/hsb/examples.py | 2 +- spacy/lang/hsb/lex_attrs.py | 63 ++++++++++++----- spacy/tests/pipeline/test_entity_linker.py | 24 ++++--- 5 files changed, 121 insertions(+), 52 deletions(-) diff --git a/spacy/lang/dsb/examples.py b/spacy/lang/dsb/examples.py index 28b8c41f1..6e9143826 100644 --- a/spacy/lang/dsb/examples.py +++ b/spacy/lang/dsb/examples.py @@ -11,5 +11,5 @@ sentences = [ "Mi so tu jara derje spodoba.", "Kotre nowniny chceće měć?", "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.", - "Zwóstanjo pótakem hyšći wjele źěła." 
+ "Zwóstanjo pótakem hyšći wjele źěła.", ] diff --git a/spacy/lang/dsb/lex_attrs.py b/spacy/lang/dsb/lex_attrs.py index 75fb2e590..367b3afb8 100644 --- a/spacy/lang/dsb/lex_attrs.py +++ b/spacy/lang/dsb/lex_attrs.py @@ -2,16 +2,27 @@ from ...attrs import LIKE_NUM _num_words = [ "nul", - "jaden", "jadna", "jadno", - "dwa", "dwě", - "tśi", "tśo", - "styri", "styrjo", - "pěś", "pěśo", - "šesć", "šesćo", - "sedym", "sedymjo", - "wósym", "wósymjo", - "źewjeś", "źewjeśo", - "źaseś", "źaseśo", + "jaden", + "jadna", + "jadno", + "dwa", + "dwě", + "tśi", + "tśo", + "styri", + "styrjo", + "pěś", + "pěśo", + "šesć", + "šesćo", + "sedym", + "sedymjo", + "wósym", + "wósymjo", + "źewjeś", + "źewjeśo", + "źaseś", + "źaseśo", "jadnassćo", "dwanassćo", "tśinasćo", @@ -21,7 +32,8 @@ _num_words = [ "sedymnasćo", "wósymnasćo", "źewjeśnasćo", - "dwanasćo", "dwaźasća", + "dwanasćo", + "dwaźasća", "tśiźasća", "styrźasća", "pěśźaset", @@ -40,18 +52,42 @@ _num_words = [ ] _ordinal_words = [ - "prědny", "prědna", "prědne", - "drugi", "druga", "druge", - "tśeśi", "tśeśa", "tśeśe", - "stwórty", "stwórta", "stwórte", - "pêty", "pěta", "pête", - "šesty", "šesta", "šeste", - "sedymy", "sedyma", "sedyme", - "wósymy", "wósyma", "wósyme", - "źewjety", "źewjeta", "źewjete", - "źasety", "źaseta", "źasete", - "jadnasty", "jadnasta", "jadnaste", - "dwanasty", "dwanasta", "dwanaste" + "prědny", + "prědna", + "prědne", + "drugi", + "druga", + "druge", + "tśeśi", + "tśeśa", + "tśeśe", + "stwórty", + "stwórta", + "stwórte", + "pêty", + "pěta", + "pête", + "šesty", + "šesta", + "šeste", + "sedymy", + "sedyma", + "sedyme", + "wósymy", + "wósyma", + "wósyme", + "źewjety", + "źewjeta", + "źewjete", + "źasety", + "źaseta", + "źasete", + "jadnasty", + "jadnasta", + "jadnaste", + "dwanasty", + "dwanasta", + "dwanaste", ] diff --git a/spacy/lang/hsb/examples.py b/spacy/lang/hsb/examples.py index 0aafd5cee..21f6f7584 100644 --- a/spacy/lang/hsb/examples.py +++ b/spacy/lang/hsb/examples.py @@ -11,5 +11,5 @@ 
sentences = [ "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.", "A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!", "Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.", - "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej." + "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej.", ] diff --git a/spacy/lang/hsb/lex_attrs.py b/spacy/lang/hsb/lex_attrs.py index dfda3e2db..5f300a73d 100644 --- a/spacy/lang/hsb/lex_attrs.py +++ b/spacy/lang/hsb/lex_attrs.py @@ -2,10 +2,15 @@ from ...attrs import LIKE_NUM _num_words = [ "nul", - "jedyn", "jedna", "jedne", - "dwaj", "dwě", - "tři", "třo", - "štyri", "štyrjo", + "jedyn", + "jedna", + "jedne", + "dwaj", + "dwě", + "tři", + "třo", + "štyri", + "štyrjo", "pjeć", "šěsć", "sydom", @@ -21,7 +26,7 @@ _num_words = [ "sydomnaće", "wosomnaće", "dźewjatnaće", - "dwaceći" + "dwaceći", "třiceći", "štyrceći", "pjećdźesat", @@ -40,18 +45,42 @@ _num_words = [ ] _ordinal_words = [ - "prěni", "prěnja", "prěnje", - "druhi", "druha", "druhe", - "třeći", "třeća", "třeće", - "štwórty", "štwórta", "štwórte", - "pjaty", "pjata", "pjate", - "šěsty", "šěsta", "šěste", - "sydmy", "sydma", "sydme", - "wosmy", "wosma", "wosme", - "dźewjaty", "dźewjata", "dźewjate", - "dźesaty", "dźesata", "dźesate", - "jědnaty", "jědnata", "jědnate", - "dwanaty", "dwanata", "dwanate" + "prěni", + "prěnja", + "prěnje", + "druhi", + "druha", + "druhe", + "třeći", + "třeća", + "třeće", + "štwórty", + "štwórta", + "štwórte", + "pjaty", + "pjata", + "pjate", + "šěsty", + "šěsta", + "šěste", + "sydmy", + "sydma", + "sydme", + "wosmy", + "wosma", + "wosme", + "dźewjaty", + "dźewjata", + "dźewjate", + "dźesaty", + "dźesata", + "dźesate", + "jědnaty", + "jědnata", + "jědnate", + "dwanaty", + "dwanata", + "dwanate", ] diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 
af2132d73..83d5bf0e2 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1009,14 +1009,17 @@ def test_legacy_architectures(name, config): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) -@pytest.mark.parametrize("patterns", [ - # perfect case - [{"label": "CHARACTER", "pattern": "Kirby"}], - # typo for false negative - [{"label": "PERSON", "pattern": "Korby"}], - # random stuff for false positive - [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}], - ] + +@pytest.mark.parametrize( + "patterns", + [ + # perfect case + [{"label": "CHARACTER", "pattern": "Kirby"}], + # typo for false negative + [{"label": "PERSON", "pattern": "Korby"}], + # random stuff for false positive + [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}], + ], ) def test_no_gold_ents(patterns): # test that annotating components work @@ -1055,9 +1058,10 @@ def test_no_gold_ents(patterns): mykb.add_alias("pink", ["pink"], [0.9]) return mykb - # Create and train the Entity Linker - entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True) + entity_linker = nlp.add_pipe( + "entity_linker", config={"use_gold_ents": False}, last=True + ) entity_linker.set_kb(create_kb) assert entity_linker.use_gold_ents == False From 6af6c2e86cc7b08573b261563786bd1ab87d45e9 Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Mon, 14 Mar 2022 16:41:31 +0800 Subject: [PATCH 094/177] Add a note to the dev docs on mypy (#10485) --- extra/DEVELOPER_DOCS/Code Conventions.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/extra/DEVELOPER_DOCS/Code Conventions.md b/extra/DEVELOPER_DOCS/Code Conventions.md index eba466c46..37cd8ff27 100644 --- a/extra/DEVELOPER_DOCS/Code Conventions.md +++ b/extra/DEVELOPER_DOCS/Code Conventions.md @@ -137,7 +137,7 @@ If any of the TODOs you've added are important and 
should be fixed soon, you sho ## Type hints -We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation. +We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation. Ideally when developing, run `mypy spacy` on the code base to inspect any issues. If possible, you should always use the more descriptive type hints like `List[str]` or even `List[Any]` instead of only `list`. We also annotate arguments and return types of `Callable` – although, you can simplify this if the type otherwise gets too verbose (e.g. functions that return factories to create callbacks). Remember that `Callable` takes two values: a **list** of the argument type(s) in order, and the return values. @@ -155,6 +155,13 @@ def create_callback(some_arg: bool) -> Callable[[str, int], List[str]]: return callback ``` +For typing variables, we prefer the explicit format. + +```diff +- var = value # type: Type ++ var: Type = value +``` + For model architectures, Thinc also provides a collection of [custom types](https://thinc.ai/docs/api-types), including more specific types for arrays and model inputs/outputs. 
Even outside of static type checking, using these types will make the code a lot easier to read and follow, since it's always clear what array types are expected (and what might go wrong if the output is different from the expected type). ```python From 23bc93d3d286ca050ae18a9e120331d94454229d Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 14 Mar 2022 15:17:22 +0100 Subject: [PATCH 095/177] limit pytest to <7.1 (#10488) * limit pytest to <7.1 * 7.1.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b8970f686..a034dec27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,7 +26,7 @@ typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8" # Development dependencies pre-commit>=2.13.0 cython>=0.25,<3.0 -pytest>=5.2.0 +pytest>=5.2.0,<7.1.0 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<3.10.0 From b68bf43f5bf07b78c062777f35240f031374fe00 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Mon, 14 Mar 2022 15:47:57 +0100 Subject: [PATCH 096/177] Add spans to doc.to_json (#10073) * Add spans to to_json * adjustments to_json * Change docstring * change doc key naming * Update spacy/tokens/doc.pyx Co-authored-by: Adriane Boyd Co-authored-by: Adriane Boyd --- spacy/tests/doc/test_to_json.py | 12 +++++++++++- spacy/tokens/doc.pyx | 11 ++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index 9ebee6c88..202281654 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -1,5 +1,5 @@ import pytest -from spacy.tokens import Doc +from spacy.tokens import Doc, Span @pytest.fixture() @@ -60,3 +60,13 @@ def test_doc_to_json_underscore_error_serialize(doc): Doc.set_extension("json_test4", method=lambda doc: doc.text) with pytest.raises(ValueError): doc.to_json(underscore=["json_test4"]) + + +def 
test_doc_to_json_span(doc): + """Test that Doc.to_json() includes spans""" + doc.spans["test"] = [Span(doc, 0, 2, "test"), Span(doc, 0, 1, "test")] + json_doc = doc.to_json() + assert "spans" in json_doc + assert len(json_doc["spans"]) == 1 + assert len(json_doc["spans"]["test"]) == 2 + assert json_doc["spans"]["test"][0]["start"] == 0 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index d33764ac9..1a48705fd 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1457,7 +1457,7 @@ cdef class Doc: underscore (list): Optional list of string names of custom doc._. attributes. Attribute values need to be JSON-serializable. Values will be added to an "_" key in the data, e.g. "_": {"foo": "bar"}. - RETURNS (dict): The data in spaCy's JSON format. + RETURNS (dict): The data in JSON format. """ data = {"text": self.text} if self.has_annotation("ENT_IOB"): @@ -1486,6 +1486,15 @@ cdef class Doc: token_data["dep"] = token.dep_ token_data["head"] = token.head.i data["tokens"].append(token_data) + + if self.spans: + data["spans"] = {} + for span_group in self.spans: + data["spans"][span_group] = [] + for span in self.spans[span_group]: + span_data = {"start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_} + data["spans"][span_group].append(span_data) + if underscore: data["_"] = {} for attr in underscore: From 2eef47dd26a5acbc3f667a2bc3b1ddf16f2d1b07 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Mon, 14 Mar 2022 16:46:58 +0100 Subject: [PATCH 097/177] Save span candidates produced by spancat suggesters (#10413) * Add save_candidates attribute * Change spancat api * Add unit test * reimplement method to produce a list of doc * Add method to docs * Add new version tag * Add intended use to docstring * prettier formatting --- spacy/pipeline/spancat.py | 18 ++++++++++++++++++ spacy/tests/pipeline/test_spancat.py | 22 ++++++++++++++++++++++ website/docs/api/spancategorizer.md | 18 
++++++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 3759466d1..0a6138fbc 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -272,6 +272,24 @@ class SpanCategorizer(TrainablePipe): scores = self.model.predict((docs, indices)) # type: ignore return indices, scores + def set_candidates( + self, docs: Iterable[Doc], *, candidates_key: str = "candidates" + ) -> None: + """Use the spancat suggester to add a list of span candidates to a list of docs. + This method is intended to be used for debugging purposes. + + docs (Iterable[Doc]): The documents to modify. + candidates_key (str): Key of the Doc.spans dict to save the candidate spans under. + + DOCS: https://spacy.io/api/spancategorizer#set_candidates + """ + suggester_output = self.suggester(docs, ops=self.model.ops) + + for candidates, doc in zip(suggester_output, docs): # type: ignore + doc.spans[candidates_key] = [] + for index in candidates.dataXd: + doc.spans[candidates_key].append(doc[index[0] : index[1]]) + def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None: """Modify a batch of Doc objects, using pre-computed scores. 
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 8060bc621..15256a763 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -397,3 +397,25 @@ def test_zero_suggestions(): assert set(spancat.labels) == {"LOC", "PERSON"} nlp.update(train_examples, sgd=optimizer) + + +def test_set_candidates(): + nlp = Language() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + train_examples = make_examples(nlp) + nlp.initialize(get_examples=lambda: train_examples) + texts = [ + "Just a sentence.", + "I like London and Berlin", + "I like Berlin", + "I eat ham.", + ] + + docs = [nlp(text) for text in texts] + spancat.set_candidates(docs) + + assert len(docs) == len(texts) + assert type(docs[0].spans["candidates"]) == SpanGroup + assert len(docs[0].spans["candidates"]) == 9 + assert docs[0].spans["candidates"][0].text == "Just" + assert docs[0].spans["candidates"][4].text == "Just a" diff --git a/website/docs/api/spancategorizer.md b/website/docs/api/spancategorizer.md index 26fcaefdf..fc666aaf7 100644 --- a/website/docs/api/spancategorizer.md +++ b/website/docs/api/spancategorizer.md @@ -239,6 +239,24 @@ Delegates to [`predict`](/api/spancategorizer#predict) and | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +## SpanCategorizer.set_candidates {#set_candidates tag="method", new="3.3"} + +Use the suggester to add a list of [`Span`](/api/span) candidates to a list of +[`Doc`](/api/doc) objects. This method is intended to be used for debugging +purposes. 
+ +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> spancat.set_candidates(docs, "candidates") +> ``` + +| Name | Description | +| ---------------- | -------------------------------------------------------------------- | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `candidates_key` | Key of the Doc.spans dict to save the candidate spans under. ~~str~~ | + ## SpanCategorizer.get_loss {#get_loss tag="method"} Find the loss and gradient of loss for the batch of documents and their From 0dc454ba9577262ba23279e66f5ea384dd6677fb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 15 Mar 2022 09:10:47 +0100 Subject: [PATCH 098/177] Update docs for Vocab.get_vector (#10486) * Update docs for Vocab.get_vector * Clarify description of 0-vector dimensions --- spacy/vocab.pyx | 5 +++-- website/docs/api/vocab.md | 9 +++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index badd291ed..58036fffa 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -354,8 +354,9 @@ cdef class Vocab: def get_vector(self, orth): """Retrieve a vector for a word in the vocabulary. Words can be looked - up by string or int ID. If no vectors data is loaded, ValueError is - raised. + up by string or int ID. If the current vectors do not contain an entry + for the word, a 0-vector with the same number of dimensions as the + current vectors is returned. orth (int / unicode): The hash value of a word, or its unicode string. RETURNS (numpy.ndarray or cupy.ndarray): A word vector. Size diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index c0a269d95..4698c68c3 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -168,22 +168,19 @@ cosines are calculated in minibatches to reduce memory usage. ## Vocab.get_vector {#get_vector tag="method" new="2"} Retrieve a vector for a word in the vocabulary. Words can be looked up by string -or hash value. 
If no vectors data is loaded, a `ValueError` is raised. If `minn` -is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s -subword features by average over n-grams of `orth` (introduced in spaCy `v2.1`). +or hash value. If the current vectors do not contain an entry for the word, a +0-vector with the same number of dimensions +([`Vocab.vectors_length`](#attributes)) as the current vectors is returned. > #### Example > > ```python > nlp.vocab.get_vector("apple") -> nlp.vocab.get_vector("apple", minn=1, maxn=5) > ``` | Name | Description | | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | | `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ | -| `minn` 2.1 | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ | -| `maxn` 2.1 | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ | | **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Vocab.set_vector {#set_vector tag="method" new="2"} From 610001e8c724ee57fec301469454d80e955385a8 Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Tue, 15 Mar 2022 11:12:04 +0100 Subject: [PATCH 099/177] Update universe.json (#10490) The project moved away from Rasa and into my personal GitHub account. 
--- website/meta/universe.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 0179830d0..e178eab1f 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -377,10 +377,10 @@ "title": "whatlies", "slogan": "Make interactive visualisations to figure out 'what lies' in word embeddings.", "description": "This small library offers tools to make visualisation easier of both word embeddings as well as operations on them. It has support for spaCy prebuilt models as a first class citizen but also offers support for sense2vec. There's a convenient API to perform linear algebra as well as support for popular transformations like PCA/UMAP/etc.", - "github": "rasahq/whatlies", + "github": "koaning/whatlies", "pip": "whatlies", "thumb": "https://i.imgur.com/rOkOiLv.png", - "image": "https://raw.githubusercontent.com/RasaHQ/whatlies/master/docs/gif-two.gif", + "image": "https://raw.githubusercontent.com/koaning/whatlies/master/docs/gif-two.gif", "code_example": [ "from whatlies import EmbeddingSet", "from whatlies.language import SpacyLanguage", From e8357923ec873e5a66129a0ee84e05d42e9234cb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 15 Mar 2022 11:12:50 +0100 Subject: [PATCH 100/177] Various install docs updates (#10487) * Simplify quickstart source install to use only editable pip install * Update pytorch install instructions to more recent versions --- website/docs/usage/embeddings-transformers.md | 12 ++++++------ website/src/widgets/quickstart-install.js | 9 +-------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 708cdd8bf..70fa95099 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -211,23 +211,23 @@ PyTorch as a dependency below, but it may not find the best version for your setup. 
```bash -### Example: Install PyTorch 1.7.1 for CUDA 10.1 with pip +### Example: Install PyTorch 1.11.0 for CUDA 11.3 with pip # See: https://pytorch.org/get-started/locally/ -$ pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html +$ pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html ``` Next, install spaCy with the extras for your CUDA version and transformers. The -CUDA extra (e.g., `cuda92`, `cuda102`, `cuda111`) installs the correct version -of [`cupy`](https://docs.cupy.dev/en/stable/install.html#installing-cupy), which +CUDA extra (e.g., `cuda102`, `cuda113`) installs the correct version of +[`cupy`](https://docs.cupy.dev/en/stable/install.html#installing-cupy), which is just like `numpy`, but for GPU. You may also need to set the `CUDA_PATH` environment variable if your CUDA runtime is installed in a non-standard -location. Putting it all together, if you had installed CUDA 10.2 in +location. Putting it all together, if you had installed CUDA 11.3 in `/opt/nvidia/cuda`, you would run: ```bash ### Installation with CUDA $ export CUDA_PATH="/opt/nvidia/cuda" -$ pip install -U %%SPACY_PKG_NAME[cuda102,transformers]%%SPACY_PKG_FLAGS +$ pip install -U %%SPACY_PKG_NAME[cuda113,transformers]%%SPACY_PKG_FLAGS ``` For [`transformers`](https://huggingface.co/transformers/) v4.0.0+ and models diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index 1c8ad19da..fbf043c7d 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -214,16 +214,9 @@ const QuickstartInstall = ({ id, title }) => { {nightly ? 
` --branch ${DEFAULT_BRANCH}` : ''} cd spaCy - - export PYTHONPATH=`pwd` - - - set PYTHONPATH=C:\path\to\spaCy - pip install -r requirements.txt - python setup.py build_ext --inplace - pip install {train || hardware == 'gpu' ? `'.[${pipExtras}]'` : '.'} + pip install --no-build-isolation --editable {train || hardware == 'gpu' ? `'.[${pipExtras}]'` : '.'} # packages only available via pip From e5debc68e4910384351938f574ede7c9b35a2a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 15 Mar 2022 14:15:31 +0100 Subject: [PATCH 101/177] Tagger: use unnormalized probabilities for inference (#10197) * Tagger: use unnormalized probabilities for inference Using unnormalized softmax avoids use of the relatively expensive exp function, which can significantly speed up non-transformer models (e.g. I got a speedup of 27% on a German tagging + parsing pipeline). * Add spacy.Tagger.v2 with configurable normalization Normalization of probabilities is disabled by default to improve performance. 
* Update documentation, models, and tests to spacy.Tagger.v2 * Move Tagger.v1 to spacy-legacy * docs/architectures: run prettier * Unnormalized softmax is now a Softmax_v2 option * Require thinc 8.0.14 and spacy-legacy 3.0.9 --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- spacy/cli/templates/quickstart_training.jinja | 8 +++---- spacy/ml/models/tagger.py | 10 +++++---- spacy/pipeline/morphologizer.pyx | 2 +- spacy/pipeline/senter.pyx | 2 +- spacy/pipeline/tagger.pyx | 2 +- spacy/tests/pipeline/test_tok2vec.py | 6 +++--- .../tests/serialize/test_serialize_config.py | 4 ++-- .../serialize/test_serialize_language.py | 2 +- spacy/tests/training/test_pretraining.py | 6 +++--- spacy/tests/training/test_training.py | 2 +- website/docs/api/architectures.md | 21 ++++++++++++++----- 14 files changed, 43 insertions(+), 30 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f81484d43..a43b4c814 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.12,<8.1.0", + "thinc>=8.0.14,<8.1.0", "blis>=0.4.0,<0.8.0", "pathy", "numpy>=1.15.0", diff --git a/requirements.txt b/requirements.txt index a034dec27..4da6d5df6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ spacy-legacy>=3.0.9,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.12,<8.1.0 +thinc>=8.0.14,<8.1.0 blis>=0.4.0,<0.8.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index ed3bf63ce..3c5ba884a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,7 +38,7 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.12,<8.1.0 + thinc>=8.0.14,<8.1.0 install_requires = # Our libraries spacy-legacy>=3.0.9,<3.1.0 @@ -46,7 +46,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.12,<8.1.0 + 
thinc>=8.0.14,<8.1.0 blis>=0.4.0,<0.8.0 wasabi>=0.8.1,<1.1.0 srsly>=2.4.1,<3.0.0 diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index da533b767..b84fb3a8f 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -54,7 +54,7 @@ stride = 96 factory = "morphologizer" [components.morphologizer.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" nO = null [components.morphologizer.model.tok2vec] @@ -70,7 +70,7 @@ grad_factor = 1.0 factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] @@ -238,7 +238,7 @@ maxout_pieces = 3 factory = "morphologizer" [components.morphologizer.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" nO = null [components.morphologizer.model.tok2vec] @@ -251,7 +251,7 @@ width = ${components.tok2vec.model.encode.width} factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 9c7fe042d..9f8ef7b2b 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -1,14 +1,14 @@ from typing import Optional, List -from thinc.api import zero_init, with_array, Softmax, chain, Model +from thinc.api import zero_init, with_array, Softmax_v2, chain, Model from thinc.types import Floats2d from ...util import registry from ...tokens import Doc -@registry.architectures("spacy.Tagger.v1") +@registry.architectures("spacy.Tagger.v2") def build_tagger_model( - tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None + tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None, normalize=False ) -> Model[List[Doc], List[Floats2d]]: """Build a tagger model, using a provided token-to-vector 
component. The tagger model simply adds a linear layer with softmax activation to predict scores @@ -19,7 +19,9 @@ def build_tagger_model( """ # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - output_layer = Softmax(nO, t2v_width, init_W=zero_init) + output_layer = Softmax_v2( + nO, t2v_width, init_W=zero_init, normalize_outputs=normalize + ) softmax = with_array(output_layer) # type: ignore model = chain(tok2vec, softmax) model.set_ref("tok2vec", tok2vec) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 73d3799b1..24f98508f 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -25,7 +25,7 @@ BACKWARD_EXTEND = False default_model_config = """ [model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [model.tok2vec] @architectures = "spacy.Tok2Vec.v2" diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 6d00e829d..6808fe70e 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -20,7 +20,7 @@ BACKWARD_OVERWRITE = False default_model_config = """ [model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v2" diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index e21a9096e..d6ecbf084 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -27,7 +27,7 @@ BACKWARD_OVERWRITE = False default_model_config = """ [model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v2" diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index a5ac85e1e..37104c78a 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -100,7 +100,7 @@ cfg_string = """ factory = "tagger" [components.tagger.model] - 
@architectures = "spacy.Tagger.v1" + @architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] @@ -263,7 +263,7 @@ cfg_string_multi = """ factory = "tagger" [components.tagger.model] - @architectures = "spacy.Tagger.v1" + @architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] @@ -373,7 +373,7 @@ cfg_string_multi_textcat = """ factory = "tagger" [components.tagger.model] - @architectures = "spacy.Tagger.v1" + @architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 1d50fd1d1..85e6f8b2c 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -59,7 +59,7 @@ subword_features = true factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" @@ -110,7 +110,7 @@ subword_features = true factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py index 6e7fa0e4e..c03287548 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -70,7 +70,7 @@ factory = "ner" factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py index 8ee54b544..9359c8485 100644 --- a/spacy/tests/training/test_pretraining.py +++ b/spacy/tests/training/test_pretraining.py @@ -38,7 +38,7 @@ subword_features = true factory = "tagger" 
[components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" @@ -62,7 +62,7 @@ pipeline = ["tagger"] factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [components.tagger.model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" @@ -106,7 +106,7 @@ subword_features = true factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 0d73300d8..f1f8ce9d4 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -241,7 +241,7 @@ maxout_pieces = 3 factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 5fb3546a7..2bddcb28c 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -104,7 +104,7 @@ consisting of a CNN and a layer-normalized maxout activation function. > factory = "tagger" > > [components.tagger.model] -> @architectures = "spacy.Tagger.v1" +> @architectures = "spacy.Tagger.v2" > > [components.tagger.model.tok2vec] > @architectures = "spacy.Tok2VecListener.v1" @@ -158,8 +158,8 @@ be configured with the `attrs` argument. The suggested attributes are `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some subword information, without construction a fully character-based representation. If pretrained vectors are available, they can be included in the -representation as well, with the vectors table kept static (i.e. it's -not updated). 
+representation as well, with the vectors table kept static (i.e. it's not +updated). | Name | Description | | ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -613,14 +613,15 @@ same signature, but the `use_upper` argument was `True` by default. ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} -### spacy.Tagger.v1 {#Tagger} +### spacy.Tagger.v2 {#Tagger} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.Tagger.v1" +> @architectures = "spacy.Tagger.v2" > nO = null +> normalize = false > > [model.tok2vec] > # ... @@ -634,8 +635,18 @@ the token vectors. | ----------- | ------------------------------------------------------------------------------------------ | | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | | `nO` | The number of tags to output. Inferred from the data if `None`. ~~Optional[int]~~ | +| `normalize` | Normalize probabilities during inference. Defaults to `False`. ~~bool~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | + + +- The `normalize` argument was added in `spacy.Tagger.v2`. `spacy.Tagger.v1` + always normalizes probabilities during inference. + +The other arguments are shared between all versions. 
+ + + ## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"} A text classification architecture needs to take a [`Doc`](/api/doc) as input, From e021dc6279621ccdb00bd69961d12a19e47218a1 Mon Sep 17 00:00:00 2001 From: David Berenstein Date: Tue, 15 Mar 2022 16:42:33 +0100 Subject: [PATCH 102/177] Updated explanation for classy classification (#10484) * Update universe.json added classy-classification to Spacy universe * Update universe.json added classy-classification to the spacy universe resources * Update universe.json corrected a small typo in json * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Update universe.json processed merge feedback * Update universe.json * updated information for Classy Classification Made a more comprehensible and easy description for Classy Classification based on feedback of Philip Vollet to prepare for sharing. * added note about examples * corrected for wrong formatting changes * Update website/meta/universe.json with small typo correction Co-authored-by: Adriane Boyd * resolved another typo * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem Co-authored-by: Adriane Boyd --- website/meta/universe.json | 43 +++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index e178eab1f..a930363a4 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2601,8 +2601,9 @@ }, { "id": "classyclassification", - "slogan": "A Python library for classy few-shot and zero-shot classification within spaCy.", - "description": "Huggingface does offer some nice models for few/zero-shot classification, but these are not tailored to multi-lingual approaches. 
Rasa NLU has a nice approach for this, but its too embedded in their codebase for easy usage outside of Rasa/chatbots. Additionally, it made sense to integrate sentence-transformers and Huggingface zero-shot, instead of default word embeddings. Finally, I decided to integrate with spaCy, since training a custom spaCy TextCategorizer seems like a lot of hassle if you want something quick and dirty.", + "title": "Classy Classification", + "slogan": "Have you ever struggled with needing a spaCy TextCategorizer but didn't have the time to train one from scratch? Classy Classification is the way to go!", + "description": "Have you ever struggled with needing a [spaCy TextCategorizer](https://spacy.io/api/textcategorizer) but didn't have the time to train one from scratch? Classy Classification is the way to go! For few-shot classification using [sentence-transformers](https://github.com/UKPLab/sentence-transformers) or [spaCy models](https://spacy.io/usage/models), provide a dictionary with labels and examples, or just provide a list of labels for zero shot-classification with [Huggingface zero-shot classifiers](https://huggingface.co/models?pipeline_tag=zero-shot-classification).", "github": "davidberenstein1957/classy-classification", "pip": "classy-classification", "code_example": [ @@ -2618,32 +2619,36 @@ " \"Do you also have some ovens.\"]", "}", "", + "# see github repo for examples on sentence-transformers and Huggingface", "nlp = spacy.load('en_core_web_md')", - "", - "classification_type = \"spacy_few_shot\"", - "if classification_type == \"spacy_few_shot\":", - " nlp.add_pipe(\"text_categorizer\", ", - " config={\"data\": data, \"model\": \"spacy\"}", - " )", - "elif classification_type == \"sentence_transformer_few_shot\":", - " nlp.add_pipe(\"text_categorizer\", ", - " config={\"data\": data, \"model\": \"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2\"}", - " )", - "elif classification_type == \"huggingface_zero_shot\":", - " 
nlp.add_pipe(\"text_categorizer\", ", - " config={\"data\": list(data.keys()), \"cat_type\": \"zero\", \"model\": \"facebook/bart-large-mnli\"}", - " )", + "nlp.add_pipe(\"text_categorizer\", ", + " config={", + " \"data\": data,", + " \"model\": \"spacy\"", + " }", + ")", "", "print(nlp(\"I am looking for kitchen appliances.\")._.cats)", - "print([doc._.cats for doc in nlp.pipe([\"I am looking for kitchen appliances.\"])])" + "# Output:", + "#", + "# [{\"label\": \"furniture\", \"score\": 0.21}, {\"label\": \"kitchen\", \"score\": 0.79}]" ], "author": "David Berenstein", "author_links": { "github": "davidberenstein1957", "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/" }, - "category": ["pipeline", "standalone"], - "tags": ["classification", "zero-shot", "few-shot", "sentence-transformers", "huggingface"], + "category": [ + "pipeline", + "standalone" + ], + "tags": [ + "classification", + "zero-shot", + "few-shot", + "sentence-transformers", + "huggingface" + ], "spacy_version": 3 }, { From a79cd3542b3dd667d8a97293462e22ed26a04ee5 Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Thu, 17 Mar 2022 01:14:34 +0800 Subject: [PATCH 103/177] Add displacy support for overlapping Spans (#10332) * Fix docstring for EntityRenderer * Add warning in displacy if doc.spans are empty * Implement parse_spans converter One notable change here is that the default spans_key is sc, and it's set by the user through the options. * Implement SpanRenderer Here, I implemented a SpanRenderer that looks similar to the EntityRenderer except for some templates. The spans_key, by default, is set to sc, but can be configured in the options (see parse_spans). The way I rendered these spans is per-token, i.e., I first check if each token (1) belongs to a given span type and (2) a starting token of a given span type. Once I have this information, I render them into the markup. 
* Fix mypy issues on typing * Add tests for displacy spans support * Update colors from RGB to hex Co-authored-by: Ines Montani * Remove unnecessary CSS properties * Add documentation for website * Remove unnecessary scripts * Update wording on the documentation Co-authored-by: Sofie Van Landeghem * Put typing dependency on top of file * Put back z-index so that spans overlap properly * Make warning more explicit for spans_key Co-authored-by: Ines Montani Co-authored-by: Sofie Van Landeghem --- spacy/displacy/__init__.py | 41 +++- spacy/displacy/render.py | 179 +++++++++++++++++- spacy/displacy/templates.py | 49 +++++ spacy/errors.py | 4 + spacy/tests/test_displacy.py | 86 +++++++++ website/docs/api/top-level.md | 32 +++- website/docs/images/displacy-span-custom.html | 31 +++ website/docs/images/displacy-span.html | 41 ++++ website/docs/usage/visualizers.md | 53 ++++++ 9 files changed, 501 insertions(+), 15 deletions(-) create mode 100644 website/docs/images/displacy-span-custom.html create mode 100644 website/docs/images/displacy-span.html diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 25d530c83..aa00c95d8 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -4,10 +4,10 @@ spaCy's built in visualization suite for dependencies and named entities. 
DOCS: https://spacy.io/api/top-level#displacy USAGE: https://spacy.io/usage/visualizers """ -from typing import Union, Iterable, Optional, Dict, Any, Callable +from typing import List, Union, Iterable, Optional, Dict, Any, Callable import warnings -from .render import DependencyRenderer, EntityRenderer +from .render import DependencyRenderer, EntityRenderer, SpanRenderer from ..tokens import Doc, Span from ..errors import Errors, Warnings from ..util import is_in_jupyter @@ -44,6 +44,7 @@ def render( factories = { "dep": (DependencyRenderer, parse_deps), "ent": (EntityRenderer, parse_ents), + "span": (SpanRenderer, parse_spans), } if style not in factories: raise ValueError(Errors.E087.format(style=style)) @@ -203,6 +204,42 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: return {"text": doc.text, "ents": ents, "title": title, "settings": settings} +def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: + """Generate spans in [{start: i, end: i, label: 'label'}] format. + + doc (Doc): Document to parse. + options (Dict[str, any]): Span-specific visualisation options. + RETURNS (dict): Generated span types keyed by text (original text) and spans. 
+ """ + kb_url_template = options.get("kb_url_template", None) + spans_key = options.get("spans_key", "sc") + spans = [ + { + "start": span.start_char, + "end": span.end_char, + "start_token": span.start, + "end_token": span.end, + "label": span.label_, + "kb_id": span.kb_id_ if span.kb_id_ else "", + "kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#", + } + for span in doc.spans[spans_key] + ] + tokens = [token.text for token in doc] + + if not spans: + warnings.warn(Warnings.W117.format(spans_key=spans_key)) + title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None + settings = get_doc_settings(doc) + return { + "text": doc.text, + "spans": spans, + "title": title, + "settings": settings, + "tokens": tokens, + } + + def set_render_wrapper(func: Callable[[str], str]) -> None: """Set an optional wrapper function that is called around the generated HTML markup on displacy.render. This can be used to allow integration into diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index a032d843b..2925c68a0 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -1,12 +1,15 @@ -from typing import Dict, Any, List, Optional, Union +from typing import Any, Dict, List, Optional, Union import uuid +import itertools -from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS -from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE -from .templates import TPL_ENTS, TPL_KB_LINK -from ..util import minify_html, escape_html, registry from ..errors import Errors - +from ..util import escape_html, minify_html, registry +from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS +from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS +from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN +from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL +from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS 
+from .templates import TPL_TITLE DEFAULT_LANG = "en" DEFAULT_DIR = "ltr" @@ -33,6 +36,168 @@ DEFAULT_LABEL_COLORS = { } +class SpanRenderer: + """Render Spans as SVGs.""" + + style = "span" + + def __init__(self, options: Dict[str, Any] = {}) -> None: + """Initialise span renderer + + options (dict): Visualiser-specific options (colors, spans) + """ + # Set up the colors and overall look + colors = dict(DEFAULT_LABEL_COLORS) + user_colors = registry.displacy_colors.get_all() + for user_color in user_colors.values(): + if callable(user_color): + # Since this comes from the function registry, we want to make + # sure we support functions that *return* a dict of colors + user_color = user_color() + if not isinstance(user_color, dict): + raise ValueError(Errors.E925.format(obj=type(user_color))) + colors.update(user_color) + colors.update(options.get("colors", {})) + self.default_color = DEFAULT_ENTITY_COLOR + self.colors = {label.upper(): color for label, color in colors.items()} + + # Set up how the text and labels will be rendered + self.direction = DEFAULT_DIR + self.lang = DEFAULT_LANG + self.top_offset = options.get("top_offset", 40) + self.top_offset_step = options.get("top_offset_step", 17) + + # Set up which templates will be used + template = options.get("template") + if template: + self.span_template = template["span"] + self.span_slice_template = template["slice"] + self.span_start_template = template["start"] + else: + if self.direction == "rtl": + self.span_template = TPL_SPAN_RTL + self.span_slice_template = TPL_SPAN_SLICE_RTL + self.span_start_template = TPL_SPAN_START_RTL + else: + self.span_template = TPL_SPAN + self.span_slice_template = TPL_SPAN_SLICE + self.span_start_template = TPL_SPAN_START + + def render( + self, parsed: List[Dict[str, Any]], page: bool = False, minify: bool = False + ) -> str: + """Render complete markup. + + parsed (list): Dependency parses to render. + page (bool): Render parses wrapped as full HTML page. 
+ minify (bool): Minify HTML markup. + RETURNS (str): Rendered HTML markup. + """ + rendered = [] + for i, p in enumerate(parsed): + if i == 0: + settings = p.get("settings", {}) + self.direction = settings.get("direction", DEFAULT_DIR) + self.lang = settings.get("lang", DEFAULT_LANG) + rendered.append(self.render_spans(p["tokens"], p["spans"], p.get("title"))) + + if page: + docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered]) + markup = TPL_PAGE.format(content=docs, lang=self.lang, dir=self.direction) + else: + markup = "".join(rendered) + if minify: + return minify_html(markup) + return markup + + def render_spans( + self, + tokens: List[str], + spans: List[Dict[str, Any]], + title: Optional[str], + ) -> str: + """Render span types in text. + + Spans are rendered per-token, this means that for each token, we check if it's part + of a span slice (a member of a span type) or a span start (the starting token of a + given span type). + + tokens (list): Individual tokens in the text + spans (list): Individual entity spans and their start, end, label, kb_id and kb_url. + title (str / None): Document title set in Doc.user_data['title']. + """ + per_token_info = [] + for idx, token in enumerate(tokens): + # Identify if a token belongs to a Span (and which) and if it's a + # start token of said Span. 
We'll use this for the final HTML render + token_markup: Dict[str, Any] = {} + token_markup["text"] = token + entities = [] + for span in spans: + ent = {} + if span["start_token"] <= idx < span["end_token"]: + ent["label"] = span["label"] + ent["is_start"] = True if idx == span["start_token"] else False + kb_id = span.get("kb_id", "") + kb_url = span.get("kb_url", "#") + ent["kb_link"] = ( + TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else "" + ) + entities.append(ent) + token_markup["entities"] = entities + per_token_info.append(token_markup) + + markup = self._render_markup(per_token_info) + markup = TPL_SPANS.format(content=markup, dir=self.direction) + if title: + markup = TPL_TITLE.format(title=title) + markup + return markup + + def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str: + """Render the markup from per-token information""" + markup = "" + for token in per_token_info: + entities = sorted(token["entities"], key=lambda d: d["label"]) + if entities: + slices = self._get_span_slices(token["entities"]) + starts = self._get_span_starts(token["entities"]) + markup += self.span_template.format( + text=token["text"], span_slices=slices, span_starts=starts + ) + else: + markup += escape_html(token["text"] + " ") + return markup + + def _get_span_slices(self, entities: List[Dict]) -> str: + """Get the rendered markup of all Span slices""" + span_slices = [] + for entity, step in zip(entities, itertools.count(step=self.top_offset_step)): + color = self.colors.get(entity["label"].upper(), self.default_color) + span_slice = self.span_slice_template.format( + bg=color, top_offset=self.top_offset + step + ) + span_slices.append(span_slice) + return "".join(span_slices) + + def _get_span_starts(self, entities: List[Dict]) -> str: + """Get the rendered markup of all Span start tokens""" + span_starts = [] + for entity, step in zip(entities, itertools.count(step=self.top_offset_step)): + color = self.colors.get(entity["label"].upper(), 
self.default_color) + span_start = ( + self.span_start_template.format( + bg=color, + top_offset=self.top_offset + step, + label=entity["label"], + kb_link=entity["kb_link"], + ) + if entity["is_start"] + else "" + ) + span_starts.append(span_start) + return "".join(span_starts) + + class DependencyRenderer: """Render dependency parses as SVGs.""" @@ -242,7 +407,7 @@ class EntityRenderer: style = "ent" def __init__(self, options: Dict[str, Any] = {}) -> None: - """Initialise dependency renderer. + """Initialise entity renderer. options (dict): Visualiser-specific options (colors, ents) """ diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index e7d3d4266..ff81e7a1d 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -62,6 +62,55 @@ TPL_ENT_RTL = """ """ +TPL_SPANS = """ +
      {content}
      +""" + +TPL_SPAN = """ + + {text} + {span_slices} + {span_starts} + +""" + +TPL_SPAN_SLICE = """ + + +""" + + +TPL_SPAN_START = """ + + + {label}{kb_link} + + + +""" + +TPL_SPAN_RTL = """ + + {text} + {span_slices} + {span_starts} + +""" + +TPL_SPAN_SLICE_RTL = """ + + +""" + +TPL_SPAN_START_RTL = """ + + + {label}{kb_link} + + +""" + + # Important: this needs to start with a space! TPL_KB_LINK = """ {kb_id} diff --git a/spacy/errors.py b/spacy/errors.py index 5399e489b..fe37351f7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -192,6 +192,10 @@ class Warnings(metaclass=ErrorsWithCodes): W115 = ("Skipping {method}: the floret vector table cannot be modified. " "Vectors are calculated from character ngrams.") W116 = ("Unable to clean attribute '{attr}'.") + W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is " + "surprising to you, make sure the Doc was processed using a model " + "that supports span categorization, and check the `doc.spans[spans_key]` " + "property manually if necessary.") class Errors(metaclass=ErrorsWithCodes): diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 392c95e42..ccad7e342 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -96,6 +96,92 @@ def test_issue5838(): assert found == 4 +def test_displacy_parse_spans(en_vocab): + """Test that spans on a Doc are converted into displaCy's format.""" + doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"]) + doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")] + spans = displacy.parse_spans(doc) + assert isinstance(spans, dict) + assert spans["text"] == "Welcome to the Bank of China " + assert spans["spans"] == [ + { + "start": 15, + "end": 28, + "start_token": 3, + "end_token": 6, + "label": "ORG", + "kb_id": "", + "kb_url": "#", + }, + { + "start": 23, + "end": 28, + "start_token": 5, + "end_token": 6, + "label": "GPE", + "kb_id": "", + "kb_url": "#", + }, 
+ ] + + +def test_displacy_parse_spans_with_kb_id_options(en_vocab): + """Test that spans with kb_id on a Doc are converted into displaCy's format""" + doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"]) + doc.spans["sc"] = [ + Span(doc, 3, 6, "ORG", kb_id="Q790068"), + Span(doc, 5, 6, "GPE", kb_id="Q148"), + ] + + spans = displacy.parse_spans( + doc, {"kb_url_template": "https://wikidata.org/wiki/{}"} + ) + assert isinstance(spans, dict) + assert spans["text"] == "Welcome to the Bank of China " + assert spans["spans"] == [ + { + "start": 15, + "end": 28, + "start_token": 3, + "end_token": 6, + "label": "ORG", + "kb_id": "Q790068", + "kb_url": "https://wikidata.org/wiki/Q790068", + }, + { + "start": 23, + "end": 28, + "start_token": 5, + "end_token": 6, + "label": "GPE", + "kb_id": "Q148", + "kb_url": "https://wikidata.org/wiki/Q148", + }, + ] + + +def test_displacy_parse_spans_different_spans_key(en_vocab): + """Test that spans in a different spans key will be parsed""" + doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"]) + doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")] + doc.spans["custom"] = [Span(doc, 3, 6, "BANK")] + spans = displacy.parse_spans(doc, options={"spans_key": "custom"}) + + assert isinstance(spans, dict) + assert spans["text"] == "Welcome to the Bank of China " + assert spans["spans"] == [ + { + "start": 15, + "end": 28, + "start_token": 3, + "end_token": 6, + "label": "BANK", + "kb_id": "", + "kb_url": "#", + } + ] + + def test_displacy_parse_ents(en_vocab): """Test that named entities on a Doc are converted into displaCy's format.""" doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 1a3e9da46..6d7431f28 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -320,12 +320,31 @@ If a setting is not present in the options, the default value will 
be used. | `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | | `kb_url_template` 3.2.1 | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in. ~~Optional[str]~~ | -By default, displaCy comes with colors for all entity types used by -[spaCy's trained pipelines](/models). If you're using custom entity types, you -can use the `colors` setting to add your own colors for them. Your application -or pipeline package can also expose a -[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) -to add custom labels and their colors automatically. + +#### Span Visualizer options {#displacy_options-span} + +> #### Example +> +> ```python +> options = {"spans_key": "sc"} +> displacy.serve(doc, style="span", options=options) +> ``` + +| Name | Description | +|-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| +| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ | +| `templates` | Dictionary containing the keys `"span"`, `"slice"`, and `"start"`. These dictate how the overall span, a span slice, and the starting token will be rendered. ~~Optional[Dict[str, str]]~~ | +| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with a single field to fill in. ~~Optional[str]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | + + +By default, displaCy comes with colors for all entity types used by [spaCy's +trained pipelines](/models) for both the entity and span visualizers.
If you're +using custom entity types, you can use the `colors` setting to add your own +colors for them. Your application or pipeline package can also expose a +[`spacy_displacy_colors` entry +point](/usage/saving-loading#entry-points-displacy) to add custom labels and +their colors automatically. By default, displaCy links to `#` for entities without a `kb_id` set on their span. If you wish to link an entity to their URL then consider using the @@ -335,6 +354,7 @@ span. If you wish to link an entity to their URL then consider using the should redirect you to their Wikidata page, in this case `https://www.wikidata.org/wiki/Q95`. + ## registry {#registry source="spacy/util.py" new="3"} spaCy's function registry extends diff --git a/website/docs/images/displacy-span-custom.html b/website/docs/images/displacy-span-custom.html new file mode 100644 index 000000000..97dd3b140 --- /dev/null +++ b/website/docs/images/displacy-span-custom.html @@ -0,0 +1,31 @@ +
      + Welcome to the + + Bank + + + + + BANK + + + + + of + + + + + China + + + + + . +
      \ No newline at end of file diff --git a/website/docs/images/displacy-span.html b/website/docs/images/displacy-span.html new file mode 100644 index 000000000..9bbc6403c --- /dev/null +++ b/website/docs/images/displacy-span.html @@ -0,0 +1,41 @@ +
      + Welcome to the + + Bank + + + + + ORG + + + + + of + + + + + + China + + + + + + + GPE + + + + . +
      \ No newline at end of file diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index 072718f91..f98c43224 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -167,6 +167,59 @@ This feature is especially handy if you're using displaCy to compare performance at different stages of a process, e.g. during training. Here you could use the title for a brief description of the text example and the number of iterations. +## Visualizing spans {#span} + +The span visualizer, `span`, highlights overlapping spans in a text. + +```python +### Span example +import spacy +from spacy import displacy +from spacy.tokens import Span + +text = "Welcome to the Bank of China." + +nlp = spacy.blank("en") +doc = nlp(text) + +doc.spans["sc"] = [ + Span(doc, 3, 6, "ORG"), + Span(doc, 5, 6, "GPE"), +] + +displacy.serve(doc, style="span") +``` + +import DisplacySpanHtml from 'images/displacy-span.html' + +