From 953f638aa503703edad17d0c6c9adac8c948a67b Mon Sep 17 00:00:00 2001 From: Phaninder Pasupula Date: Mon, 8 May 2017 11:48:05 +0100 Subject: [PATCH 001/195] Update _data.json --- website/docs/usage/_data.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 2ffbf9d68..0632d3972 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -149,6 +149,11 @@ "author": "Johannes Gontrum", "description": "spaCy accessed by a REST API, wrapped in a Docker container." }, + "spacy-nlp-zeromq": { + "url": "https://github.com/pasupulaphani/spacy-nlp-docker", + "author": "Phaninder Pasupula", + "description": "SpaCy with zeromq bindings docker." + }, "textacy": { "url": "https://github.com/chartbeat-labs/textacy", "author": " Burton DeWilde (Chartbeat)", From cdaefae60ac08fc0093f86a83d0c5953197eb9fd Mon Sep 17 00:00:00 2001 From: oeg Date: Fri, 12 May 2017 16:15:19 +0200 Subject: [PATCH 002/195] feature(populate_vocab): Enable pruning out rare words from clusters data --- spacy/cli/model.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spacy/cli/model.py b/spacy/cli/model.py index 3b9a77b93..4e7e0282b 100644 --- a/spacy/cli/model.py +++ b/spacy/cli/model.py @@ -98,10 +98,6 @@ def read_clusters(clusters_path): def populate_vocab(vocab, clusters, probs, oov_prob): - # Ensure probs has entries for all words seen during clustering. - for word in clusters: - if word not in probs: - probs[word] = oov_prob for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): lexeme = vocab[word] lexeme.prob = prob From e506811a93962c194c195ee277ca374267ba7cf0 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 03:27:50 +0200 Subject: [PATCH 003/195] Update description --- website/docs/usage/_data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 0632d3972..78e8b3e27 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -152,7 +152,7 @@ "spacy-nlp-zeromq": { "url": "https://github.com/pasupulaphani/spacy-nlp-docker", "author": "Phaninder Pasupula", - "description": "SpaCy with zeromq bindings docker." + "description": "Docker image exposing spaCy with ZeroMQ bindings." }, "textacy": { "url": "https://github.com/chartbeat-labs/textacy", From ad590feaa88f245b206daefe5743811de2ee2102 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 May 2017 11:36:19 +0200 Subject: [PATCH 004/195] Fix test, which imported English incorrectly --- spacy/tests/doc/test_doc_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 6c8f61a81..064cbecfd 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -219,7 +219,6 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors): def test_parse_tree(EN): text = 'I like New York in Autumn.' 
- EN = English(parser=False) doc = EN(text, tag=True) doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T) # full method parse_tree(text) is a trivial composition From c5669450a06f182bb369bc36fdaaaac5140d91a6 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 12:33:57 +0200 Subject: [PATCH 005/195] Fix formatting --- spacy/tokens/printers.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index d70088540..1cadfc5bf 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -1,13 +1,20 @@ from copy import deepcopy +# coding: utf8 +from __future__ import unicode_literals def merge_ents(doc): - '''Helper: merge adjacent entities into single tokens; modifies the doc.''' + """ + Helper: merge adjacent entities into single tokens; modifies the doc. + """ for ent in doc.ents: ent.merge(ent.root.tag_, ent.text, ent.label_) return doc + def format_POS(token, light, flat): - '''helper: form the POS output for a token''' + """ + Helper: form the POS output for a token. + """ subtree = dict([ ("word", token.text), ("lemma", token.lemma_), # trigger @@ -26,16 +33,21 @@ def format_POS(token, light, flat): return subtree def POS_tree(root, light, flat): - '''Helper: generate a POS tree for a root token. - The doc must have merge_ents(doc) ran on it. - ''' + + """ + Helper: generate a POS tree for a root token. The doc must have + merge_ents(doc) ran on it. + """ subtree = format_POS(root, light=light, flat=flat) for c in root.children: subtree["modifiers"].append(POS_tree(c)) return subtree + def parse_tree(doc, light=False, flat=False): - """Makes a copy of the doc, then construct a syntactic parse tree, similar to the one used in displaCy. Generates the POS tree for all sentences in a doc + """ + Makes a copy of the doc, then construct a syntactic parse tree, similar to + the one used in displaCy. Generates the POS tree for all sentences in a doc. Args: doc: The doc for parsing. From bd428c0a70e589457d1112b4ef5d674cba4c82dd Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 12:34:05 +0200 Subject: [PATCH 006/195] Set defaults for light and flat kwargs --- spacy/tokens/printers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index 1cadfc5bf..8ab111120 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -32,8 +32,8 @@ def format_POS(token, light, flat): subtree.pop("modifiers") return subtree -def POS_tree(root, light, flat): +def POS_tree(root, light=False, flat=False): """ Helper: generate a POS tree for a root token. The doc must have merge_ents(doc) ran on it. From 573f0ba867d41c81a0f9c0af3f2158463e4c972c Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 12:34:14 +0200 Subject: [PATCH 007/195] Replace deepcopy --- spacy/tokens/printers.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index 8ab111120..f9b1f3972 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -1,7 +1,10 @@ -from copy import deepcopy # coding: utf8 from __future__ import unicode_literals +from .doc import Doc +from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE + + def merge_ents(doc): """ Helper: merge adjacent entities into single tokens; modifies the doc. 
@@ -61,6 +64,8 @@ def parse_tree(doc, light=False, flat=False): >>> trees = doc.print_tree() [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] """ - doc_clone = deepcopy(doc) + doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) + doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], + doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE])) merge_ents(doc_clone) # merge the entities into single tokens first return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents] From 6e1dbc608eaa1725566cb48dddf4d969a027c88c Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 12:34:20 +0200 Subject: [PATCH 008/195] Fix parse_tree test --- spacy/tests/doc/test_doc_api.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 064cbecfd..1bc534ecd 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -217,10 +217,13 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors): assert doc.has_vector -def test_parse_tree(EN): +def test_parse_tree(en_tokenizer): + """Tests doc.print_tree() method.""" text = 'I like New York in Autumn.' 
- doc = EN(text, tag=True) - doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T) + heads = [1, 0, 1, -2, -3, -1, -5] + tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'] + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, tags=tags) # full method parse_tree(text) is a trivial composition trees = doc.print_tree() assert len(trees) > 0 From 24e973b17f23ce3be4abcf4ad7b7b3a6506a2701 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 13:09:00 +0200 Subject: [PATCH 009/195] Rename about.__docs__ to about.__docs_models__ --- spacy/about.py | 2 +- spacy/cli/download.py | 2 +- spacy/deprecated.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index ad4a021c2..8c0e0afd3 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -10,7 +10,7 @@ __author__ = 'Matthew Honnibal' __email__ = 'matt@explosion.ai' __license__ = 'MIT' -__docs__ = 'https://spacy.io/docs/usage' +__docs_models__ = 'https://spacy.io/docs/usage' __download_url__ = 'https://github.com/explosion/spacy-models/releases/download' __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json' diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 0419de118..70ca64b22 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -79,5 +79,5 @@ def check_error_depr(model): "As of v1.7.0, the download all command is deprecated. Please " "download the models individually via spacy.download [model name] " "or pip install. For more info on this, see the documentation: " - "{d}".format(d=about.__docs__), + "{d}".format(d=about.__docs_models__), title="Deprecated command") diff --git a/spacy/deprecated.py b/spacy/deprecated.py index 65053089a..3f3c69b88 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -146,7 +146,7 @@ class ModelDownload(): "The spacy.{l}.download command is now deprecated. Please use " "python -m spacy download [model name or shortcut] instead. For more " "info and available models, see the documentation: {d}. " - "Downloading default '{l}' model now...".format(d=about.__docs__, l=lang), + "Downloading default '{l}' model now...".format(d=about.__docs_models__, l=lang), title="Warning: deprecated command") download(lang) From 9003fd25e5e966bd8d1b67a18f3ebd6010d6f718 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 13 May 2017 13:14:02 +0200 Subject: [PATCH 010/195] Fix error messages if model is required (resolves #1051) Rename about.__docs__ to about.__docs_models__. --- spacy/lexeme.pyx | 10 +++++----- spacy/tokens/doc.pyx | 13 ++++++------- spacy/tokens/span.pyx | 6 +++--- spacy/tokens/token.pyx | 10 +++++----- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 5d9ce7b98..05d8bddc6 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -24,6 +24,7 @@ from .attrs cimport IS_QUOTE from .attrs cimport IS_LEFT_PUNCT from .attrs cimport IS_RIGHT_PUNCT from .attrs cimport IS_OOV +from . import about memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) @@ -137,11 +138,10 @@ cdef class Lexeme: cdef int length = self.vocab.vectors_length if length == 0: raise ValueError( - "Word vectors set to length 0. This may be because the " - "data is not installed. If you haven't already, run" - "\npython -m spacy download %s\n" - "to install the data." 
% self.vocab.lang - ) + "Word vectors set to length 0. This may be because you " + "don't have a model installed or loaded, or because your " + "model doesn't include word vectors. For more info, see " + "the documentation: \n%s\n" % about.__docs_models__) vector_view = self.c.vector return numpy.asarray(vector_view) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index fe8e019ec..2089199a0 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -29,6 +29,7 @@ from ..serialize.bits cimport BitArray from ..util import normalize_slice from ..syntax.iterators import CHUNKERS from ..compat import is_config +from .. import about DEF PADDING = 5 @@ -403,9 +404,8 @@ cdef class Doc: if not self.is_parsed: raise ValueError( "noun_chunks requires the dependency parse, which " - "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy download %s\n" - "to install the data" % self.vocab.lang) + "requires data to be installed. For more info, see the " + "documentation: \n%s\n" % about.__docs_models__) # Accumulate the result before beginning to iterate over it. This prevents # the tokenisation from being changed out from under us during the iteration. # The tricky thing here is that Span accepts its tokenisation changing, @@ -435,10 +435,9 @@ cdef class Doc: if not self.is_parsed: raise ValueError( - "sentence boundary detection requires the dependency parse, which " - "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy download %s\n" - "to install the data" % self.vocab.lang) + "Sentence boundary detection requires the dependency parse, which " + "requires data to be installed. For more info, see the " + "documentation: \n%s\n" % about.__docs_models__) cdef int i start = 0 for i in range(1, self.length): diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index fb1e5c732..09927ab4c 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -16,6 +16,7 @@ from ..util import normalize_slice from ..attrs cimport IS_PUNCT, IS_SPACE from ..lexeme cimport Lexeme from ..compat import is_config +from .. import about cdef class Span: @@ -221,9 +222,8 @@ cdef class Span: if not self.doc.is_parsed: raise ValueError( "noun_chunks requires the dependency parse, which " - "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy download %s\n" - "to install the data" % self.vocab.lang) + "requires data to be installed. For more info, see the " + "documentation: \n%s\n" % about.__docs_models__) # Accumulate the result before beginning to iterate over it. This prevents # the tokenisation from being changed out from under us during the iteration. # The tricky thing here is that Span accepts its tokenisation changing, diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index f146f5cd6..daef48233 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -26,6 +26,7 @@ from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_OOV from ..lexeme cimport Lexeme from ..compat import is_config +from .. import about cdef class Token: @@ -237,11 +238,10 @@ cdef class Token: cdef int length = self.vocab.vectors_length if length == 0: raise ValueError( - "Word vectors set to length 0. This may be because the " - "data is not installed. If you haven't already, run" - "\npython -m spacy download %s\n" - "to install the data." % self.vocab.lang - ) + "Word vectors set to length 0. 
This may be because you " + "don't have a model installed or loaded, or because your " + "model doesn't include word vectors. For more info, see " + "the documentation: \n%s\n" % about.__docs_models__) vector_view = self.c.lex.vector return numpy.asarray(vector_view) From e6f850f0148c84ca1bc6bd0563a989ce0400e762 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 16 May 2017 14:45:15 +0200 Subject: [PATCH 011/195] Add pip to requirements.txt and setup.py (resolves #1064) --- requirements.txt | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 42910d1be..8194dee58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ requests>=2.13.0,<3.0.0 regex==2017.4.5 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 +pip>=9.0.0,<10.0.0 diff --git a/setup.py b/setup.py index a112a6e80..89aaf8eba 100755 --- a/setup.py +++ b/setup.py @@ -197,6 +197,7 @@ def setup_package(): 'preshed>=1.0.0,<2.0.0', 'thinc>=6.5.0,<6.6.0', 'plac<1.0.0,>=0.9.6', + 'pip>=9.0.0,<10.0.0', 'six', 'pathlib', 'ujson>=1.35', From ce0658d75bda75a508fd75fac5da2443093c0623 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 17 May 2017 18:20:33 +0200 Subject: [PATCH 012/195] Add help icon --- website/assets/img/icons.svg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/website/assets/img/icons.svg b/website/assets/img/icons.svg index f62901592..e970bb52c 100644 --- a/website/assets/img/icons.svg +++ b/website/assets/img/icons.svg @@ -27,5 +27,8 @@ + + + From 8a415fc402c878900bf00068166193c681389292 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 17 May 2017 18:20:44 +0200 Subject: [PATCH 013/195] Fix light version of colours to be more explicit --- website/assets/css/_variables.sass | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/assets/css/_variables.sass b/website/assets/css/_variables.sass index 5f9453ea6..ad0739838 100644 --- a/website/assets/css/_variables.sass +++ b/website/assets/css/_variables.sass @@ -27,6 +27,7 @@ $font-code: 'Source Code Pro', Consolas, 'Andale Mono', Menlo, Monaco, Courier, // Colors $colors: ( blue: #09a3d5, red: #d9515d ) +$colors-light: (blue: #cceaf4, red: #f9d7da) $color-back: #fff !default $color-front: #1a1e23 !default @@ -34,7 +35,7 @@ $color-dark: lighten($color-front, 20) !default $color-theme: map-get($colors, $theme) $color-theme-dark: darken(map-get($colors, $theme), 5) -$color-theme-light: saturate(lighten(map-get($colors, $theme), 35), 5) +$color-theme-light: map-get($colors-light, $theme) $color-subtle: #ddd !default $color-subtle-light: #f6f6f6 !default From 841ad29f6187afcd79b0f1db20d1b1b887817f63 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 17 May 2017 18:20:53 +0200 Subject: [PATCH 014/195] Add tooltips component --- website/assets/css/_components/_tooltips.sass | 29 +++++++++++++++++++ website/assets/css/style.sass | 1 + 2 files changed, 30 insertions(+) create mode 100644 website/assets/css/_components/_tooltips.sass diff --git a/website/assets/css/_components/_tooltips.sass b/website/assets/css/_components/_tooltips.sass new file mode 100644 index 000000000..a19456aa5 --- /dev/null +++ b/website/assets/css/_components/_tooltips.sass @@ -0,0 +1,29 @@ +//- đŸ’Ģ CSS > COMPONENTS > TOOLTIPS + +[data-tooltip] + position: relative + + @include breakpoint(min, sm) + &:before + @include position(absolute, top, left, 125%, 50%) + display: inline-block + content: attr(data-tooltip) + background: $color-front + border-radius: 2px + color: $color-back + font-family: inherit + font-size: 1.3rem + line-height: 
1.25 + opacity: 0 + padding: 0.5em 0.75em + transform: translateX(-50%) translateY(-2px) + transition: opacity 0.1s ease-out, transform 0.1s ease-out + visibility: hidden + min-width: 200px + max-width: 300px + z-index: 200 + + &:hover:before + opacity: 1 + transform: translateX(-50%) translateY(0) + visibility: visible diff --git a/website/assets/css/style.sass b/website/assets/css/style.sass index a8d2edad4..259d563c3 100644 --- a/website/assets/css/style.sass +++ b/website/assets/css/style.sass @@ -32,3 +32,4 @@ $theme: blue !default @import _components/navigation @import _components/sidebar @import _components/tables +@import _components/tooltips From 8a5f1cd35abc11d4f1eca2942a239b28a6fee032 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 17 May 2017 18:21:01 +0200 Subject: [PATCH 015/195] Fix font weight on code blocks --- website/assets/css/_components/_code.sass | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/assets/css/_components/_code.sass b/website/assets/css/_components/_code.sass index 83462ef72..b2ba9c60e 100644 --- a/website/assets/css/_components/_code.sass +++ b/website/assets/css/_components/_code.sass @@ -18,7 +18,7 @@ .c-code-block__content display: block - font: normal normal 1.1rem/#{2} $font-code + font: normal 600 1.1rem/#{2} $font-code padding: 1em 2em From 22a4d19fd8365c55cf966a4a34f315ab967d548c Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 17 May 2017 18:21:13 +0200 Subject: [PATCH 016/195] Add help mixin that displays icon with tooltip --- website/_includes/_mixins.jade | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 2f89b0ec4..aeee54f52 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -47,6 +47,14 @@ mixin api(path) | #[+icon("book", 18).o-icon--inline.u-color-subtle] +//- Help icon with tooltip + tooltip - [string] Tooltip text + +mixin help(tooltip) + span(data-tooltip=tooltip)&attributes(attributes) + +icon("help", 16).i-icon--inline + + //- Aside for text label - [string] aside title (optional) From 35795c88c4dd562a48c703e7ae0150950be87684 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 17 May 2017 18:22:04 +0200 Subject: [PATCH 017/195] Add quickstart.js widget --- website/_harp.json | 27 +++++- website/_includes/_mixins-base.jade | 41 +++++++++ website/_includes/_scripts.jade | 8 +- .../assets/css/_components/_quickstart.sass | 90 +++++++++++++++++++ website/assets/css/style.sass | 1 + website/assets/js/quickstart.js | 8 ++ website/docs/usage/_data.json | 1 + website/docs/usage/index.jade | 43 +++++++-- 8 files changed, 207 insertions(+), 12 deletions(-) create mode 100644 website/assets/css/_components/_quickstart.sass create mode 100644 website/assets/js/quickstart.js diff --git a/website/_harp.json b/website/_harp.json index 672640405..ef2e48239 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -55,7 +55,32 @@ } }, - "V_CSS": "1.6", + "QUICKSTART": [ + { "id": "os", "title": "Operating system", "options": [ + { "id": "mac", "title": "macOS / OSX", "checked": true }, + { "id": "windows", "title": "Windows" }, + { "id": "linux", "title": "Linux" }] + }, + { "id": "package", "title": "Package manager", "options": [ + { "id": "pip", "title": "pip", "checked": true }, + { "id": "conda", "title": "conda" }, + { "id": "source", "title": "from source" }] + }, + { "id": "python", "title": "Python version", "options": [ + { "id": 2, "title": "2.x" }, + { "id": 3, "title": "3.x", "checked": true }] + }, + { "id": 
"config", "title": "Configuration", "multiple": true, "options": [ + {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }] + }, + { "id": "model", "title": "Models", "multiple": true, "options": [ + { "id": "en", "title": "English", "meta": "50MB" }, + { "id": "de", "title": "German", "meta": "645MB" }, + { "id": "fr", "title": "French", "meta": "1.33GB" }] + } + ], + + "V_CSS": "1.7", "V_JS": "1.2", "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 94b1bfd84..359839d67 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -121,6 +121,47 @@ mixin badge(name) img(src=site.badge alt="{name} version" height="20") +//- Quickstart widget + quickstart.js with manual markup, inspired by PyTorch's "Getting started" + groups - [object] option groups, uses global variable QUICKSTART + headline - [string] optional text to be rendered as widget headline + +mixin quickstart(groups, headline) + .c-quickstart.o-block-small#qs + .c-quickstart__content + if headline + +h(2)=headline + for group in groups + .c-quickstart__group.u-text-small(data-qs-group=group.id) + .c-quickstart__legend=group.title + .c-quickstart__fields + for option in group.options + input.c-quickstart__input(class="c-quickstart__input--" + (group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id=option.id value=option.id checked=option.checked) + label.c-quickstart__label(for=option.id)=option.title + if option.meta + | #[span.c-quickstart__label__meta (#{option.meta})] + if option.help + | #[+help(option.help).c-quickstart__label__meta] + + pre.c-code-block + code.c-code-block__content.c-quickstart__code(data-qs-results="") + block + + .c-quickstart__info.u-text-tiny.o-block.u-text-right + | Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]! + + +//- Quickstart code item + data [object] - Rendering conditions (keyed by option group ID, value: option) + +mixin qs(data) + - args = {} + for value, setting in data + - args['data-qs-' + setting] = value + span.c-quickstart__line&attributes(args) + block + + //- Logo mixin logo() diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade index 544cf0977..b31c7119e 100644 --- a/website/_includes/_scripts.jade +++ b/website/_includes/_scripts.jade @@ -1,9 +1,13 @@ //- đŸ’Ģ INCLUDES > SCRIPTS -script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript") -script(src="/assets/js/prism.js", type="text/javascript") +script(src="/assets/js/main.js?v#{V_JS}") +script(src="/assets/js/prism.js") if SECTION == "docs" + if quickstart + script(src="/assets/js/quickstart.js") + script var qs = new Quickstart("#qs"); + script. 
((window.gitter = {}).chat = {}).options = { useStyles: false, diff --git a/website/assets/css/_components/_quickstart.sass b/website/assets/css/_components/_quickstart.sass new file mode 100644 index 000000000..a3e0bff9c --- /dev/null +++ b/website/assets/css/_components/_quickstart.sass @@ -0,0 +1,90 @@ +//- đŸ’Ģ CSS > COMPONENTS > QUICKSTART + +.c-quickstart + border: 1px solid $color-subtle + border-radius: 2px + display: none + background: $color-subtle-light + + &:not([style]) + .c-quickstart__info + display: none + +.c-quickstart__content + padding: 2rem 3rem + +.c-quickstart__input + @include size(0) + opacity: 0 + position: absolute + left: -9999px + +.c-quickstart__label + cursor: pointer + background: $color-back + border: 1px solid $color-subtle + border-radius: 2px + display: inline-block + padding: 0.75rem 1.25rem + margin: 0 0.5rem 0.5rem 0 + font-weight: bold + + &:hover + background: lighten($color-theme-light, 5) + + .c-quickstart__input:focus + & + border: 1px solid $color-theme + + .c-quickstart__input--radio:checked + & + color: $color-back + border-color: $color-theme + background: $color-theme + + .c-quickstart__input--check + &:before + content: "" + background: $color-back + display: inline-block + width: 20px + height: 20px + border: 1px solid $color-subtle + vertical-align: middle + margin-right: 1rem + cursor: pointer + border-radius: 50% + + .c-quickstart__input--check:checked + &:before + background: $color-theme url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0Ij4gICAgPHBhdGggZmlsbD0iI2ZmZiIgZD0iTTkgMTYuMTcybDEwLjU5NC0xMC41OTQgMS40MDYgMS40MDYtMTIgMTItNS41NzgtNS41NzggMS40MDYtMS40MDZ6Ii8+PC9zdmc+) + background-size: contain + border-color: $color-theme + +.c-quickstart__label__meta + font-weight: normal + color: $color-subtle-dark + +.c-quickstart__group + @include breakpoint(min, md) + display: flex + flex-flow: row nowrap + + &:not(:last-child) + margin-bottom: 1rem + +.c-quickstart__fields + flex: 100% + +.c-quickstart__legend + color: $color-subtle-dark + margin-right: 2rem + padding-top: 0.75rem + flex: 1 1 35% + font-weight: bold + +.c-quickstart__line + display: block + + &:before + color: $color-theme + margin-right: 1em + content: "$" + +.c-quickstart__code + font-size: 1.6rem diff --git a/website/assets/css/style.sass b/website/assets/css/style.sass index 259d563c3..809598663 100644 --- a/website/assets/css/style.sass +++ b/website/assets/css/style.sass @@ -33,3 +33,4 @@ $theme: blue !default @import _components/sidebar @import _components/tables @import _components/tooltips +@import _components/quickstart diff --git a/website/assets/js/quickstart.js b/website/assets/js/quickstart.js new file mode 100644 index 000000000..d062aa91f --- /dev/null +++ b/website/assets/js/quickstart.js @@ -0,0 +1,8 @@ +/** + * quickstart.js + * A micro-form for user-specific installation instructions + * + * @author Ines Montani + * @version 0.0.1 + * @license MIT + */'use strict';var _createClass=function(){function a(b,c){for(var e,d=0;d['+this.dpfx+'-'+c+']'+e+' {display: none}';this._$('['+this.dpfx+'-style="'+c+'"]').textContent=g}},{key:'updateContainer',value:function updateContainer(){if(!this._$('['+this.dpfx+'-results]')){var b=this.childNodes(this.container,'pre'),c=b?b[0]:this._c('pre',this.pfx+'-code'),d=this.childNodes(c,'code')||this.childNodes(this.container,'code'),e=d?d[0]:this._c('code',this.pfx+'-results');e.setAttribute(this.dpfx+'-results','');var 
f=this.childNodes(e,'span')||this.childNodes(c,'span')||this.childNodes(this.container,'span');f&&f.forEach(function(g){return e.appendChild(g)}),c.appendChild(e),this.container.appendChild(c)}}},{key:'createGroup',value:function createGroup(b){var d=this,c=this._c('fieldset',this.pfx+'-group');c.setAttribute(this.dpfx+'-group',b.id),c.innerHTML=this.createStyles(b.id).outerHTML,c.innerHTML+=''+b.title+'',c.innerHTML+=b.options.map(function(e){var f=b.multiple?'checkbox':'radio';return''}).join(''),this.container.insertBefore(c,this.container.firstChild),this.initGroup(c,b.id)}},{key:'createStyles',value:function createStyles(b){var c=this._c('style');return c.setAttribute(this.dpfx+'-style',b),c.textContent='['+this.dpfx+'-results]>['+this.dpfx+'-'+b+'] {display: none}',c}},{key:'childNodes',value:function childNodes(b,c){var d=c.toUpperCase();if(!b.hasChildNodes)return!1;var e=[].concat(_toConsumableArray(b.childNodes)).filter(function(f){return f.nodeName===d});return!!e.length&&e}},{key:'_$',value:function _$(b){return document.querySelector(b)}},{key:'_$$',value:function _$$(b){return[].concat(_toConsumableArray(document.querySelectorAll(b)))}},{key:'_c',value:function _c(b,c){var d=document.createElement(b);return c&&(d.className=c),d}}]),a}(); diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 78e8b3e27..703a185d6 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -33,6 +33,7 @@ "index": { "title": "Install spaCy", + "quickstart": true, "next": "models" }, diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index 48fe6b783..da13f4d81 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -12,6 +12,39 @@ p | #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X] | and #[a(href="#source-windows") Windows] for details. ++quickstart(QUICKSTART, "Quickstart") + +qs({config: 'venv', python: 2}) python -m pip install -U virtualenv + +qs({config: 'venv', python: 3}) python -m pip install -U venv + +qs({config: 'venv', python: 2}) virtualenv .env + +qs({config: 'venv', python: 3}) venv .env + +qs({config: 'venv', os: 'mac'}) source .env/bin/activate + +qs({config: 'venv', os: 'linux'}) source .env/bin/activate + +qs({config: 'venv', os: 'windows'}) .env\Scripts\activate + + +qs({package: 'pip'}) pip install -U spacy + + +qs({package: 'conda'}) conda config --add channels conda-forge + +qs({package: 'conda'}) conda install spacy + + +qs({package: 'source'}) git clone https://github.com/explosion/spaCy + +qs({package: 'source'}) cd spaCy + +qs({package: 'source'}) pip install -r requirements.txt + +qs({package: 'source'}) pip install -e . + + +qs({model: 'en'}) python -m spacy download en + +qs({model: 'de'}) python -m spacy download de + +qs({model: 'fr'}) python -m spacy download fr + ++h(2, "installation") Installation instructions + ++h(3, "pip") pip + +badge("pipy") + +p Using pip, spaCy releases are currently only available as source packages. + ++code(false, "bash"). + pip install -U spacy + +aside("Download models") | After installation you need to download a language model. For more info | and available models, see the #[+a("/docs/usage/models") docs on models]. @@ -22,14 +55,6 @@ p >>> import spacy >>> nlp = spacy.load('en') -+h(2, "pip") pip - +badge("pipy") - -p Using pip, spaCy releases are currently only available as source packages. - -+code(false, "bash"). 
- pip install -U spacy - p | When using pip it is generally recommended to install packages in a | #[code virtualenv] to avoid modifying system state: @@ -39,7 +64,7 @@ p source .env/bin/activate pip install spacy -+h(2, "conda") conda ++h(3, "conda") conda +badge("conda") p From f37d078d6a840db1a47066c97d4ec01ba966a2b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 18 May 2017 09:59:38 +0200 Subject: [PATCH 018/195] Fix issue #1069 with custom hook `Doc.sents` definition --- spacy/tokens/doc.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2089199a0..cfc146e6a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -431,7 +431,9 @@ cdef class Doc: """ def __get__(self): if 'sents' in self.user_hooks: - return self.user_hooks['sents'](self) + for sent in self.user_hooks['sents'](self): + yield sent + return if not self.is_parsed: raise ValueError( From 6381ebfb14c537c5525e3a240c8d3b2bd6f3cc2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 18 May 2017 10:42:35 +0200 Subject: [PATCH 019/195] Use yield from syntax --- spacy/tokens/doc.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index cfc146e6a..ca5a3d696 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -431,8 +431,7 @@ cdef class Doc: """ def __get__(self): if 'sents' in self.user_hooks: - for sent in self.user_hooks['sents'](self): - yield sent + yield from self.user_hooks['sents'](self) return if not self.is_parsed: From c56c264510c701b4f72d2ccdc358177e4dcf6716 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 18 May 2017 13:49:43 +0200 Subject: [PATCH 020/195] Tidy up .gitignore --- .gitignore | 94 ++++++++++++++++++++++++------------------------------ 1 file changed, 42 insertions(+), 52 deletions(-) diff --git a/.gitignore b/.gitignore index 8716a8ef0..b165abf4b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,50 +1,43 @@ -# Vim -*.swp -*.sw* -Profile.prof -tmp/ -.dev -.denv -.pypyenv -.eggs -*.tgz -.sass-cache -.python-version - -MANIFEST - +# spaCy +spacy/data/ corpora/ models/ keys/ -spacy/syntax/*.cpp -spacy/syntax/*.html -spacy/en/*.cpp -spacy/tokens/*.cpp -spacy/serialize/*.cpp -spacy/en/data/* -spacy/*.cpp -spacy/ner/*.cpp -spacy/orthography/*.cpp -ext/murmurhash.cpp -ext/sparsehash.cpp +# Website +website/www/ +website/_deploy.sh +website/package.json +website/announcement.jade +website/.gitignore -/spacy/data/ - -_build/ -.env/ -tmp/ +# Cython / C extensions cythonize.json - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] - -# C extensions +spacy/*.html +*.cpp *.so -# Distribution / packaging +# Vim / VSCode / editors +*.swp +*.sw* +Profile.prof +.vscode +.sass-cache + +# Python .Python +.python-version +__pycache__/ +*.py[cod] +.env/ +.~env/ +.venv +venv/ +.dev +.denv +.pypyenv + +# Distribution / packaging env/ bin/ build/ @@ -59,6 +52,12 @@ var/ *.egg-info/ .installed.cfg *.egg +.eggs +MANIFEST + +# Temporary files +*.~* +tmp/ # Installer logs pip-log.txt @@ -87,25 +86,16 @@ coverage.xml *.log *.pot -# Windows local helper files +# Windows *.bat +Thumbs.db +Desktop.ini # Mac OS X *.DS_Store -# Temporary files / Dropbox hack -*.~* - # Komodo project files *.komodoproject -# Website -website/_deploy.sh -website/package.json -website/announcement.jade -website/www/ -website/.gitignore - -# Python virtualenv -venv -venv/* +# Other +*.tgz From 
d40b0839345dabd4cf8c1d981640c15d86ddc5dc Mon Sep 17 00:00:00 2001 From: Niko Rebenich Date: Thu, 18 May 2017 14:50:43 -0700 Subject: [PATCH 021/195] Print list comprehension Turn the generator expression into a list comprehension before printing --- website/docs/usage/language-processing-pipeline.jade | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index c372dfbf4..3ddf9179c 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -17,10 +17,10 @@ p | trying to do. +code. - import spacy # See "Installing spaCy" - nlp = spacy.load('en') # You are here. - doc = nlp(u'Hello, spacy!') # See "Using the pipeline" - print((w.text, w.pos_) for w in doc) # See "Doc, Span and Token" + import spacy # See "Installing spaCy" + nlp = spacy.load('en') # You are here. + doc = nlp(u'Hello, spacy!') # See "Using the pipeline" + print([(w.text, w.pos_) for w in doc]) # See "Doc, Span and Token" +aside("Why do we have to preload?") | Loading the models takes ~200x longer than From 7e4f31c36224d9c97539e456232a30780b3472aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 19 May 2017 21:12:41 +0200 Subject: [PATCH 022/195] Deleting (legacy?) whitespace attribute token.whitespace raises an AttributeError --- website/docs/api/token.jade | 5 ----- 1 file changed, 5 deletions(-) diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade index 7a09f9d11..de3498353 100644 --- a/website/docs/api/token.jade +++ b/website/docs/api/token.jade @@ -238,11 +238,6 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc. +cell #[code text_with_ws] +cell unicode +cell Text content, with trailing space character if present. - - +row - +cell #[code whitespace] - +cell int - +cell Trailing space character if present. +row +cell #[code whitespace_] +cell unicode From a3302873048ba6f549ff4f9aa7b2a9bfe78e26bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 19 May 2017 21:17:31 +0200 Subject: [PATCH 023/195] Add Token.orth and Token.orth_ description in doc --- website/docs/api/token.jade | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade index 7a09f9d11..ee837b54a 100644 --- a/website/docs/api/token.jade +++ b/website/docs/api/token.jade @@ -67,6 +67,16 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc. +cell unicode +cell Base form of the word, with no inflectional suffixes. + +row + +cell #[code orth] + +cell int + +cell word's string. + + +row + +cell #[code orth_] + +cell unicode + +cell word's string. 
+ +row +cell #[code lower] +cell int From af3d121ec9f8b4320521b783346a84f66adce0ce Mon Sep 17 00:00:00 2001 From: Yuval Pinter Date: Mon, 22 May 2017 10:56:03 -0400 Subject: [PATCH 024/195] extend suffixes from first to last reverse suffix list in `tokenizer_pseudo_code()` so the order of returned tokens matches input order --- website/docs/usage/customizing-tokenizer.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index d43fb438f..b1fbba652 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -113,7 +113,7 @@ p else: tokens.append(substring) substring = '' - tokens.extend(suffixes) + tokens.extend(reversed(suffixes)) return tokens p From 7f6be41f212c2a6f65612beeccf170665e0ba106 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 23 May 2017 12:18:00 +0200 Subject: [PATCH 025/195] Fix typo in English tokenizer exceptions (resolves #1071) --- spacy/en/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 3d009241b..d9aa01734 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -178,7 +178,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: EXC[orth + "ve"] = [ {ORTH: orth, LEMMA: word}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + {ORTH: "ve", LEMMA: "have", TAG: "VB"} ] EXC[orth + "'d"] = [ From 68b387ffc30b6b542c4a09cc38a2d3a89bbac77a Mon Sep 17 00:00:00 2001 From: Yuval Pinter Date: Tue, 23 May 2017 10:46:17 -0400 Subject: [PATCH 026/195] Fixed link link to Doc API documentation fixed --- website/docs/usage/deep-learning.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/deep-learning.jade b/website/docs/usage/deep-learning.jade index fec01b4ba..739cf858c 100644 --- a/website/docs/usage/deep-learning.jade +++ b/website/docs/usage/deep-learning.jade @@ -36,7 +36,7 @@ p | to #[code spacy.load()]. The function should take a | #[code spacy.language.Language] object as its only argument, and return | a sequence of callables. Each callable should accept a - | #[+api("docs") #[code Doc]] object, modify it in place, and return + | #[+api("doc") #[code Doc]] object, modify it in place, and return | #[code None]. p From cb418c7aef475607088b80d5acf083198b5ee2d4 Mon Sep 17 00:00:00 2001 From: Yuval Pinter Date: Tue, 23 May 2017 10:54:13 -0400 Subject: [PATCH 027/195] Fixed span example error Span as written gives empty text. --- website/docs/api/doc.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 72fe34f8c..adcd111a3 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -103,7 +103,7 @@ p Get a #[code Token] object. doc = nlp(u'Give it back! He pleaded.') assert doc[0].text == 'Give' assert doc[-1].text == '.' 
- span = doc[1:1] + span = doc[1:3] assert span.text == 'it back' +table(["Name", "Type", "Description"]) From ab83dd5d25498a3814a2b302f0f75d66e8b935de Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 17:53:41 +0200 Subject: [PATCH 028/195] Fix lightning tour example --- website/docs/usage/lightning-tour.jade | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 967d0c61e..138b0058d 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -148,24 +148,20 @@ p +code. def put_spans_around_tokens(doc, get_classes): - '''Given some function to compute class names, put each token in a - span element, with the appropriate classes computed. - - All whitespace is preserved, outside of the spans. (Yes, I know HTML - won't display it. But the point is no information is lost, so you can - calculate what you need, e.g.
<br /> tags, <p>
tags, etc.) - ''' + """Given some function to compute class names, put each token in a + span element, with the appropriate classes computed. All whitespace is + preserved, outside of the spans. (Of course, HTML won't display more than + one whitespace character it – but the point is, no information is lost + and you can calculate what you need, e.g. <br />, <p> etc.) + """ output = [] - template = '{word}{space}' + html = '<span class="{classes}">{word}</span>{space}' for token in doc: if token.is_space: - output.append(token.orth_) + output.append(token.text) else: - output.append( - template.format( - classes=' '.join(get_classes(token)), - word=token.orth_, - space=token.whitespace_)) + classes = ' '.join(get_classes(token)) + output.append(html.format(classes=classes, word=token.text, space=token.whitespace_)) string = ''.join(output) string = string.replace('\n', '') string = string.replace('\t', ' ') From 1e918b871cb4b7144979a223c94b1eee611e5da7 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 17:53:47 +0200 Subject: [PATCH 029/195] Remove infoboxes --- website/docs/api/_annotation/_dep-labels.jade | 5 ----- website/docs/api/_annotation/_named-entities.jade | 5 ----- website/docs/api/_annotation/_pos-tags.jade | 5 ----- 3 files changed, 15 deletions(-) diff --git a/website/docs/api/_annotation/_dep-labels.jade b/website/docs/api/_annotation/_dep-labels.jade index 9e1e89324..427b2f53a 100644 --- a/website/docs/api/_annotation/_dep-labels.jade +++ b/website/docs/api/_annotation/_dep-labels.jade @@ -1,10 +1,5 @@ //- đŸ’Ģ DOCS > API > ANNOTATION > DEPENDENCY LABELS -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of a label. For example, - | #[code spacy.explain("prt")] will return "particle". - +h(3, "dependency-parsing-english") English dependency labels p diff --git a/website/docs/api/_annotation/_named-entities.jade b/website/docs/api/_annotation/_named-entities.jade index 68b3bd17d..476659d4a 100644 --- a/website/docs/api/_annotation/_named-entities.jade +++ b/website/docs/api/_annotation/_named-entities.jade @@ -1,10 +1,5 @@ //- đŸ’Ģ DOCS > API > ANNOTATION > NAMED ENTITIES -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of an entity label. For example, - | #[code spacy.explain("LANGUAGE")] will return "any named language". - +table([ "Type", "Description" ]) +row +cell #[code PERSON] diff --git a/website/docs/api/_annotation/_pos-tags.jade b/website/docs/api/_annotation/_pos-tags.jade index d3ceef777..ea3a225bf 100644 --- a/website/docs/api/_annotation/_pos-tags.jade +++ b/website/docs/api/_annotation/_pos-tags.jade @@ -1,10 +1,5 @@ //- đŸ’Ģ DOCS > API > ANNOTATION > POS TAGS -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of a tag. For example, - | #[code spacy.explain("RB")] will return "adverb". 
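The infobox text removed above describes spacy.explain(), which maps a tag or label string to a short human-readable description. For reference, a minimal sketch of that helper in use; the outputs shown are the ones quoted in the removed infoboxes, and it assumes spaCy v1.8.3+ is installed:

    import spacy

    # spacy.explain() looks up descriptions for tag, dependency and entity label strings
    print(spacy.explain("RB"))        # 'adverb'
    print(spacy.explain("prt"))       # 'particle'
    print(spacy.explain("LANGUAGE"))  # 'any named language'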
- +h(3, "pos-tagging-english") English part-of-speech tag scheme p From a6d99f8dabeca07f339bedec3d48f0bad7b45be9 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 17:56:18 +0200 Subject: [PATCH 030/195] Add prefix to option IDs to avoid conflicts --- website/_includes/_mixins-base.jade | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 359839d67..106f8f1ca 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -136,8 +136,8 @@ mixin quickstart(groups, headline) .c-quickstart__legend=group.title .c-quickstart__fields for option in group.options - input.c-quickstart__input(class="c-quickstart__input--" + (group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id=option.id value=option.id checked=option.checked) - label.c-quickstart__label(for=option.id)=option.title + input.c-quickstart__input(class="c-quickstart__input--" + (group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked) + label.c-quickstart__label(for="qs-#{option.id}")=option.title if option.meta | #[span.c-quickstart__label__meta (#{option.meta})] if option.help From 36b20d66bfab75cfc931358688ba17c79fd1d7cc Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 18:11:49 +0200 Subject: [PATCH 031/195] Add alpha banner --- website/assets/img/graphics.svg | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/website/assets/img/graphics.svg b/website/assets/img/graphics.svg index c24473b4c..a449c3d04 100644 --- a/website/assets/img/graphics.svg +++ b/website/assets/img/graphics.svg @@ -1,5 +1,16 @@ + + spaCy v2.0.0 alpha + + + + + + + + + spaCy user survey 2017 From 5b385e7d78fd955d97b59024645d2592bdbc0949 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Fri, 2 Jun 2017 08:14:06 +0200 Subject: [PATCH 032/195] feat(spanish model): add the spanish noun chunker --- spacy/syntax/iterators.pyx | 55 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index e1c44da7f..b0d1c78ca 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from ..parts_of_speech cimport NOUN, PROPN, PRON +from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX def english_noun_chunks(obj): @@ -66,4 +66,55 @@ def german_noun_chunks(obj): yield word.left_edge.i, rbracket, np_label -CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks} +def es_noun_chunks(obj): + + doc = obj.doc + np_label = doc.vocab.strings['NP'] + + left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed'] + right_labels = ['flat', 'fixed', 'compound', 'neg'] + stop_labels = ['punct'] + + np_left_deps = [doc.vocab.strings[label] for label in left_labels] + np_right_deps = [doc.vocab.strings[label] for label in right_labels] + stop_deps = [doc.vocab.strings[label] for label in stop_labels] + + def next_token(token): + try: + return token.nbor() + except: + return None + + def noun_bounds(root): + + def is_verb_token(token): + return token.pos in [VERB, AUX] + + left_bound = root + for token in reversed(list(root.lefts)): + if token.dep in np_left_deps: + left_bound = token + + right_bound = root + for token in root.rights: + if (token.dep in np_right_deps): + left, right = 
noun_bounds(token) + + if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps, doc[left_bound.i: right.i])): + break + else: + right_bound = right + + return left_bound, right_bound + + + token = doc[0] + while token and token.i < len(doc): + if token.pos in [PROPN, NOUN, PRON]: + left, right = noun_bounds(token) + yield left.i, right.i+1, np_label + token = right + token = next_token(token) + + +CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks} From 70a21801994d7c9023f050ecfa2e3ec8a5d52d04 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Fri, 2 Jun 2017 08:19:57 +0200 Subject: [PATCH 033/195] fix(spanish sentence segmentation): remove tokenizer exceptions the break sentence segmentation. Aligned with training corpus --- spacy/es/tokenizer_exceptions.py | 33 ++------------------------------ 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py index e60bcd104..fb274f907 100644 --- a/spacy/es/tokenizer_exceptions.py +++ b/spacy/es/tokenizer_exceptions.py @@ -6,44 +6,15 @@ from ..language_data import PRON_LEMMA, DET_LEMMA TOKENIZER_EXCEPTIONS = { - "al": [ - {ORTH: "a", LEMMA: "a", TAG: ADP}, - {ORTH: "el", LEMMA: "el", TAG: DET} - ], - - "consigo": [ - {ORTH: "con", LEMMA: "con"}, - {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"} - ], - - "conmigo": [ - {ORTH: "con", LEMMA: "con"}, - {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"} - ], - - "contigo": [ - {ORTH: "con", LEMMA: "con"}, - {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"} - ], - - "del": [ - {ORTH: "de", LEMMA: "de", TAG: ADP}, - {ORTH: "l", LEMMA: "el", TAG: DET} - ], - - "pel": [ - {ORTH: "pe", LEMMA: "per", TAG: ADP}, - {ORTH: "l", LEMMA: "el", TAG: DET} - ], "pal": [ {ORTH: "pa", LEMMA: "para"}, - {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"} + {ORTH: "el", LEMMA: DET_LEMMA, NORM: "el"} ], "pala": [ {ORTH: "pa", LEMMA: "para"}, - {ORTH: "la", LEMMA: DET_LEMMA} + {ORTH: "la", LEMMA: DET_LEMMA, NORM: "la"} ], "aprox.": [ From 86277d4ef2efb80f8a30559700c9e724a1c08aff Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:13:59 +0200 Subject: [PATCH 034/195] Create appveyor.yml --- appveyor.yml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 appveyor.yml diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 000000000..2cca96974 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,9 @@ +branches: + only: + - master + - develop + +notifications: + - provider: Slack + incoming_webhook: https://hooks.slack.com/services/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 + channel: '#dev' From 0404b5f43b5821196d4a5e8b9d4d3cfcfef260b6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:18:51 +0200 Subject: [PATCH 035/195] Update appveyor.yml --- appveyor.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/appveyor.yml b/appveyor.yml index 2cca96974..30626c977 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,3 +1,5 @@ +build: off + branches: only: - master From 3c2cce8efc56ed00759c2ad10076c2f6cde42585 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:27:36 +0200 Subject: [PATCH 036/195] Update appveyor.yml --- appveyor.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 30626c977..d1c70a166 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -4,8 +4,3 @@ branches: only: - master - develop - -notifications: - - provider: Slack - incoming_webhook: 
https://hooks.slack.com/services/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 - channel: '#dev' From 3e16535fef0c6025ad937261e9eb3276df3d1e60 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:31:31 +0200 Subject: [PATCH 037/195] Update appveyor.yml --- appveyor.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index d1c70a166..4dd7b0a31 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,6 +1 @@ build: off - -branches: - only: - - master - - develop From af466496f1f659236eee60e7aea22c3d7a0f4440 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:33:57 +0200 Subject: [PATCH 038/195] Rename appveyor.yml to .appveyor.yml --- appveyor.yml => .appveyor.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename appveyor.yml => .appveyor.yml (100%) diff --git a/appveyor.yml b/.appveyor.yml similarity index 100% rename from appveyor.yml rename to .appveyor.yml From c4e62c76519f534ff83d67f30275f164762ebd1e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:39:44 +0200 Subject: [PATCH 039/195] Update README.rst --- README.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 24b0c232a..4a34770e8 100644 --- a/README.rst +++ b/README.rst @@ -15,7 +15,11 @@ MIT license. .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square :target: https://travis-ci.org/explosion/spaCy - :alt: Build Status + :alt: Travis Build Status + +.. image:: https://img.shields.io/appveyor/ci/explosion/spacy.svg?style=flat-square + :target: https://ci.appveyor.com/project/explosion/spacy + :alt: Appveyor Build Status .. image:: https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square :target: https://github.com/explosion/spaCy/releases From 83467a00a76ace3b1e858374bbcd5081555447ee Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 12:42:22 +0200 Subject: [PATCH 040/195] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 4a34770e8..76bd333d8 100644 --- a/README.rst +++ b/README.rst @@ -17,7 +17,7 @@ MIT license. :target: https://travis-ci.org/explosion/spaCy :alt: Travis Build Status -.. image:: https://img.shields.io/appveyor/ci/explosion/spacy.svg?style=flat-square +.. 
image:: https://img.shields.io/appveyor/ci/explosion/spacy/master.svg?style=flat-square :target: https://ci.appveyor.com/project/explosion/spacy :alt: Appveyor Build Status From e7ef51b3828f9acf0dc815c2d5eeddca5eda3d28 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Jun 2017 19:00:01 +0200 Subject: [PATCH 041/195] Update tokenizer_exceptions.py --- spacy/es/tokenizer_exceptions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py index fb274f907..f9c955338 100644 --- a/spacy/es/tokenizer_exceptions.py +++ b/spacy/es/tokenizer_exceptions.py @@ -6,10 +6,9 @@ from ..language_data import PRON_LEMMA, DET_LEMMA TOKENIZER_EXCEPTIONS = { - "pal": [ {ORTH: "pa", LEMMA: "para"}, - {ORTH: "el", LEMMA: DET_LEMMA, NORM: "el"} + {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"} ], "pala": [ From e66cd9cc70f40288aef24b241d22fadbd5c7bc59 Mon Sep 17 00:00:00 2001 From: Pascal van Kooten Date: Mon, 5 Jun 2017 20:41:28 +0200 Subject: [PATCH 042/195] for easy copy & paste --- website/docs/usage/rule-based-matching.jade | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index aea943a61..db7c70608 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -19,11 +19,11 @@ p Here's a minimal example. We first add a pattern that specifies three tokens: p | Once we've added the pattern, we can use the #[code matcher] as a | callable, to receive a list of #[code (ent_id, start, end)] tuples. - | Note that #[code LOWER] and #[code IS_PUNCT] are data attributes - | of #[code spacy.attrs]. +code. from spacy.matcher import Matcher + from spacy.attrs import IS_PUNCT, LOWER + matcher = Matcher(nlp.vocab) matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}]) From 4cbe55622d921569943147a6a0eb1ab3fb489686 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Jun 2017 21:32:36 +0200 Subject: [PATCH 043/195] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 76bd333d8..3eeca36bc 100644 --- a/README.rst +++ b/README.rst @@ -9,7 +9,7 @@ Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew, Chinese and Japanese. It's commercial open-source software, released under the MIT license. -📊 **Help us improve the library!** `Take the spaCy user survey `_. +⭐ī¸ **Test spaCy v2.0.0 alpha and the new models!** `Read the release notes here. `_ đŸ’Ģ **Version 1.8 out now!** `Read the release notes here. 
`_ From 99d02b2bb626c80ef53d1d03e226aee322b6fa57 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 6 Jun 2017 03:20:20 +0200 Subject: [PATCH 044/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index b64dc8db3..ea6096a52 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -16,6 +16,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Daniel Vila Suero, [@dvsrepo](https://github.com/dvsrepo) * Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi) * Eric Zhao, [@ericzhao28](https://github.com/ericzhao28) +* Francisco Aranda, [@frascuchon](https://github.com/frascuchon) * Greg Baker, [@solresol](https://github.com/solresol) * GrÊgory Howard, [@Gregory-Howard](https://github.com/Gregory-Howard) * GyÃļrgy Orosz, [@oroszgy](https://github.com/oroszgy) From 6071c727d263d97c9251b076858d24de30da5f78 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 6 Jun 2017 12:49:17 +0200 Subject: [PATCH 045/195] Add more env options to gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index b165abf4b..2209f5b4a 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,8 @@ Profile.prof __pycache__/ *.py[cod] .env/ +.env2/ +.env3/ .~env/ .venv venv/ From 6ef04afdc89eab35e6351c90b8be2c56c025324b Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 6 Jun 2017 12:49:25 +0200 Subject: [PATCH 046/195] Update docs with Spanish model --- website/_harp.json | 7 ++++--- website/docs/usage/_models-list.jade | 1 + website/docs/usage/index.jade | 1 + website/index.jade | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/website/_harp.json b/website/_harp.json index ef2e48239..cb476541a 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -14,8 +14,8 @@ "SPACY_VERSION": "1.8", "LATEST_NEWS": { - "url": "https://survey.spacy.io/", - "title": "Take the spaCy user survey and help us improve the library!" + "url": "/docs/usage/models", + "title": "The first official Spanish model is here!" 
}, "SOCIAL": { @@ -76,7 +76,8 @@ { "id": "model", "title": "Models", "multiple": true, "options": [ { "id": "en", "title": "English", "meta": "50MB" }, { "id": "de", "title": "German", "meta": "645MB" }, - { "id": "fr", "title": "French", "meta": "1.33GB" }] + { "id": "fr", "title": "French", "meta": "1.33GB" }, + { "id": "es", "title": "Spanish", "meta": "377MB"}] } ], diff --git a/website/docs/usage/_models-list.jade b/website/docs/usage/_models-list.jade index 942de28c4..36de137e5 100644 --- a/website/docs/usage/_models-list.jade +++ b/website/docs/usage/_models-list.jade @@ -25,3 +25,4 @@ p +model-row("en_vectors_glove_md", "English", [1, 0, 0, 1], "727 MB", "CC BY-SA") +model-row("de_core_news_md", "German", [1, 1, 1, 1], "645 MB", "CC BY-SA", true, true) +model-row("fr_depvec_web_lg", "French", [1, 1, 0, 1], "1.33 GB", "CC BY-NC", true, true) + +model-row("es_core_web_md", "Spanish", [1, 1, 1, 1], "377 MB", "CC BY-SA", true, true) diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index da13f4d81..9ad2fde5f 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -34,6 +34,7 @@ p +qs({model: 'en'}) python -m spacy download en +qs({model: 'de'}) python -m spacy download de +qs({model: 'fr'}) python -m spacy download fr + +qs({model: 'es'}) python -m spacy download es +h(2, "installation") Installation instructions diff --git a/website/index.jade b/website/index.jade index 17b564b42..df5428316 100644 --- a/website/index.jade +++ b/website/index.jade @@ -11,7 +11,7 @@ include _includes/_mixins h2.c-landing__title.o-block.u-heading-1 | in Python - +landing-badge("https://survey.spacy.io", "usersurvey", "Take the user survey!") + +landing-badge(gh("spaCy") + "/releases/tag/v2.0.0-alpha", "v2alpha", "Try spaCy v2.0.0 alpha!") +grid.o-content +grid-col("third").o-card From 3cceabbf32df2897eff6d694da9284e87dbea735 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 6 Jun 2017 14:39:54 +0200 Subject: [PATCH 047/195] Update README.rst --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 3eeca36bc..0f3efc146 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ spaCy: Industrial-strength NLP spaCy is a library for advanced natural language processing in Python and Cython. spaCy is built on the very latest research, but it isn't researchware. It was designed from day one to be used in real products. spaCy currently supports -English, German and French, as well as tokenization for Spanish, Italian, +English, German, French and Spanish, as well as tokenization for Italian, Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew, Chinese and Japanese. It's commercial open-source software, released under the MIT license. 
@@ -89,7 +89,7 @@ Features * GIL-free **multi-threading** * Efficient binary serialization * Easy **deep learning** integration -* Statistical models for **English** and **German** +* Statistical models for **English**, **German**, **French** and **Spanish** * State-of-the-art speed * Robust, rigorously evaluated accuracy From fa26041da62321d6f34306ce4f287cc938f27b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Orosz?= Date: Wed, 7 Jun 2017 16:19:08 +0200 Subject: [PATCH 048/195] Fixed typo in cli/package.py --- spacy/cli/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 102b07472..74086613a 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -47,7 +47,7 @@ def package(input_dir, output_dir, meta_path, force): def check_dirs(input_path, output_path, meta_path): if not input_path.exists(): - util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found") + util.sys_exit(unicode_(input_path.as_posix()), title="Model directory not found") if not output_path.exists(): util.sys_exit(unicode_(output_path), title="Output directory not found") if meta_path and not meta_path.exists(): From e55199d454c6490775cc3403793768da88b0d2dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 9 May 2017 22:50:50 +0200 Subject: [PATCH 049/195] Implementation of Pattern --- spacy/compat.py | 6 + spacy/pattern/__init__.py | 4 + spacy/pattern/parser.py | 364 ++++++++++++++++++++++++++++++++++++++ spacy/pattern/pattern.py | 312 ++++++++++++++++++++++++++++++++ 4 files changed, 686 insertions(+) create mode 100644 spacy/pattern/__init__.py create mode 100644 spacy/pattern/parser.py create mode 100644 spacy/pattern/pattern.py diff --git a/spacy/compat.py b/spacy/compat.py index 1ca8a59fe..8d962976b 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -16,6 +16,10 @@ try: except ImportError: import copyreg as copy_reg +try: + import Queue as queue +except ImportError: + import queue is_python2 = six.PY2 is_python3 = six.PY3 @@ -32,6 +36,7 @@ if is_python2: basestring_ = basestring input_ = raw_input json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8') + intern = intern elif is_python3: bytes_ = bytes @@ -39,6 +44,7 @@ elif is_python3: basestring_ = str input_ = input json_dumps = lambda data: ujson.dumps(data, indent=2) + intern = sys.intern def symlink_to(orig, dest): diff --git a/spacy/pattern/__init__.py b/spacy/pattern/__init__.py new file mode 100644 index 000000000..325ba04ea --- /dev/null +++ b/spacy/pattern/__init__.py @@ -0,0 +1,4 @@ +# coding: utf-8 + +from .pattern import DependencyTree +from .parser import PatternParser diff --git a/spacy/pattern/parser.py b/spacy/pattern/parser.py new file mode 100644 index 000000000..9ebb9bd5c --- /dev/null +++ b/spacy/pattern/parser.py @@ -0,0 +1,364 @@ +# coding: utf-8 + +from spacy.compat import intern, queue +from operator import itemgetter +import re +from hashlib import md5 +import json + +from .pattern import DependencyPattern + +TOKEN_INITIAL = intern('initial') + + +class PatternParser(object): + """Compile a Pattern query into a :class:`Pattern`, that can be used to + match :class:`DependencyTree`s.""" + whitespace_re = re.compile(r'\s+', re.U) + newline_re = re.compile(r'(\r\n|\r|\n)') + name_re = re.compile(r'\w+', re.U) + + TOKEN_BLOCK_BEGIN = '[' + TOKEN_BLOCK_END = ']' + EDGE_BLOCK_BEGIN = '>' + WHITESPACE = ' ' + + @classmethod + def parse(cls, query): + """Parse the given `query`, and compile 
it into a :class:`Pattern`.""" + pattern = DependencyPattern() + + for lineno, token_stream in enumerate(cls.tokenize(query)): + try: + cls._parse_line(token_stream, pattern, lineno+1) + except StopIteration: + raise SyntaxError("A token is missing, please check your " + "query.") + + if not pattern.nodes: + return + + return pattern + + @classmethod + def _parse_line(cls, stream, pattern, lineno): + while not stream.closed: + token = stream.current + + if token.type == 'name': + next_token = stream.look() + + if next_token.type == 'node': + cls.parse_node_def(stream, pattern) + + elif next_token.type == 'edge': + cls.parse_edge_def(stream, pattern) + + else: + raise SyntaxError("line %d: A 'node' or 'edge' token must " + "follow a 'name' token." % lineno) + + elif token.type == 'node': + next_token = stream.look() + + if next_token.type == 'edge': + cls.parse_edge_def(stream, pattern) + else: + raise SyntaxError("line %d: an 'edge' token is " + "expected." % lineno) + + if not stream.closed: + next(stream) + + @classmethod + def parse_node_def(cls, stream, pattern): + name_token = stream.current + next(stream) + node_token = stream.current + cls.add_node(node_token, pattern, name_token) + + @classmethod + def add_node(cls, node_token, pattern, name_token=None): + token_name = None + if name_token is not None: + token_id = name_token.value + token_name = name_token.value + else: + token_id = node_token.hash() + + if token_id in pattern.nodes: + raise SyntaxError("Token with ID '{}' already registered.".format( + token_id)) + + token_attr = cls.parse_node_attributes(node_token.value) + token_attr['_name'] = token_name + pattern.add_node(token_id, token_attr) + + @classmethod + def parse_edge_def(cls, stream, pattern): + token = stream.current + + if token.type == 'name': + token_id = token.value + if token_id not in pattern.nodes: + raise SyntaxError("Token '{}' with ID '{}' is not " + "defined.".format(token, token_id)) + + elif token.type == 'node': + token_id = token.hash() + cls.add_node(token, pattern) + + next(stream) + edge_attr = cls.parse_edge_attributes(stream.current.value) + next(stream) + + head_token = stream.current + if head_token.type == 'name': + head_token_id = head_token.value + if head_token_id not in pattern.nodes: + raise SyntaxError("Token '{}' with ID '{}' is not " + "defined.".format(head_token, head_token_id)) + elif head_token.type == 'node': + head_token_id = head_token.hash() + cls.add_node(head_token, pattern) + else: + raise SyntaxError("A 'node' or 'name' token was expected.") + + # inverse the dependency to have an actual tree + pattern.add_edge(head_token_id, token_id, edge_attr) + + @classmethod + def parse_node_attributes(cls, string): + string = string[1:] # remove the trailing '[' + end_delimiter_idx = string.find(']') + + attr_str = string[:end_delimiter_idx] + attr = {} + + try: + attr = json.loads(attr_str) + except json.JSONDecodeError: + for pair in attr_str.split(","): + key, value = pair.split(':') + attr[key] = value + + for key, value in attr.items(): + attr[key] = cls.compile_expression(value) + + alias = string[end_delimiter_idx+2:] + + if alias: + attr['_alias'] = alias + + return attr + + @classmethod + def parse_edge_attributes(cls, string): + string = string[1:] # remove the trailing '>' + + if not string: + return None + + return cls.compile_expression(string) + + @staticmethod + def compile_expression(expr): + if expr.startswith('/') and expr.endswith('/'): + string = expr[1:-1] + return re.compile(string, re.U) + + return expr + + 
@classmethod + def tokenize(cls, text): + lines = text.splitlines() + + for lineno, line in enumerate(lines): + yield TokenStream(cls._tokenize_line(line, lineno+1)) + + @classmethod + def _tokenize_line(cls, line, lineno): + reader = Reader(line) + + while reader.remaining(): + char = reader.next() + + if char == cls.TOKEN_BLOCK_BEGIN: + token = 'node' + idx = reader.find(cls.TOKEN_BLOCK_END) + + if idx == -1: + raise SyntaxError("A token block end ']' was expected.") + + idx += 1 + if len(reader) > idx and reader[idx] == '=': + # The node has a name + idx = reader.find(cls.WHITESPACE, start=idx) + + if idx == -1: + idx = reader.remaining() + + elif char == cls.EDGE_BLOCK_BEGIN: + token = 'edge' + idx = reader.find(cls.WHITESPACE) + + elif cls.name_re.match(char): + token = 'name' + idx = reader.find(cls.WHITESPACE) + + if idx == -1: + whole_name_match = cls.name_re.match(str(reader)) + idx = whole_name_match.end() + + elif cls.newline_re.match(char) or cls.whitespace_re.match(char): + # skip the whitespace + reader.consume() + continue + + else: + raise SyntaxError("Unrecognized token BEGIN char: '{" + "}'".format(char)) + + if idx == -1: + raise SyntaxError("Ending character of token '{}' not " + "found.".format(token)) + value = reader.consume(idx) + + yield Token(lineno, token, value) + + +class Token(tuple): + """Token class.""" + __slots__ = () + lineno, type, value = (property(itemgetter(x)) for x in range(3)) + + def __new__(cls, lineno, type, value): + return tuple.__new__(cls, (lineno, intern(str(type)), value)) + + def hash(self): + string = str(self.value) + return md5(string.encode('utf-8')).hexdigest() + + def __repr__(self): + return 'Token(%r, %r, %r)' % ( + self.lineno, + self.type, + self.value) + + +class Reader(object): + """A class used by the :class:`PatternParser` to tokenize the `text`.""" + __slots__ = ('text', 'pos') + + def __init__(self, text): + self.text = text + self.pos = 0 + + def find(self, needle, start=0, end=None): + pos = self.pos + start += pos + if end is None: + index = self.text.find(needle, start) + else: + end += pos + index = self.text.find(needle, start, end) + if index != -1: + index -= pos + return index + + def consume(self, count=1): + new_pos = self.pos + count + s = self.text[self.pos:new_pos] + self.pos = new_pos + return s + + def next(self): + return self.text[self.pos:self.pos+1] + + def remaining(self): + return len(self.text) - self.pos + + def __len__(self): + return self.remaining() + + def __getitem__(self, key): + if key < 0: + return self.text[key] + else: + return self.text[self.pos + key] + + def __str__(self): + return self.text[self.pos:] + + +class TokenStreamIterator(object): + """The iterator for tokenstreams. Iterate over the stream until the + stream is empty. + """ + + def __init__(self, stream): + self.stream = stream + + def __iter__(self): + return self + + def __next__(self): + token = self.stream.current + try: + next(self.stream) + except StopIteration: + self.stream.close() + raise StopIteration() + + return token + + +class TokenStream(object): + """A token stream is an iterable that yields :class:`Token`s. The + current active token is stored as :attr:`current`. 
+ """ + + def __init__(self, generator): + self._iter = iter(generator) + self._pushed = queue.deque() + self.closed = False + self.current = Token(1, TOKEN_INITIAL, '') + next(self) + + def __iter__(self): + return TokenStreamIterator(self) + + def __bool__(self): + return bool(self._pushed) + __nonzero__ = __bool__ # py2 + + def push(self, token): + """Push a token back to the stream.""" + self._pushed.append(token) + + def look(self): + """Look at the next token.""" + old_token = next(self) + result = self.current + self.push(result) + self.current = old_token + return result + + def __next__(self): + """Go one token ahead and return the old one.""" + rv = self.current + if self._pushed: + self.current = self._pushed.popleft() + else: + if self.closed: + raise StopIteration("No token left.") + try: + self.current = next(self._iter) + except StopIteration: + self.close() + return rv + + def close(self): + """Close the stream.""" + self._iter = None + self.closed = True diff --git a/spacy/pattern/pattern.py b/spacy/pattern/pattern.py new file mode 100644 index 000000000..42a15f769 --- /dev/null +++ b/spacy/pattern/pattern.py @@ -0,0 +1,312 @@ +# coding: utf-8 + +import logging +from collections import defaultdict + + +logger = logging.getLogger(__name__) + + +class Tree(object): + def __init__(self): + self.adjacency = defaultdict(dict) + self.nodes = {} + + def __getitem__(self, item): + return self.nodes[item] + + def number_of_nodes(self): + return len(self) + + def __len__(self): + return len(self.nodes) + + def number_of_edges(self): + return sum(len(adj_dict) for adj_dict in self.adjacency.values()) + + def edges_iter(self, origin=None, data=True): + nbunch = (self.adjacency.items() if origin is None + else [(origin, self.adjacency[origin])]) + + for u, nodes in nbunch: + for v, dep in nodes.items(): + if data: + yield (u, v, dep) + else: + yield (u, v) + + def nodes_iter(self): + for node in self.nodes.keys(): + yield node + + def is_connected(self): + if len(self) == 0: + raise ValueError('Connectivity is undefined for the null graph.') + return len(set(self._plain_bfs(next(self.nodes_iter()), + undirected=True))) == len(self) + + def _plain_bfs(self, source, undirected=False): + """A fast BFS node generator. 
+ :param: source: the source node + """ + seen = set() + next_level = {source} + while next_level: + this_level = next_level + next_level = set() + for v in this_level: + if v not in seen: + yield v + seen.add(v) + next_level.update(self.adjacency[v].keys()) + + if undirected: + for n, adj in self.adjacency.items(): + if v in adj.keys(): + next_level.add(n) + + +class DependencyPattern(Tree): + def add_node(self, node, attr_dict=None): + attr_dict = attr_dict or {} + self.nodes[node] = attr_dict + + def add_edge(self, u, v, dep=None): + if u not in self.nodes or v not in self.nodes: + raise ValueError("Each node must be defined before adding an edge.") + + self.adjacency[u][v] = dep + + @property + def root_node(self): + if self.number_of_nodes() == 1: + # if the graph has a single node, it is the root + return next(iter(self.nodes.keys())) + + if not self.is_connected(): + return None + + in_node = set() + out_node = set() + for u, v in self.edges_iter(data=False): + in_node.add(v) + out_node.add(u) + + try: + return list(out_node.difference(in_node))[0] + except IndexError: + return None + + +class DependencyTree(Tree): + def __init__(self, doc): + super(DependencyTree, self).__init__() + + for token in doc: + self.nodes[token.i] = token + # inverse the dependency to have an actual tree + self.adjacency[token.head.i][token.i] = token.dep_ + + def __getitem__(self, item): + return self.nodes[item] + + def match_nodes(self, attr_dict, **kwargs): + results = [] + for token_idx, token in self.nodes.items(): + if match_token(token, attr_dict, **kwargs): + results.append(token_idx) + + return results + + def match(self, pattern): + """Return a list of matches between the given + :class:`DependencyPattern` and `self` if any, or None. + + :param pattern: a :class:`DependencyPattern` + """ + pattern_root_node = pattern.root_node + pattern_root_node_attr = pattern[pattern_root_node] + dep_root_nodes = self.match_nodes(pattern_root_node_attr) + + matches = [] + for candidate_root_node in dep_root_nodes: + match_list = subtree_in_graph(candidate_root_node, self, + pattern_root_node, pattern) + for mapping in match_list: + match = PatternMatch(mapping, pattern, self) + matches.append(match) + + return matches + + +class PatternMatch(object): + def __init__(self, mapping, pattern, tree): + for pattern_node_id, tree_node_id in mapping.items(): + mapping[pattern_node_id] = tree[tree_node_id] + self.mapping = mapping + self.pattern = pattern + self.tree = tree + + self.alias_map = {} + for pattern_node_id in self.mapping: + pattern_node = self.pattern[pattern_node_id] + + alias = pattern_node.get('_alias') + if alias: + self.alias_map[alias] = self.mapping[pattern_node_id] + + def __repr__(self): + return "".format(len(self.mapping)) + + def __getitem__(self, item): + return self.alias_map[item] + + +def subtree_in_graph(dep_tree_node, dep_tree, pattern_node, pattern): + """Return a list of matches of `pattern` as a subtree of `dep_tree`. 
+ :param dep_tree_node: the token (identified by its index) to start from + (int) + :param dep_tree: a :class:`DependencyTree` + :param pattern_node: the pattern node to start from + :param pattern: a :class:`DependencyPattern` + :return: found matches (list) + """ + results = [] + association_dict = {pattern_node: dep_tree_node} + _subtree_in_graph(dep_tree_node, dep_tree, pattern_node, + pattern, results=results, + association_dict=association_dict) + results = results or [] + return results + + +def _subtree_in_graph(dep_tree_node, dep_tree, pattern_node, pattern, + association_dict=None, results=None): + token = dep_tree[dep_tree_node] + logger.debug("Starting from token '{}'".format(token.orth_)) + + adjacent_edges = list(pattern.edges_iter(origin=pattern_node)) + if adjacent_edges: + for (_, adjacent_pattern_node, + dep) in adjacent_edges: + adjacent_pattern_node_attr = pattern[adjacent_pattern_node] + logger.debug("Exploring relation {} -[{}]-> {} from " + "pattern".format(pattern[pattern_node], + dep, + adjacent_pattern_node_attr)) + + adjacent_nodes = find_adjacent_nodes(dep_tree, + dep_tree_node, + dep, + adjacent_pattern_node_attr) + + if not adjacent_nodes: + logger.debug("No adjacent nodes in dep_tree satisfying these " + "conditions.") + return None + + for adjacent_node in adjacent_nodes: + logger.debug("Found adjacent node '{}' in " + "dep_tree".format(dep_tree[adjacent_node].orth_)) + association_dict[adjacent_pattern_node] = adjacent_node + recursive_return = _subtree_in_graph(adjacent_node, + dep_tree, + adjacent_pattern_node, + pattern, + association_dict, + results=results) + + if recursive_return is None: + # No Match + return None + + association_dict, results = recursive_return + + else: + if len(association_dict) == pattern.number_of_nodes(): + logger.debug("Add to results: {}".format(association_dict)) + results.append(dict(association_dict)) + + else: + logger.debug("{} nodes in subgraph, only {} " + "mapped".format(pattern.number_of_nodes(), + len(association_dict))) + + logger.debug("Return intermediate: {}".format(association_dict)) + return association_dict, results + + +def find_adjacent_nodes(dep_tree, node, target_dep, node_attributes): + """Find nodes adjacent to ``node`` that fulfill specified attributes + values on edge and node. 
+ + :param dep_tree: a :class:`DependencyTree` + :param node: initial node to search from + :param target_dep: edge attributes that must be fulfilled (pair-value) + :type target_dep: dict + :param node_attributes: node attributes that must be fulfilled (pair-value) + :type node_attributes: dict + :return: adjacent nodes that fulfill the given criteria (list) + """ + results = [] + for _, adj_node, adj_dep in dep_tree.edges_iter(origin=node): + adj_token = dep_tree[adj_node] + if (match_edge(adj_dep, target_dep) + and match_token(adj_token, node_attributes)): + results.append(adj_node) + + return results + + +def match_edge(token_dep, target_dep): + if target_dep is None: + return True + + if hasattr(target_dep, 'match'): + return target_dep.match(token_dep) is not None + + if token_dep == target_dep: + return True + + return False + + +def match_token(token, + target_attributes, + ignore_special_key=True, + lower=True): + bind_map = { + 'word': lambda t: t.orth_, + 'lemma': lambda t: t.lemma_, + } + + if lower: + bind_map = {key: lambda t: func(t).lower() for key, func in + bind_map.items()} + + for target_key, target_value in target_attributes.items(): + is_special_key = target_key[0] == '_' + + if ignore_special_key and is_special_key: + continue + + if lower and hasattr(target_value, 'lower'): + target_value = target_value.lower() + + if target_key in bind_map: + token_attr = bind_map[target_key](token) + + if hasattr(target_value, 'match'): # if it is a compiled regex + if target_value.match(token_attr) is None: + break + else: + if not token_attr == target_value: + break + + else: + raise ValueError("Unknown key: '{}'".format(target_key)) + + else: # the loop was not broken + return True + + return False From 8ff4f512a25f2ad9607d5491f054d07e90128c0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:28:36 +0200 Subject: [PATCH 050/195] Check in PatternParser that the generated Pattern is valid --- spacy/pattern/parser.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/spacy/pattern/parser.py b/spacy/pattern/parser.py index 9ebb9bd5c..a36446a1a 100644 --- a/spacy/pattern/parser.py +++ b/spacy/pattern/parser.py @@ -38,8 +38,18 @@ class PatternParser(object): if not pattern.nodes: return + cls.check_pattern(pattern) return pattern + @staticmethod + def check_pattern(pattern): + if not pattern.is_connected(): + raise ValueError("The pattern tree must be a fully connected " + "graph.") + + if pattern.root_node is None: + raise ValueError("The root node of the tree could not be found.") + @classmethod def _parse_line(cls, stream, pattern, lineno): while not stream.closed: From d9c567371f9ae85cb75b86ecd90bae5b00544905 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:29:28 +0200 Subject: [PATCH 051/195] Move add_node and add_edge methods to the Tree base class --- spacy/pattern/pattern.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/spacy/pattern/pattern.py b/spacy/pattern/pattern.py index 42a15f769..de7a54e05 100644 --- a/spacy/pattern/pattern.py +++ b/spacy/pattern/pattern.py @@ -15,6 +15,16 @@ class Tree(object): def __getitem__(self, item): return self.nodes[item] + def add_node(self, node, attr_dict=None): + attr_dict = attr_dict or {} + self.nodes[node] = attr_dict + + def add_edge(self, u, v, dep=None): + if u not in self.nodes or v not in self.nodes: + raise ValueError("Each node must be defined before adding an edge.") + + 
self.adjacency[u][v] = dep + def number_of_nodes(self): return len(self) @@ -67,16 +77,6 @@ class Tree(object): class DependencyPattern(Tree): - def add_node(self, node, attr_dict=None): - attr_dict = attr_dict or {} - self.nodes[node] = attr_dict - - def add_edge(self, u, v, dep=None): - if u not in self.nodes or v not in self.nodes: - raise ValueError("Each node must be defined before adding an edge.") - - self.adjacency[u][v] = dep - @property def root_node(self): if self.number_of_nodes() == 1: From 4ca8a396a2934ddef3b7c71bab932f6bbe649759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:30:01 +0200 Subject: [PATCH 052/195] Do not add the root token to the adjacency map --- spacy/pattern/pattern.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/pattern/pattern.py b/spacy/pattern/pattern.py index de7a54e05..f21edf5a6 100644 --- a/spacy/pattern/pattern.py +++ b/spacy/pattern/pattern.py @@ -104,8 +104,10 @@ class DependencyTree(Tree): for token in doc: self.nodes[token.i] = token - # inverse the dependency to have an actual tree - self.adjacency[token.head.i][token.i] = token.dep_ + + if token.head.i != token.i: + # inverse the dependency to have an actual tree + self.adjacency[token.head.i][token.i] = token.dep_ def __getitem__(self, item): return self.nodes[item] From d010f5a123e724a168e2f30a3a6c903f7c0443d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:30:28 +0200 Subject: [PATCH 053/195] Fix node matching bug caused by lower function --- spacy/pattern/pattern.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/pattern/pattern.py b/spacy/pattern/pattern.py index f21edf5a6..d47022fec 100644 --- a/spacy/pattern/pattern.py +++ b/spacy/pattern/pattern.py @@ -282,10 +282,6 @@ def match_token(token, 'lemma': lambda t: t.lemma_, } - if lower: - bind_map = {key: lambda t: func(t).lower() for key, func in - bind_map.items()} - for target_key, target_value in target_attributes.items(): is_special_key = target_key[0] == '_' @@ -298,6 +294,9 @@ def match_token(token, if target_key in bind_map: token_attr = bind_map[target_key](token) + if lower: + token_attr = token_attr.lower() + if hasattr(target_value, 'match'): # if it is a compiled regex if target_value.match(token_attr) is None: break From 4289a21703d97e72c3dc81105c897efe69a45b62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:30:53 +0200 Subject: [PATCH 054/195] Add 'ent' to node matching key --- spacy/pattern/pattern.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/pattern/pattern.py b/spacy/pattern/pattern.py index d47022fec..282cea0e3 100644 --- a/spacy/pattern/pattern.py +++ b/spacy/pattern/pattern.py @@ -280,6 +280,7 @@ def match_token(token, bind_map = { 'word': lambda t: t.orth_, 'lemma': lambda t: t.lemma_, + 'ent': lambda t: t.ent_type_, } for target_key, target_value in target_attributes.items(): From 1849a110e3abf9938f92563213f6583ab459931b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:31:19 +0200 Subject: [PATCH 055/195] Improve logging --- spacy/pattern/pattern.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/pattern/pattern.py b/spacy/pattern/pattern.py index 282cea0e3..552283066 100644 --- a/spacy/pattern/pattern.py +++ b/spacy/pattern/pattern.py @@ -130,6 +130,10 @@ class DependencyTree(Tree): pattern_root_node_attr = 
pattern[pattern_root_node] dep_root_nodes = self.match_nodes(pattern_root_node_attr) + if not dep_root_nodes: + logger.debug("No node matches the pattern root " + "'{}'".format(pattern_root_node_attr)) + matches = [] for candidate_root_node in dep_root_nodes: match_list = subtree_in_graph(candidate_root_node, self, From 46637369aaffc0ba0e62cec675289b8275d149c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 11 Jun 2017 18:34:38 +0200 Subject: [PATCH 056/195] Add basic unit tests for Pattern --- spacy/tests/pattern/__init__.py | 1 + spacy/tests/pattern/parser.py | 68 +++++++++++++++++++++++++++++++++ spacy/tests/pattern/pattern.py | 61 +++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+) create mode 100644 spacy/tests/pattern/__init__.py create mode 100644 spacy/tests/pattern/parser.py create mode 100644 spacy/tests/pattern/pattern.py diff --git a/spacy/tests/pattern/__init__.py b/spacy/tests/pattern/__init__.py new file mode 100644 index 000000000..57d631c3f --- /dev/null +++ b/spacy/tests/pattern/__init__.py @@ -0,0 +1 @@ +# coding: utf-8 diff --git a/spacy/tests/pattern/parser.py b/spacy/tests/pattern/parser.py new file mode 100644 index 000000000..a56bda20a --- /dev/null +++ b/spacy/tests/pattern/parser.py @@ -0,0 +1,68 @@ +# coding: utf-8 + + +import re +from ...pattern.parser import PatternParser + + +class TestPatternParser: + def test_empty_query(self): + assert PatternParser.parse('') is None + assert PatternParser.parse(' ') is None + + def test_define_node(self): + query = "fox [lemma:fox,word:fox]=alias" + pattern = PatternParser.parse(query) + + assert pattern is not None + assert pattern.number_of_nodes() == 1 + assert pattern.number_of_edges() == 0 + + assert 'fox' in pattern.nodes + + attrs = pattern['fox'] + assert attrs.get('lemma') == 'fox' + assert attrs.get('word') == 'fox' + assert attrs.get('_name') == 'fox' + assert attrs.get('_alias') == 'alias' + + for adj_list in pattern.adjacency.values(): + assert not adj_list + + def test_define_node_with_regex(self): + query = "fox [lemma:/fo.*/]" + pattern = PatternParser.parse(query) + + attrs = pattern['fox'] + assert attrs.get('lemma') == re.compile(r'fo.*', re.U) + + def test_define_edge(self): + query = "[word:quick] >amod [word:fox]" + pattern = PatternParser.parse(query) + + assert pattern is not None + assert pattern.number_of_nodes() == 2 + assert pattern.number_of_edges() == 1 + + base_node_id = list(pattern.adjacency.keys())[0] + adj_map = pattern.adjacency[base_node_id] + + assert len(adj_map) == 1 + head_node_id = list(adj_map.keys())[0] + dep = adj_map[head_node_id] + + assert dep == 'amod' + assert pattern[base_node_id]['word'] == 'fox' + assert pattern[head_node_id]['word'] == 'quick' + + def test_define_edge_with_regex(self): + query = "[word:quick] >/amod|nsubj/ [word:fox]" + pattern = PatternParser.parse(query) + + base_node_id = list(pattern.adjacency.keys())[0] + adj_map = pattern.adjacency[base_node_id] + + assert len(adj_map) == 1 + head_node_id = list(adj_map.keys())[0] + dep = adj_map[head_node_id] + assert dep == re.compile(r'amod|nsubj', re.U) diff --git a/spacy/tests/pattern/pattern.py b/spacy/tests/pattern/pattern.py new file mode 100644 index 000000000..a476f92f7 --- /dev/null +++ b/spacy/tests/pattern/pattern.py @@ -0,0 +1,61 @@ +# coding: utf-8 + +from ..util import get_doc +from ...pattern.pattern import Tree, DependencyTree +from ...pattern.parser import PatternParser + +import pytest + +import logging +logger = logging.getLogger() 
+logger.addHandler(logging.StreamHandler()) +logger.setLevel(logging.DEBUG) + + +@pytest.fixture +def doc(en_vocab): + words = ['I', "'m", 'going', 'to', 'the', 'zoo', 'next', 'week', '.'] + doc = get_doc(en_vocab, + words=words, + deps=['nsubj', 'aux', 'ROOT', 'prep', 'det', 'pobj', + 'amod', 'npadvmod', 'punct'], + heads=[2, 1, 0, -1, 1, -2, 1, -5, -6]) + return doc + + +class TestTree: + def test_is_connected(self): + tree = Tree() + tree.add_node(1) + tree.add_node(2) + tree.add_edge(1, 2) + + assert tree.is_connected() + + tree.add_node(3) + assert not tree.is_connected() + + +class TestDependencyTree: + def test_from_doc(self, doc): + dep_tree = DependencyTree(doc) + + assert len(dep_tree) == len(doc) + assert dep_tree.is_connected() + assert dep_tree.number_of_edges() == len(doc) - 1 + + def test_simple_matching(self, doc): + dep_tree = DependencyTree(doc) + pattern = PatternParser.parse("""root [word:going] + to [word:to] + [word:week]=date > root + [word:/zoo|park/]=place >pobj to + to >prep root + """) + assert pattern is not None + matches = dep_tree.match(pattern) + assert len(matches) == 1 + + match = matches[0] + assert match['place'] == doc[5] + assert match['date'] == doc[7] From e4a45ae55fba89a65fc0851783fd712ae6d1755d Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Mon, 12 Jun 2017 12:28:51 +0200 Subject: [PATCH 057/195] Very minor documentation fix --- website/docs/usage/customizing-tokenizer.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index b1fbba652..354a56c22 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -214,7 +214,7 @@ p def __call__(self, text): words = text.split(' ') # All tokens 'own' a subsequent space character in this tokenizer - spaces = [True] * len(word) + spaces = [True] * len(words) return Doc(self.vocab, words=words, spaces=spaces) p From d19ce29a23de1805be3bb2b0a694a38d671fdfb3 Mon Sep 17 00:00:00 2001 From: Ian Mobbs Date: Mon, 12 Jun 2017 13:21:44 -0400 Subject: [PATCH 058/195] Create requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 8194dee58..20c587841 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ thinc>=6.5.0,<6.6.0 murmurhash>=0.26,<0.27 plac<1.0.0,>=0.9.6 six +html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 requests>=2.13.0,<3.0.0 From 81166c3d563bf5c3ca86924b06c4fd44dd6e3a11 Mon Sep 17 00:00:00 2001 From: Nathan Glenn Date: Wed, 21 Jun 2017 19:22:30 +0200 Subject: [PATCH 059/195] fix confusing typo This document describes the `Vocab` class, not the `Span` class. --- website/docs/api/vocab.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/vocab.jade b/website/docs/api/vocab.jade index 7490bccf4..c036c650b 100644 --- a/website/docs/api/vocab.jade +++ b/website/docs/api/vocab.jade @@ -124,7 +124,7 @@ p +cell #[code Lexeme] +cell The lexeme indicated by the given ID. -+h(2, "iter") Span.__iter__ ++h(2, "iter") Vocab.__iter__ +tag method p Iterate over the lexemes in the vocabulary. 
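The customizing-tokenizer patch above corrects the whitespace tokenizer example in the docs (`len(word)` -> `len(words)`). For reference, a minimal runnable sketch of that corrected tokenizer; the `WhitespaceTokenizer` class name, the bare `Vocab()` and the demo sentence are illustrative assumptions, not part of the patched docs page:

    # Minimal sketch: a whitespace-only tokenizer that builds a Doc directly.
    from spacy.vocab import Vocab
    from spacy.tokens import Doc


    class WhitespaceTokenizer(object):
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            words = text.split(' ')
            # All tokens 'own' a subsequent space character in this tokenizer
            spaces = [True] * len(words)
            return Doc(self.vocab, words=words, spaces=spaces)


    if __name__ == '__main__':
        tokenizer = WhitespaceTokenizer(Vocab())
        doc = tokenizer(u'the quick brown fox jumps over the lazy dog')
        print([token.text for token in doc])  # one token per whitespace-delimited word

The customizing-tokenizer docs page that this patch touches covers how such an object is plugged into the processing pipeline.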
From f69ff1508959e60ced2a0bf329aae07710bc9bde Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 27 Jun 2017 14:49:02 +0200 Subject: [PATCH 060/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index ea6096a52..c419a03cf 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -25,6 +25,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Ines Montani, [@ines](https://github.com/ines) * J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading) * Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan) +* Jim Regan, [@jimregan](https://github.com/jimregan) * Jordan Suchow, [@suchow](https://github.com/suchow) * Josh Reeter, [@jreeter](https://github.com/jreeter) * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks) From 84041a2bb517841d725781bdd72b1daf4f8e603d Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 28 Jun 2017 01:18:05 +0900 Subject: [PATCH 061/195] Make create_tokenizer work with Japanese --- spacy/ja/__init__.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 07e40ada6..1c85ded95 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -3,21 +3,39 @@ from __future__ import unicode_literals, print_function from os import path -from ..language import Language +from ..language import Language, BaseDefaults +from ..tokenizer import Tokenizer from ..attrs import LANG from ..tokens import Doc from .language_data import * - -class Japanese(Language): - lang = 'ja' - - def make_doc(self, text): +class JapaneseTokenizer(object): + def __init__(self, cls, nlp=None): + self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) try: from janome.tokenizer import Tokenizer except ImportError: raise ImportError("The Japanese tokenizer requires the Janome library: " "https://github.com/mocobeta/janome") - words = [x.surface for x in Tokenizer().tokenize(text)] + self.tokenizer = Tokenizer() + + def __call__(self, text): + words = [x.surface for x in self.tokenizer.tokenize(text)] return Doc(self.vocab, words=words, spaces=[False]*len(words)) + +class JapaneseDefaults(BaseDefaults): + @classmethod + def create_tokenizer(cls, nlp=None): + return JapaneseTokenizer(cls, nlp) + +class Japanese(Language): + lang = 'ja' + + Defaults = JapaneseDefaults + + def make_doc(self, text): + words = self.tokenizer(text) + return Doc(self.vocab, words=words, spaces=[False]*len(words)) + + From e56fea14eb7e807d5ea4ee5fdd12f7ca0610690a Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 28 Jun 2017 01:24:25 +0900 Subject: [PATCH 062/195] Add basic Japanese tokenizer test --- spacy/tests/conftest.py | 8 +++++++- spacy/tests/ja/__init__.py | 0 spacy/tests/ja/test_tokenizer.py | 8 ++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/ja/__init__.py create mode 100644 spacy/tests/ja/test_tokenizer.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b8ada1d9a..b0f11b5a4 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -5,6 +5,7 @@ from ..en import English from ..de import German from ..es import Spanish from ..it import Italian +from ..ja import Japanese from ..fr import French from ..pt import Portuguese from ..nl import Dutch @@ -27,7 +28,7 @@ import os import pytest -LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, +LANGUAGES = 
[English, German, Spanish, Italian, Japanese, French, Portuguese, Dutch, Swedish, Hungarian, Finnish, Bengali, Norwegian] @@ -76,6 +77,11 @@ def fi_tokenizer(): return Finnish.Defaults.create_tokenizer() +@pytest.fixture +def ja_tokenizer(): + return Japanese.Defaults.create_tokenizer() + + @pytest.fixture def sv_tokenizer(): return Swedish.Defaults.create_tokenizer() diff --git a/spacy/tests/ja/__init__.py b/spacy/tests/ja/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/ja/test_tokenizer.py b/spacy/tests/ja/test_tokenizer.py new file mode 100644 index 000000000..8d45c822d --- /dev/null +++ b/spacy/tests/ja/test_tokenizer.py @@ -0,0 +1,8 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +def test_japanese_tokenizer(ja_tokenizer): + tokens = ja_tokenizer("æ—ĨæœŦčĒžã ã‚ˆ") + assert len(tokens) == 3 From 1b3a5d87bad69dcb8ec9cdb26ec030f7894708ec Mon Sep 17 00:00:00 2001 From: Alexis Date: Wed, 28 Jun 2017 14:11:20 +0200 Subject: [PATCH 063/195] French NUM_WORDS and ORDINAL_WORDS --- spacy/fr/stop_words.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/spacy/fr/stop_words.py b/spacy/fr/stop_words.py index d9b820537..71f124d6c 100644 --- a/spacy/fr/stop_words.py +++ b/spacy/fr/stop_words.py @@ -86,3 +86,28 @@ votre vous vous-mÃĒmes vu vÊ vôtre vôtres zut """.split()) + + + +# Number words + +NUM_WORDS = set(""" +zero un deux trois quatre cinq six sept huit neuf dix +onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf +vingt trente quanrante cinquante soixante septante quatre-vingt huitante nonante +cent mille mil million milliard billion quadrillion quintillion +sextillion septillion octillion nonillion decillion +""".split()) + +# Ordinal words + +ORDINAL_WORDS = set(""" +premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième +onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neufième +vingtième trentième quanrantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième +centième millième millionnième milliardième billionnième quadrillionnième quintillionnième +sextillionnième septillionnième octillionnième nonillionnième decillionnième +""".split()) + + + From 30a34ebb6edb513e262d1f47b6742b4480282f3c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 29 Jun 2017 00:09:20 +0900 Subject: [PATCH 064/195] Add importorskip for janome --- spacy/tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b0f11b5a4..222f9aa1d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -79,6 +79,7 @@ def fi_tokenizer(): @pytest.fixture def ja_tokenizer(): + janome = pytest.importorskip("janome") return Japanese.Defaults.create_tokenizer() From c33619339217dbeff75243d7493dc60685ddf28c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 29 Jun 2017 00:09:40 +0900 Subject: [PATCH 065/195] Parametrize and extend Japanese tokenizer tests --- spacy/tests/ja/test_tokenizer.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/spacy/tests/ja/test_tokenizer.py b/spacy/tests/ja/test_tokenizer.py index 8d45c822d..58700b353 100644 --- a/spacy/tests/ja/test_tokenizer.py +++ b/spacy/tests/ja/test_tokenizer.py @@ -3,6 +3,15 @@ from __future__ import unicode_literals import pytest -def test_japanese_tokenizer(ja_tokenizer): - tokens = ja_tokenizer("æ—ĨæœŦčĒžã ã‚ˆ") - assert 
len(tokens) == 3 +TOKENIZER_TESTS = [ + ("æ—ĨæœŦčĒžã ã‚ˆ", ['æ—ĨæœŦčĒž', 'だ', 'よ']), + ("æąäēŦã‚ŋワãƒŧぎčŋ‘くãĢäŊã‚“でいぞす。", ['æąäēŦ', 'ã‚ŋワãƒŧ', 'ぎ', 'čŋ‘く', 'ãĢ', 'äŊã‚“', 'で', 'い', 'ぞす', '。']), + ("吞čŧŠã¯įŒĢである。", ['吞čŧŠ', 'は', 'įŒĢ', 'で', 'ある', '。']), + ("月ãĢäģŖわãŖãĻ、おäģ•įŊŽãã‚ˆ!", ['月', 'ãĢ', 'äģŖわãŖ', 'ãĻ', '、', 'おäģ•įŊŽã', 'よ', '!']), + ("ã™ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ãŽã†ãĄ", ['すもも', 'も', 'もも', 'も', 'もも', 'ぎ', 'ã†ãĄ']) +] + +@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) +def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens): + tokens = [token.text for token in ja_tokenizer(text)] + assert tokens == expected_tokens From dfaeee1f37d8b7b614e55cd732c6c89abb9afd92 Mon Sep 17 00:00:00 2001 From: Callum Kift Date: Fri, 30 Jun 2017 09:56:33 +0200 Subject: [PATCH 066/195] fixed bug in training ner documentation and example --- examples/training/train_new_entity_type.py | 2 +- website/docs/usage/training-ner.jade | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 4eae11c75..987ab5859 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -52,6 +52,7 @@ def train_ner(nlp, train_data, output_dir): random.shuffle(train_data) loss = 0. for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) gold = GoldParse(doc, entities=entity_offsets) # By default, the GoldParse class assumes that the entities # described by offset are complete, and all other words should @@ -63,7 +64,6 @@ def train_ner(nlp, train_data, output_dir): #for i in range(len(gold.ner)): #if not gold.ner[i].endswith('ANIMAL'): # gold.ner[i] = '-' - doc = nlp.make_doc(raw_text) nlp.tagger(doc) # As of 1.9, spaCy's parser now lets you supply a dropout probability # This might help the model generalize better from only a few diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade index 78eb4905e..52eedd21e 100644 --- a/website/docs/usage/training-ner.jade +++ b/website/docs/usage/training-ner.jade @@ -150,8 +150,8 @@ p for itn in range(20): random.shuffle(train_data) for raw_text, entity_offsets in train_data: - gold = GoldParse(doc, entities=entity_offsets) doc = nlp.make_doc(raw_text) + gold = GoldParse(doc, entities=entity_offsets) nlp.tagger(doc) loss = nlp.entity.update(doc, gold) nlp.end_training() From 669bd142130f3e3c66b253efd0df1dd7ce2ba3f4 Mon Sep 17 00:00:00 2001 From: gispk47 Date: Sat, 1 Jul 2017 13:12:00 +0800 Subject: [PATCH 067/195] Update __init__.py remove the empty string return from jieba.cut,this will cause the list of tokens cant be pushed assert error --- spacy/zh/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/zh/__init__.py b/spacy/zh/__init__.py index 1847a7d8d..0f407dec6 100644 --- a/spacy/zh/__init__.py +++ b/spacy/zh/__init__.py @@ -8,4 +8,5 @@ class Chinese(Language): def make_doc(self, text): import jieba words = list(jieba.cut(text, cut_all=True)) + words=[x for x in words if x] return Doc(self.vocab, words=words, spaces=[False]*len(words)) From c3d722d66f150a69037340e4daf03ec921f4e489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sat, 1 Jul 2017 13:09:50 +0200 Subject: [PATCH 068/195] Add a disclaimer about classes copied from the Jinja2 project --- spacy/pattern/parser.py | 43 ++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git 
a/spacy/pattern/parser.py b/spacy/pattern/parser.py index a36446a1a..4b6fbc8dd 100644 --- a/spacy/pattern/parser.py +++ b/spacy/pattern/parser.py @@ -237,25 +237,6 @@ class PatternParser(object): yield Token(lineno, token, value) -class Token(tuple): - """Token class.""" - __slots__ = () - lineno, type, value = (property(itemgetter(x)) for x in range(3)) - - def __new__(cls, lineno, type, value): - return tuple.__new__(cls, (lineno, intern(str(type)), value)) - - def hash(self): - string = str(self.value) - return md5(string.encode('utf-8')).hexdigest() - - def __repr__(self): - return 'Token(%r, %r, %r)' % ( - self.lineno, - self.type, - self.value) - - class Reader(object): """A class used by the :class:`PatternParser` to tokenize the `text`.""" __slots__ = ('text', 'pos') @@ -283,7 +264,7 @@ class Reader(object): return s def next(self): - return self.text[self.pos:self.pos+1] + return self.text[self.pos:self.pos + 1] def remaining(self): return len(self.text) - self.pos @@ -301,6 +282,28 @@ class Reader(object): return self.text[self.pos:] +# The following classes were copied from Jinja2, a BSD-licensed project, +# and slightly modified: Token, TokenStreamIterator, TokenStream. + +class Token(tuple): + """Token class.""" + __slots__ = () + lineno, type, value = (property(itemgetter(x)) for x in range(3)) + + def __new__(cls, lineno, type, value): + return tuple.__new__(cls, (lineno, intern(str(type)), value)) + + def hash(self): + string = str(self.value) + return md5(string.encode('utf-8')).hexdigest() + + def __repr__(self): + return 'Token(%r, %r, %r)' % ( + self.lineno, + self.type, + self.value) + + class TokenStreamIterator(object): """The iterator for tokenstreams. Iterate over the stream until the stream is empty. From f4748834d973a525024cac18fb58fe9934957170 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sat, 1 Jul 2017 13:17:26 +0200 Subject: [PATCH 069/195] Use spacy hash_string function instead of md5 --- spacy/pattern/parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pattern/parser.py b/spacy/pattern/parser.py index 4b6fbc8dd..122d2b8f3 100644 --- a/spacy/pattern/parser.py +++ b/spacy/pattern/parser.py @@ -1,9 +1,9 @@ # coding: utf-8 from spacy.compat import intern, queue +from spacy.strings import hash_string from operator import itemgetter import re -from hashlib import md5 import json from .pattern import DependencyPattern @@ -294,8 +294,8 @@ class Token(tuple): return tuple.__new__(cls, (lineno, intern(str(type)), value)) def hash(self): - string = str(self.value) - return md5(string.encode('utf-8')).hexdigest() + string = self.value + return hash_string(string) def __repr__(self): return 'Token(%r, %r, %r)' % ( From 8592f3de47406daed2a26e3d0927a7706b1191d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sat, 1 Jul 2017 15:03:32 +0200 Subject: [PATCH 070/195] Fix fuzzy unit tests --- spacy/tests/pattern/parser.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/spacy/tests/pattern/parser.py b/spacy/tests/pattern/parser.py index a56bda20a..50dd3ac60 100644 --- a/spacy/tests/pattern/parser.py +++ b/spacy/tests/pattern/parser.py @@ -44,25 +44,33 @@ class TestPatternParser: assert pattern.number_of_nodes() == 2 assert pattern.number_of_edges() == 1 - base_node_id = list(pattern.adjacency.keys())[0] - adj_map = pattern.adjacency[base_node_id] + quick_id = [node_id for node_id, node_attr in pattern.nodes.items() + if 
node_attr['word'] == 'quick'][0] - assert len(adj_map) == 1 - head_node_id = list(adj_map.keys())[0] - dep = adj_map[head_node_id] + fox_id = [node_id for node_id, node_attr in pattern.nodes.items() + if node_attr['word'] == 'fox'][0] + + quick_map = pattern.adjacency[quick_id] + fox_map = pattern.adjacency[fox_id] + + assert len(quick_map) == 0 + assert len(fox_map) == 1 + + dep = fox_map[quick_id] assert dep == 'amod' - assert pattern[base_node_id]['word'] == 'fox' - assert pattern[head_node_id]['word'] == 'quick' def test_define_edge_with_regex(self): query = "[word:quick] >/amod|nsubj/ [word:fox]" pattern = PatternParser.parse(query) - base_node_id = list(pattern.adjacency.keys())[0] - adj_map = pattern.adjacency[base_node_id] + quick_id = [node_id for node_id, node_attr in pattern.nodes.items() + if node_attr['word'] == 'quick'][0] + + fox_id = [node_id for node_id, node_attr in pattern.nodes.items() + if node_attr['word'] == 'fox'][0] + + fox_map = pattern.adjacency[fox_id] + dep = fox_map[quick_id] - assert len(adj_map) == 1 - head_node_id = list(adj_map.keys())[0] - dep = adj_map[head_node_id] assert dep == re.compile(r'amod|nsubj', re.U) From 5357874bf74b05a40961ba05936f6009453a48b8 Mon Sep 17 00:00:00 2001 From: Swier Date: Wed, 5 Jul 2017 14:03:30 +0200 Subject: [PATCH 071/195] add Dutch numbers and ordinals --- spacy/nl/stop_words.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/spacy/nl/stop_words.py b/spacy/nl/stop_words.py index 22f1d714c..d19515262 100644 --- a/spacy/nl/stop_words.py +++ b/spacy/nl/stop_words.py @@ -41,3 +41,22 @@ want waren was wat we wel werd wezen wie wij wil worden zal ze zei zelf zich zij zijn zo zonder zou """.split()) + + +# Number words + +NUM_WORDS = set(""" +nul een ÊÊn twee drie vier vijf zes zeven acht negen tien elf twaalf dertien +veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd +duizend miljoen miljard biljoen biljard triljoen triljard +""".split()) + + +# Ordinal words + +ORDINAL_WORDS = set(""" +eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde +twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste +zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste +miljardste biljoenste biljardste triljoenste triljardste +""".split()) From f377c9c952ed6b42086c0ee9fcedb5a67af963b4 Mon Sep 17 00:00:00 2001 From: Swier Date: Wed, 5 Jul 2017 14:06:28 +0200 Subject: [PATCH 072/195] Rename stop_words.py to word_sets.py --- spacy/nl/{stop_words.py => word_sets.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/nl/{stop_words.py => word_sets.py} (100%) diff --git a/spacy/nl/stop_words.py b/spacy/nl/word_sets.py similarity index 100% rename from spacy/nl/stop_words.py rename to spacy/nl/word_sets.py From 29720150f9960c1a57b2d463d4653e0a8f3211e0 Mon Sep 17 00:00:00 2001 From: Swier Date: Wed, 5 Jul 2017 14:08:04 +0200 Subject: [PATCH 073/195] fix import of stop words in language data --- spacy/nl/language_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/nl/language_data.py b/spacy/nl/language_data.py index f9899d8d1..b3ca1aef9 100644 --- a/spacy/nl/language_data.py +++ b/spacy/nl/language_data.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from .. 
import language_data as base from ..language_data import update_exc, strings_to_exc -from .stop_words import STOP_WORDS +from .word_sets import STOP_WORDS, NUM_WORDS STOP_WORDS = set(STOP_WORDS) From 19d4706f69b8788bffc43ab0bf07a80a1ed5bdab Mon Sep 17 00:00:00 2001 From: val314159 Date: Fri, 7 Jul 2017 13:18:17 -0700 Subject: [PATCH 074/195] make this work in python2.7 --- website/docs/usage/lightning-tour.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 138b0058d..2fd390d26 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -83,7 +83,7 @@ p +h(2, "examples-word-vectors") Word vectors +code. - doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") + doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.") apples = doc[0] oranges = doc[2] From 04e6a6518869b1ca15beb79694049e0fb164a2aa Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 9 Jul 2017 16:23:26 +0900 Subject: [PATCH 075/195] Remove Japanese from LANGUAGES LANGUAGES is a list of languages whose tokenizers get run through a variety of generic tests. Since the generic tests don't check the JA fixture, it blows up when it can't find janome. -POLM --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 222f9aa1d..29d896a5d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -28,7 +28,7 @@ import os import pytest -LANGUAGES = [English, German, Spanish, Italian, Japanese, French, Portuguese, Dutch, +LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, Swedish, Hungarian, Finnish, Bengali, Norwegian] From bc87b815cc34d375e1a4b4c9b54c296691cee237 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 9 Jul 2017 16:28:55 +0900 Subject: [PATCH 076/195] Add comment clarifying what LANGUAGES does --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 29d896a5d..6e00b1513 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -27,7 +27,7 @@ from pathlib import Path import os import pytest - +# These languages get run through generic tokenizer tests LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, Swedish, Hungarian, Finnish, Bengali, Norwegian] From 6cf26909438230b4f9626d6cf25a19ecd0d1555c Mon Sep 17 00:00:00 2001 From: lgenerknol Date: Wed, 12 Jul 2017 11:06:16 -0400 Subject: [PATCH 077/195] Missing markup char Frontend displayed: ``` If start_idx and do not mark[...] ``` Note the missing "end_idx" after 'and'. --- website/docs/api/doc.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index adcd111a3..1c2911f52 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -272,7 +272,7 @@ p Import the document contents from a binary string. p | Retokenize the document, such that the span at | #[code doc.text[start_idx : end_idx]] is merged into a single token. If - | #[code start_idx] and #[end_idx] do not mark start and end token + | #[code start_idx] and #[code end_idx] do not mark start and end token | boundaries, the document remains unchanged. 
+table(["Name", "Type", "Description"]) From 2b219caf0d01e98e10b82b940ba184a63ead64a5 Mon Sep 17 00:00:00 2001 From: lgenerknol Date: Wed, 12 Jul 2017 13:12:24 -0400 Subject: [PATCH 078/195] .../cli/#foo is 404 https://spacy.io/docs/usage/cli/#package is a 404. Changed to https://spacy.io/docs/usage/cli#package Definitely a larger fix possible to deal with trailing slashes --- website/docs/usage/saving-loading.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index c4eb08f04..8978cce7a 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -28,7 +28,7 @@ p | and walk you through generating the meta data. You can also create the | meta.json manually and place it in the model data directory, or supply a | path to it using the #[code --meta] flag. For more info on this, see the - | #[+a("/docs/usage/cli/#package") #[code package] command] documentation. + | #[+a("/docs/usage/cli#package") #[code package] command] documentation. +aside-code("meta.json", "json"). { From fadacd0d47a898173ae68bdfb758e688f7a176ce Mon Sep 17 00:00:00 2001 From: Jorge Paredes Date: Sun, 16 Jul 2017 10:06:32 -0500 Subject: [PATCH 079/195] Fix url broken The related url to **custom named entities** was broken --- website/docs/usage/models.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 9bb75ba9a..30863720c 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -203,7 +203,7 @@ p p | If you've trained your own model, for example for | #[+a("/docs/usage/adding-languages") additional languages] or - | #[+a("/docs/usage/train-ner") custom named entities], you can save its + | #[+a("/docs/usage/training-ner") custom named entities], you can save its | state using the #[code Language.save_to_directory()] method. To make the | model more convenient to deploy, we recommend wrapping it as a Python | package. 
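Note on the workflow the models.jade passage above describes: it points to
Language.save_to_directory() for persisting a trained pipeline before wrapping
it as a Python package. A rough sketch of that workflow for spaCy 1.x follows;
the output path is illustrative and not taken from the patch.

    # Hedged sketch: saving a trained spaCy 1.x pipeline to disk, assuming
    # `nlp` already holds the Language object you trained.
    from pathlib import Path
    import spacy

    nlp = spacy.load('en')                  # or your own trained pipeline
    output_dir = Path('/tmp/my_model')      # hypothetical location
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.save_to_directory(str(output_dir))  # writes out vocab, tagger, parser, NER
    # The resulting directory can then be wrapped with the `spacy package`
    # command, as the documentation above describes.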
From 8bb443be4fc63fd76e6ddf48008aacfe3a716398 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:28:51 +0200 Subject: [PATCH 080/195] Add standalone tagger training example --- examples/training/train_tagger_ud.py | 150 +++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 examples/training/train_tagger_ud.py diff --git a/examples/training/train_tagger_ud.py b/examples/training/train_tagger_ud.py new file mode 100644 index 000000000..3015c52e8 --- /dev/null +++ b/examples/training/train_tagger_ud.py @@ -0,0 +1,150 @@ +from __future__ import unicode_literals +from __future__ import print_function + +import plac +import codecs +import spacy.symbols as symbols +import spacy +from pathlib import Path + +from spacy.vocab import Vocab +from spacy.tagger import Tagger +from spacy.tokens import Doc +from spacy.gold import GoldParse +from spacy.language import Language +from spacy import orth +from spacy import attrs + +import random + +TAG_MAP = { + 'ADJ': {symbols.POS: symbols.ADJ}, + 'ADP': {symbols.POS: symbols.ADP}, + 'PUNCT': {symbols.POS: symbols.PUNCT}, + 'ADV': {symbols.POS: symbols.ADV}, + 'AUX': {symbols.POS: symbols.AUX}, + 'SYM': {symbols.POS: symbols.SYM}, + 'INTJ': {symbols.POS: symbols.INTJ}, + 'CCONJ': {symbols.POS: symbols.CCONJ}, + 'X': {symbols.POS: symbols.X}, + 'NOUN': {symbols.POS: symbols.NOUN}, + 'DET': {symbols.POS: symbols.DET}, + 'PROPN': {symbols.POS: symbols.PROPN}, + 'NUM': {symbols.POS: symbols.NUM}, + 'VERB': {symbols.POS: symbols.VERB}, + 'PART': {symbols.POS: symbols.PART}, + 'PRON': {symbols.POS: symbols.PRON}, + 'SCONJ': {symbols.POS: symbols.SCONJ}, +} + +LEX_ATTR_GETTERS = { + attrs.LOWER: lambda string: string.lower(), + attrs.NORM: lambda string: string, + attrs.SHAPE: orth.word_shape, + attrs.PREFIX: lambda string: string[0], + attrs.SUFFIX: lambda string: string[-3:], + attrs.CLUSTER: lambda string: 0, + attrs.IS_ALPHA: orth.is_alpha, + attrs.IS_ASCII: orth.is_ascii, + attrs.IS_DIGIT: lambda string: string.isdigit(), + attrs.IS_LOWER: orth.is_lower, + attrs.IS_PUNCT: orth.is_punct, + attrs.IS_SPACE: lambda string: string.isspace(), + attrs.IS_TITLE: orth.is_title, + attrs.IS_UPPER: orth.is_upper, + attrs.IS_BRACKET: orth.is_bracket, + attrs.IS_QUOTE: orth.is_quote, + attrs.IS_LEFT_PUNCT: orth.is_left_punct, + attrs.IS_RIGHT_PUNCT: orth.is_right_punct, + attrs.LIKE_URL: orth.like_url, + attrs.LIKE_NUM: orth.like_number, + attrs.LIKE_EMAIL: orth.like_email, + attrs.IS_STOP: lambda string: False, + attrs.IS_OOV: lambda string: True +} + + +def read_ud_data(path): + data = [] + last_number = -1 + sentence_words = [] + sentence_tags = [] + with codecs.open(path, encoding="utf-8") as f: + while True: + line = f.readline() + if not line: + break + + if line[0].isdigit(): + d = line.split() + if not "-" in d[0]: + number = int(line[0]) + if number < last_number: + data.append((sentence_words, sentence_tags),) + sentence_words = [] + sentence_tags = [] + sentence_words.append(d[2]) + sentence_tags.append(d[3]) + last_number = number + if len(sentence_words) > 0: + data.append((sentence_words, sentence_tags,)) + return data + +def ensure_dir(path): + if not path.exists(): + path.mkdir() + + +def main(train_loc, dev_loc, output_dir=None): + if output_dir is not None: + output_dir = Path(output_dir) + ensure_dir(output_dir) + ensure_dir(output_dir / "pos") + ensure_dir(output_dir / "vocab") + + train_data = read_ud_data(train_loc) + vocab = Vocab(tag_map=TAG_MAP, lex_attr_getters=LEX_ATTR_GETTERS) + # Populate vocab + 
for words, _ in train_data: + for word in words: + _ = vocab[word] + + model = spacy.tagger.TaggerModel(spacy.tagger.Tagger.feature_templates) + tagger = Tagger(vocab, model) + print(tagger.tag_names) + for i in range(30): + print("training model (iteration " + str(i) + ")...") + score = 0. + num_samples = 0. + for words, tags in train_data: + doc = Doc(vocab, words=words) + gold = GoldParse(doc, tags=tags) + cost = tagger.update(doc, gold) + for i, word in enumerate(doc): + num_samples += 1 + if word.tag_ == tags[i]: + score += 1 + print('Train acc', score/num_samples) + random.shuffle(train_data) + tagger.model.end_training() + + score = 0.0 + test_data = read_ud_data(dev_loc) + num_samples = 0 + for words, tags in test_data: + doc = Doc(vocab, words) + tagger(doc) + for i, word in enumerate(doc): + num_samples += 1 + if word.tag_ == tags[i]: + score += 1 + print("score: " + str(score / num_samples * 100.0)) + + if output_dir is not None: + tagger.model.dump(str(output_dir / 'pos' / 'model')) + with (output_dir / 'vocab' / 'strings.json').open('w') as file_: + tagger.vocab.strings.dump(file_) + + +if __name__ == '__main__': + plac.call(main) From 3fef5f642bd7f40cbc41319e51e71579bde791f9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:29:15 +0200 Subject: [PATCH 081/195] Rename tagger training example --- .../{train_tagger_ud.py => train_tagger_standalone_ud.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/training/{train_tagger_ud.py => train_tagger_standalone_ud.py} (100%) diff --git a/examples/training/train_tagger_ud.py b/examples/training/train_tagger_standalone_ud.py similarity index 100% rename from examples/training/train_tagger_ud.py rename to examples/training/train_tagger_standalone_ud.py From a405660068f9f1c17a71a54866f475b2b13eef6c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:32:48 +0200 Subject: [PATCH 082/195] Add commit to tagger example --- examples/training/train_new_entity_type.py | 4 ++-- examples/training/train_tagger_standalone_ud.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 4eae11c75..6c432acdf 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -24,8 +24,8 @@ For more details, see the documentation: * Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner * Saving and loading models: https://spacy.io/docs/usage/saving-loading -Developed for: spaCy 1.7.6 -Last tested for: spaCy 1.7.6 +Developed for: spaCy 1.9.0 +Last tested for: spaCy 1.9.0 """ from __future__ import unicode_literals, print_function diff --git a/examples/training/train_tagger_standalone_ud.py b/examples/training/train_tagger_standalone_ud.py index 3015c52e8..ce1ab50d6 100644 --- a/examples/training/train_tagger_standalone_ud.py +++ b/examples/training/train_tagger_standalone_ud.py @@ -1,3 +1,17 @@ +''' +This example shows training of the POS tagger without the Language class, +showing the APIs of the atomic components. + +This example was adapted from the gist here: + +https://gist.github.com/kamac/a7bc139f62488839a8118214a4d932f2 + +Issue discussing the gist: + +https://github.com/explosion/spaCy/issues/1179 + +The example was written for spaCy 1.8.2. 
+''' from __future__ import unicode_literals from __future__ import print_function From 5916d46ba8a9c85f5f8c115bb831561e3c64d256 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:34:01 +0200 Subject: [PATCH 083/195] Avoid use of deepcopy in printer --- spacy/tokens/printers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index d70088540..487d74167 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -49,6 +49,7 @@ def parse_tree(doc, light=False, flat=False): >>> trees = doc.print_tree() [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] """ - doc_clone = deepcopy(doc) + doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) + doc_clone.from_array(doc.to_array([HEAD, DEP, TAG, ENT_IOB, ENT_TYPE]) merge_ents(doc_clone) # merge the entities into single tokens first return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents] From 8b581fdac515173f80a2b1560f2b58286d3c92e3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:36:54 +0200 Subject: [PATCH 084/195] Remove unused example --- examples/chainer_sentiment.py | 322 ---------------------------------- 1 file changed, 322 deletions(-) delete mode 100644 examples/chainer_sentiment.py diff --git a/examples/chainer_sentiment.py b/examples/chainer_sentiment.py deleted file mode 100644 index 747ef508a..000000000 --- a/examples/chainer_sentiment.py +++ /dev/null @@ -1,322 +0,0 @@ -'''WIP --- Doesn't work well yet''' -import plac -import random -import six - -import cProfile -import pstats - -import pathlib -import cPickle as pickle -from itertools import izip - -import spacy - -import cytoolz -import cupy as xp -import cupy.cuda -import chainer.cuda - -import chainer.links as L -import chainer.functions as F -from chainer import Chain, Variable, report -import chainer.training -import chainer.optimizers -from chainer.training import extensions -from chainer.iterators import SerialIterator -from chainer.datasets import TupleDataset - - -class SentimentAnalyser(object): - @classmethod - def load(cls, path, nlp, max_length=100): - raise NotImplementedError - #with (path / 'config.json').open() as file_: - # model = model_from_json(file_.read()) - #with (path / 
'model').open('rb') as file_: - # lstm_weights = pickle.load(file_) - #embeddings = get_embeddings(nlp.vocab) - #model.set_weights([embeddings] + lstm_weights) - #return cls(model, max_length=max_length) - - def __init__(self, model, max_length=100): - self._model = model - self.max_length = max_length - - def __call__(self, doc): - X = get_features([doc], self.max_length) - y = self._model.predict(X) - self.set_sentiment(doc, y) - - def pipe(self, docs, batch_size=1000, n_threads=2): - for minibatch in cytoolz.partition_all(batch_size, docs): - minibatch = list(minibatch) - sentences = [] - for doc in minibatch: - sentences.extend(doc.sents) - Xs = get_features(sentences, self.max_length) - ys = self._model.predict(Xs) - for sent, label in zip(sentences, ys): - sent.doc.sentiment += label - 0.5 - for doc in minibatch: - yield doc - - def set_sentiment(self, doc, y): - doc.sentiment = float(y[0]) - # Sentiment has a native slot for a single float. - # For arbitrary data storage, there's: - # doc.user_data['my_data'] = y - - -class Classifier(Chain): - def __init__(self, predictor): - super(Classifier, self).__init__(predictor=predictor) - - def __call__(self, x, t): - y = self.predictor(x) - loss = F.softmax_cross_entropy(y, t) - accuracy = F.accuracy(y, t) - report({'loss': loss, 'accuracy': accuracy}, self) - return loss - - -class SentimentModel(Chain): - def __init__(self, nlp, shape, **settings): - Chain.__init__(self, - embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'], - set_vectors=lambda arr: set_vectors(arr, nlp.vocab)), - encode=_Encode(shape['nr_hidden'], shape['nr_hidden']), - attend=_Attend(shape['nr_hidden'], shape['nr_hidden']), - predict=_Predict(shape['nr_hidden'], shape['nr_class'])) - self.to_gpu(0) - - def __call__(self, sentence): - return self.predict( - self.attend( - self.encode( - self.embed(sentence)))) - - -class _Embed(Chain): - def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None): - Chain.__init__(self, - embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors), - project=L.Linear(None, nr_out, nobias=True)) - self.embed.W.volatile = False - - def __call__(self, sentence): - return [self.project(self.embed(ts)) for ts in F.transpose(sentence)] - - -class _Encode(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self, - fwd=L.LSTM(nr_in, nr_out), - bwd=L.LSTM(nr_in, nr_out), - mix=L.Bilinear(nr_out, nr_out, nr_out)) - - def __call__(self, sentence): - self.fwd.reset_state() - fwds = map(self.fwd, sentence) - self.bwd.reset_state() - bwds = reversed(map(self.bwd, reversed(sentence))) - return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)] - - -class _Attend(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self) - - def __call__(self, sentence): - sent = sum(sentence) - return sent - - -class _Predict(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self, - l1=L.Linear(nr_in, nr_in), - l2=L.Linear(nr_in, nr_out)) - - def __call__(self, vector): - vector = self.l1(vector) - vector = F.elu(vector) - vector = self.l2(vector) - return vector - - -class SentenceDataset(TupleDataset): - def __init__(self, nlp, texts, labels, max_length): - self.max_length = max_length - sents, labels = self._get_labelled_sentences( - nlp.pipe(texts, batch_size=5000, n_threads=3), - labels) - TupleDataset.__init__(self, - get_features(sents, max_length), - labels) - - def __getitem__(self, index): - batches = [dataset[index] for dataset in self._datasets] - if isinstance(index, slice): - length = len(batches[0]) - 
returns = [tuple([batch[i] for batch in batches]) - for i in six.moves.range(length)] - return returns - else: - return tuple(batches) - - def _get_labelled_sentences(self, docs, doc_labels): - labels = [] - sentences = [] - for doc, y in izip(docs, doc_labels): - for sent in doc.sents: - sentences.append(sent) - labels.append(y) - return sentences, xp.asarray(labels, dtype='i') - - -class DocDataset(TupleDataset): - def __init__(self, nlp, texts, labels): - self.max_length = max_length - DatasetMixin.__init__(self, - get_features( - nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length), - labels) - -def read_data(data_dir, limit=0): - examples = [] - for subdir, label in (('pos', 1), ('neg', 0)): - for filename in (data_dir / subdir).iterdir(): - with filename.open() as file_: - text = file_.read() - examples.append((text, label)) - random.shuffle(examples) - if limit >= 1: - examples = examples[:limit] - return zip(*examples) # Unzips into two lists - - -def get_features(docs, max_length): - docs = list(docs) - Xs = xp.zeros((len(docs), max_length), dtype='i') - for i, doc in enumerate(docs): - j = 0 - for token in doc: - if token.has_vector and not token.is_punct and not token.is_space: - Xs[i, j] = token.norm - j += 1 - if j >= max_length: - break - return Xs - - -def set_vectors(vectors, vocab): - for lex in vocab: - if lex.has_vector and (lex.rank+1) < vectors.shape[0]: - lex.norm = lex.rank+1 - vectors[lex.rank + 1] = lex.vector - else: - lex.norm = 0 - return vectors - - -def train(train_texts, train_labels, dev_texts, dev_labels, - lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5, - by_sentence=True): - nlp = spacy.load('en', entity=False) - if 'nr_vector' not in lstm_shape: - lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector) - if 'nr_dim' not in lstm_shape: - lstm_shape['nr_dim'] = nlp.vocab.vectors_length - print("Make model") - model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings)) - print("Parsing texts...") - if by_sentence: - train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length']) - dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length']) - else: - train_data = DocDataset(nlp, train_texts, train_labels) - dev_data = DocDataset(nlp, dev_texts, dev_labels) - train_iter = SerialIterator(train_data, batch_size=batch_size, - shuffle=True, repeat=True) - dev_iter = SerialIterator(dev_data, batch_size=batch_size, - shuffle=False, repeat=False) - optimizer = chainer.optimizers.Adam() - optimizer.setup(model) - updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0) - trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result') - - trainer.extend(extensions.Evaluator(dev_iter, model, device=0)) - trainer.extend(extensions.LogReport()) - trainer.extend(extensions.PrintReport([ - 'epoch', 'main/accuracy', 'validation/main/accuracy'])) - trainer.extend(extensions.ProgressBar()) - - trainer.run() - - -def evaluate(model_dir, texts, labels, max_length=100): - def create_pipeline(nlp): - ''' - This could be a lambda, but named functions are easier to read in Python. 
- ''' - return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp, - max_length=max_length)] - - nlp = spacy.load('en') - nlp.pipeline = create_pipeline(nlp) - - correct = 0 - i = 0 - for doc in nlp.pipe(texts, batch_size=1000, n_threads=4): - correct += bool(doc.sentiment >= 0.5) == bool(labels[i]) - i += 1 - return float(correct) / i - - -@plac.annotations( - train_dir=("Location of training file or directory"), - dev_dir=("Location of development file or directory"), - model_dir=("Location of output model directory",), - is_runtime=("Demonstrate run-time usage", "flag", "r", bool), - nr_hidden=("Number of hidden units", "option", "H", int), - max_length=("Maximum sentence length", "option", "L", int), - dropout=("Dropout", "option", "d", float), - learn_rate=("Learn rate", "option", "e", float), - nb_epoch=("Number of training epochs", "option", "i", int), - batch_size=("Size of minibatches for training LSTM", "option", "b", int), - nr_examples=("Limit to N examples", "option", "n", int) -) -def main(model_dir, train_dir, dev_dir, - is_runtime=False, - nr_hidden=64, max_length=100, # Shape - dropout=0.5, learn_rate=0.001, # General NN config - nb_epoch=5, batch_size=32, nr_examples=-1): # Training params - model_dir = pathlib.Path(model_dir) - train_dir = pathlib.Path(train_dir) - dev_dir = pathlib.Path(dev_dir) - if is_runtime: - dev_texts, dev_labels = read_data(dev_dir) - acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length) - print(acc) - else: - print("Read data") - train_texts, train_labels = read_data(train_dir, limit=nr_examples) - dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples) - print("Using GPU 0") - #chainer.cuda.get_device(0).use() - train_labels = xp.asarray(train_labels, dtype='i') - dev_labels = xp.asarray(dev_labels, dtype='i') - lstm = train(train_texts, train_labels, dev_texts, dev_labels, - {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2, - 'nr_vector': 5000}, - {'dropout': 0.5, 'lr': learn_rate}, - {}, - nb_epoch=nb_epoch, batch_size=batch_size) - - -if __name__ == '__main__': - #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") - #s = pstats.Stats("Profile.prof") - #s.strip_dirs().sort_stats("time").print_stats() - plac.call(main) From 69396dcfd35cf40c9706bf1199f3de8b8e7a06a5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 22 Jul 2017 13:43:15 +0200 Subject: [PATCH 085/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index c419a03cf..bfdbf5c4f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -3,6 +3,7 @@ This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work! 
* Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer) +* Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman) * Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv) * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th) * Aniruddha Adhikary [@aniruddha-adhikary](https://github.com/aniruddha-adhikary) @@ -47,6 +48,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Sam Bozek, [@sambozek](https://github.com/sambozek) * Sasho Savkov, [@savkov](https://github.com/savkov) * Shuvanon Razik, [@shuvanon](https://github.com/shuvanon) +* Swier, [@swierh](https://github.com/swierh) * Thomas Tanon, [@Tpt](https://github.com/Tpt) * Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues) * Vsevolod Solovyov, [@vsolovyov](https://github.com/vsolovyov) From 8b9c4c5e1c80e7e3814b39a64e58a24c005b15f0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:43:47 +0200 Subject: [PATCH 086/195] Add missing SP symbol to tag map, re #1052 --- spacy/language_data/tag_map.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/language_data/tag_map.py b/spacy/language_data/tag_map.py index ead6dd1c6..65dab9b0d 100644 --- a/spacy/language_data/tag_map.py +++ b/spacy/language_data/tag_map.py @@ -22,5 +22,6 @@ TAG_MAP = { "CCONJ": {POS: CCONJ}, # U20 "ADJ": {POS: ADJ}, "VERB": {POS: VERB}, - "PART": {POS: PART} + "PART": {POS: PART}, + 'SP': {POS: SPACE} } From 45f6961ae0f54f1e6cbb6fb59158e2ce03e27417 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:45:21 +0200 Subject: [PATCH 087/195] Add __version__ symbol in __init__.py --- spacy/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/__init__.py b/spacy/__init__.py index 2308ce7e4..3afb38cfb 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -5,6 +5,7 @@ from . import util from .deprecated import resolve_model_name from .cli.info import info from .glossary import explain +from .about import __version__ from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja From 0ae3807d7df39b70cc45fc973b84701d9c4f9e25 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 13:53:48 +0200 Subject: [PATCH 088/195] Fix gaps in Lexeme API. 
Closes #1031 --- spacy/lexeme.pyx | 9 +++++++++ spacy/tests/regression/test_issue1031.py | 13 +++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 spacy/tests/regression/test_issue1031.py diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 05d8bddc6..dc0440486 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -159,6 +159,10 @@ cdef class Lexeme: def __get__(self): return self.c.id + property lex_id: + def __get__(self): + return self.c.id + property repvec: def __get__(self): raise AttributeError("lex.repvec has been renamed to lex.vector") @@ -173,6 +177,11 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.orth] + property text: + def __get__(self): + return self.vocab.strings[self.c.orth] + + property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x diff --git a/spacy/tests/regression/test_issue1031.py b/spacy/tests/regression/test_issue1031.py new file mode 100644 index 000000000..1ac14eb7b --- /dev/null +++ b/spacy/tests/regression/test_issue1031.py @@ -0,0 +1,13 @@ +from ...vocab import Vocab + +def test_lexeme_text(): + vocab = Vocab() + lex = vocab[u'the'] + assert lex.text == u'the' + + +def test_lexeme_lex_id(): + vocab = Vocab() + lex1 = vocab[u'the'] + lex2 = vocab[u'be'] + assert lex1.lex_id != lex2.lex_id From dfbc7e49de96c9e8980c89706d4889244d1f6e39 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 14:14:01 +0200 Subject: [PATCH 089/195] Add test for Issue #1207 --- spacy/tests/regression/test_issue1307.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 spacy/tests/regression/test_issue1307.py diff --git a/spacy/tests/regression/test_issue1307.py b/spacy/tests/regression/test_issue1307.py new file mode 100644 index 000000000..a71faebcb --- /dev/null +++ b/spacy/tests/regression/test_issue1307.py @@ -0,0 +1,25 @@ +from __future__ import unicode_literals +from ..util import get_doc +from ...vocab import Vocab +from ...en import English + + +def test_span_noun_chunks(): + vocab = Vocab(lang='en', tag_map=English.Defaults.tag_map) + words = "Employees are recruiting talented staffers from overseas .".split() + heads = [1, 1, 0, 1, -2, -1, -5] + deps = ['nsubj', 'aux', 'ROOT', 'nmod', 'dobj', 'adv', 'pobj'] + tags = ['NNS', 'VBP', 'VBG', 'JJ', 'NNS', 'IN', 'NN', '.'] + doc = get_doc(vocab, words=words, heads=heads, deps=deps, tags=tags) + doc.is_parsed = True + + noun_chunks = [np.text for np in doc.noun_chunks] + assert noun_chunks == ['Employees', 'talented staffers', 'overseas'] + + span = doc[0:4] + noun_chunks = [np.text for np in span.noun_chunks] + assert noun_chunks == ['Employees'] + + for sent in doc.sents: + noun_chunks = [np.text for np in sent.noun_chunks] + assert noun_chunks == ['Employees', 'talented staffers', 'overseas'] From d9b85675d79553b4435aef1140354161c3f5dc91 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 14:14:35 +0200 Subject: [PATCH 090/195] Rename regression test --- spacy/tests/regression/{test_issue1307.py => test_issue1207.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/tests/regression/{test_issue1307.py => test_issue1207.py} (100%) diff --git a/spacy/tests/regression/test_issue1307.py b/spacy/tests/regression/test_issue1207.py similarity index 100% rename from spacy/tests/regression/test_issue1307.py rename to spacy/tests/regression/test_issue1207.py From 9750a0128cf211dac80217eee38e41c38f2c761c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 
2017 14:14:57 +0200 Subject: [PATCH 091/195] Fix Span.noun_chunks. Closes #1207 --- spacy/tokens/span.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 09927ab4c..d8890addc 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -230,7 +230,7 @@ cdef class Span: # so it's okay once we have the Span objects. See Issue #375 spans = [] for start, end, label in self.doc.noun_chunks_iterator(self): - spans.append(Span(self, start, end, label=label)) + spans.append(Span(self.doc, start, end, label=label)) for span in spans: yield span From 23a55b40ca8af1af588b6cbf5504b8d87e3b91d5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 14:15:25 +0200 Subject: [PATCH 092/195] Default to English noun chunks iterator if no lang set --- spacy/syntax/iterators.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index b0d1c78ca..0fe724622 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -117,4 +117,5 @@ def es_noun_chunks(obj): token = next_token(token) -CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks} +CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks, + None: english_noun_chunks, '': english_noun_chunks} From e3f23f9d910b0fa0e5c71b5b4c5c2a243fe66e60 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 14:57:51 +0200 Subject: [PATCH 093/195] Use latest available version in examples --- website/docs/usage/models.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 30863720c..42a3c0bbf 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -67,7 +67,7 @@ p python -m spacy download en_core_web_md # download exact model version (doesn't create shortcut link) - python -m spacy download en_core_web_md-1.2.0 --direct + python -m spacy download en_core_web_md-1.2.1 --direct p | The download command will #[+a("#download-pip") install the model] via @@ -96,10 +96,10 @@ p +code(false, "bash"). # with external URL - pip install #{gh("spacy-models")}/releases/download/en_core_web_md-1.2.0/en_core_web_md-1.2.0.tar.gz + pip install #{gh("spacy-models")}/releases/download/en_core_web_md-1.2.1/en_core_web_md-1.2.1.tar.gz # with local file - pip install /Users/you/en_core_web_md-1.2.0.tar.gz + pip install /Users/you/en_core_web_md-1.2.1.tar.gz p | By default, this will install the model into your #[code site-packages] From b22b18a0199ae2856f8f6923fb0db1cebe74dbb5 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 15:02:15 +0200 Subject: [PATCH 094/195] Add notes on spacy.explain() to annotation docs --- website/docs/api/annotation.jade | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 8c6b8fb10..30080dfd9 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -38,6 +38,11 @@ p +h(2, "pos-tagging") Part-of-speech Tagging ++infobox("Tip: Understanding tags") + | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | description for the string representation of a tag. For example, + | #[code spacy.explain("RB")] will return "adverb". 
+ include _annotation/_pos-tags +h(2, "lemmatization") Lemmatization @@ -65,10 +70,20 @@ p +h(2, "dependency-parsing") Syntactic Dependency Parsing ++infobox("Tip: Understanding labels") + | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | description for the string representation of a label. For example, + | #[code spacy.explain("prt")] will return "particle". + include _annotation/_dep-labels +h(2, "named-entities") Named Entity Recognition ++infobox("Tip: Understanding entity types") + | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | description for the string representation of an entity label. For example, + | #[code spacy.explain("LANGUAGE")] will return "any named language". + include _annotation/_named-entities +h(2, "json-input") JSON input format for training From 96df9c7154b7967a145423200be62fa245039e8b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 22 Jul 2017 15:05:46 +0200 Subject: [PATCH 095/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index bfdbf5c4f..995f6901f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -58,3 +58,4 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang) * Yasuaki Uechi, [@uetchy](https://github.com/uetchy) * Yubing Dong, [@tomtung](https://github.com/tomtung) +* Yuval Pinter, [@yuvalpinter](https://github.com/yuvalpinter) From 4b2e5e59eda15c5f60710acbfb8624f748a169fc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 15:06:50 +0200 Subject: [PATCH 096/195] Add flush_cache method to tokenizer, to fix #1061 The tokenizer caches output for common chunks, for efficiency. This cache is be invalidated when the tokenizer rules change, e.g. when a new special-case rule is introduced. That's what was causing #1061. When the cache is flushed, we free the intermediate token chunks. I *think* this is safe --- but if we start getting segfaults, this patch is to blame. The resolution would be to simply not free those bits of memory. They'll be freed when the tokenizer exits anyway. --- spacy/tests/regression/test_issue1061.py | 27 ++++++++++++++ spacy/tokenizer.pyx | 46 +++++++++++++++++++++--- 2 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 spacy/tests/regression/test_issue1061.py diff --git a/spacy/tests/regression/test_issue1061.py b/spacy/tests/regression/test_issue1061.py new file mode 100644 index 000000000..821ca2bfc --- /dev/null +++ b/spacy/tests/regression/test_issue1061.py @@ -0,0 +1,27 @@ +from __future__ import unicode_literals + +from ...symbols import ORTH + +from ...vocab import Vocab +from ...en import English + + +def test_issue1061(): + '''Test special-case works after tokenizing. Was caching problem.''' + text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.' + tokenizer = English.Defaults.create_tokenizer() + doc = tokenizer(text) + assert 'MATH' in [w.text for w in doc] + assert '_MATH_' not in [w.text for w in doc] + + tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}]) + doc = tokenizer(text) + assert '_MATH_' in [w.text for w in doc] + assert 'MATH' not in [w.text for w in doc] + + # For sanity, check it works when pipeline is clean. 
+ tokenizer = English.Defaults.create_tokenizer() + tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}]) + doc = tokenizer(text) + assert '_MATH_' in [w.text for w in doc] + assert 'MATH' not in [w.text for w in doc] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index c094bea0d..276f0ef20 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -186,7 +186,13 @@ cdef class Tokenizer: cdef int _try_cache(self, hash_t key, Doc tokens) except -1: cached = <_Cached*>self._cache.get(key) if cached == NULL: - return False + # See 'flush_cache' below for hand-wringing about + # how to handle this. + cached = <_Cached*>self._specials.get(key) + if cached == NULL: + return False + else: + self._cache.set(key, cached) cdef int i if cached.is_lex: for i in range(cached.length): @@ -201,9 +207,15 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) - self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) + special_case = self._specials.get(orig_key) + if special_case is not NULL: + for i in range(special_case.length): + tokens.push_back(&special_case.data.tokens[i], False) + self._cache.set(orig_key, special_case) + else: + span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) + self._attach_tokens(tokens, span, &prefixes, &suffixes) + self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, @@ -389,5 +401,29 @@ cdef class Tokenizer: cached.data.tokens = self.vocab.make_fused_token(substrings) key = hash_string(string) self._specials.set(key, cached) - self._cache.set(key, cached) self._rules[string] = substrings + # After changing the tokenization rules, the previous tokenization + # may be stale. + self.flush_cache() + + def flush_cache(self): + '''Flush the tokenizer's cache. May not free memory immediately. + + This is called automatically after `add_special_case`, but if you + write to the prefix or suffix functions, you'll have to call this + yourself. You may also need to flush the tokenizer cache after + changing the lex_attr_getter functions. + ''' + cdef hash_t key + for key in self._cache.keys(): + special_case = self._specials.get(key) + # Don't free data shared with special-case rules + if special_case is not NULL: + continue + cached = <_Cached*>self._cache.get(key) + if cached is not NULL: + self.mem.free(cached) + self._cache = PreshMap(1000) + # We could here readd the data from specials --- but if we loop over + # a bunch of special-cases, we'll get a quadratic behaviour. The extra + # lookup isn't so bad? Tough to tell. 
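The patch above makes add_special_case() flush the tokenizer cache
automatically; per the new flush_cache() docstring, changes to the
prefix/suffix functions or lex_attr_getters still require an explicit call.
A minimal sketch of the behaviour the regression test exercises (the _MATH_
rule is taken from that test):

    # Sketch of the scenario covered by test_issue1061: a special case added
    # after the tokenizer has already cached a chunk containing it.
    from spacy.en import English
    from spacy.symbols import ORTH

    tokenizer = English.Defaults.create_tokenizer()
    text = 'I like _MATH_ even _MATH_ when _MATH_.'

    doc = tokenizer(text)              # '_MATH_' is split up and cached here
    tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
    doc = tokenizer(text)              # cache was flushed, the rule now applies
    assert '_MATH_' in [w.text for w in doc]

    # After modifying prefix/suffix search functions or lexical attribute
    # getters directly, flush the cache yourself:
    tokenizer.flush_cache()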
From d7560047c5038fb4bf8a3f3a52b7a02ab6e88b25 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 15:24:31 +0200 Subject: [PATCH 097/195] Fix version --- website/docs/api/annotation.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 30080dfd9..d4b01a819 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -39,7 +39,7 @@ p +h(2, "pos-tagging") Part-of-speech Tagging +infobox("Tip: Understanding tags") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | In spaCy v1.9+, you can also use #[code spacy.explain()] to get the | description for the string representation of a tag. For example, | #[code spacy.explain("RB")] will return "adverb". @@ -71,7 +71,7 @@ p +h(2, "dependency-parsing") Syntactic Dependency Parsing +infobox("Tip: Understanding labels") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | In spaCy v1.9+, you can also use #[code spacy.explain()] to get the | description for the string representation of a label. For example, | #[code spacy.explain("prt")] will return "particle". @@ -80,7 +80,7 @@ include _annotation/_dep-labels +h(2, "named-entities") Named Entity Recognition +infobox("Tip: Understanding entity types") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | In spaCy v1.9+, you can also use #[code spacy.explain()] to get the | description for the string representation of an entity label. For example, | #[code spacy.explain("LANGUAGE")] will return "any named language". From de25bad036c7ddcf30181e71c4c1750ff6b93c18 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 15:29:10 +0200 Subject: [PATCH 098/195] Use lower min version for requests dependency (fixes #1137) Ensure compatibility with docker-compose and other packages --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 20c587841..fe273ee53 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ six html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 -requests>=2.13.0,<3.0.0 +requests>=2.11.0,<3.0.0 regex==2017.4.5 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 From 7c4bf9994d23f5b07ebed24034b8d8eee2eaa6f6 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 15:40:12 +0200 Subject: [PATCH 099/195] Add note on requirements and preventing model re-downloads (closes #1143) --- website/docs/usage/models.jade | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 42a3c0bbf..2d0f83663 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -198,6 +198,37 @@ p nlp = en_core_web_md.load() doc = nlp(u'This is a sentence.') ++h(3, "models-download") Downloading and requiring model dependencies + +p + | spaCy's built-in #[+api("cli#download") #[code download]] command + | is mostly intended as a convenient, interactive wrapper. It performs + | compatibility checks and prints detailed error messages and warnings. + | However, if you're downloading models as part of an automated build + | process, this only adds an unecessary layer of complexity. If you know + | which models your application needs, you should be specifying them directly. 
+ ++aside("Prevent re-downloading models") + | If you're installing a model from a URL, pip will usually re-download and + | re-install the package, even if you already have a matching + | version installed. To prevent this, simply add #[code #egg=] and the + | package name after the URL, e.g. #[code #egg=en_core_web_sm] or + | #[code #egg=en_core_web_sm-1.2.0]. This tells pip which package and version + | you're trying to download, and will skip the package if a matching + | installation is found. + +p + | Because all models are valid Python packages, you can add them to your + | application's #[code requirements.txt]. If you're running your own + | internal PyPi installation, you can simply upload the models there. pip's + | #[+a("https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format") requirements file format] + | supports both package names to download via a PyPi server, as well as direct + | URLs. + ++code("requirements.txt", "text"). + spacy>=1.8.0,<2.0.0 + -e #{gh("spacy-models")}/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz#egg=en_core_web_sm-1.2.0 + +h(2, "own-models") Using your own models p From 796b2f4c1b49401f7cb490df174fe32f0186bc56 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 15:42:38 +0200 Subject: [PATCH 100/195] Remove print statements in tests --- spacy/tests/regression/test_issue693.py | 2 -- spacy/tests/regression/test_issue995.py | 1 - 2 files changed, 3 deletions(-) diff --git a/spacy/tests/regression/test_issue693.py b/spacy/tests/regression/test_issue693.py index e4d907716..5deeb3215 100644 --- a/spacy/tests/regression/test_issue693.py +++ b/spacy/tests/regression/test_issue693.py @@ -14,7 +14,5 @@ def test_issue693(EN): doc2 = EN(text2) chunks1 = [chunk for chunk in doc1.noun_chunks] chunks2 = [chunk for chunk in doc2.noun_chunks] - for word in doc1: - print(word.text, word.dep_, word.head.text) assert len(chunks1) == 2 assert len(chunks2) == 2 diff --git a/spacy/tests/regression/test_issue995.py b/spacy/tests/regression/test_issue995.py index 633e96fb5..108b434a2 100644 --- a/spacy/tests/regression/test_issue995.py +++ b/spacy/tests/regression/test_issue995.py @@ -15,7 +15,6 @@ def test_issue955(doc): '''Test that we don't have any nested noun chunks''' seen_tokens = set() for np in doc.noun_chunks: - print(np.text, np.root.text, np.root.dep_, np.root.tag_) for word in np: key = (word.i, word.text) assert key not in seen_tokens From d51d55bba673cbe35784589825ff88fd33bb1f73 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 15:43:16 +0200 Subject: [PATCH 101/195] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 8c0e0afd3..d34c6f948 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.8.2' +__version__ = '1.9.0' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' From 78fcf56dd5ce0beeebdcd58c0082f78d751ba206 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 15:57:58 +0200 Subject: [PATCH 102/195] Update version pin for regex library --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index fe273ee53..6f7d067a5 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -11,7 +11,7 @@ html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 requests>=2.11.0,<3.0.0 -regex==2017.4.5 +regex==2017.7.11 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 pip>=9.0.0,<10.0.0 diff --git a/setup.py b/setup.py index 89aaf8eba..61bd6b6bb 100755 --- a/setup.py +++ b/setup.py @@ -203,7 +203,7 @@ def setup_package(): 'ujson>=1.35', 'dill>=0.2,<0.3', 'requests>=2.13.0,<3.0.0', - 'regex==2017.4.5', + 'regex==2017.7.11', 'ftfy>=4.4.2,<5.0.0'], classifiers=[ 'Development Status :: 5 - Production/Stable', From 5494605689758238e92703fa759a2f56cbb00598 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Jul 2017 16:09:50 +0200 Subject: [PATCH 103/195] Fiddle with regex pin --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6f7d067a5..9d6f34133 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 requests>=2.11.0,<3.0.0 -regex==2017.7.11 +regex>=2017.4.1,<2017.12.1 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 pip>=9.0.0,<10.0.0 diff --git a/setup.py b/setup.py index 61bd6b6bb..1b127962b 100755 --- a/setup.py +++ b/setup.py @@ -203,7 +203,7 @@ def setup_package(): 'ujson>=1.35', 'dill>=0.2,<0.3', 'requests>=2.13.0,<3.0.0', - 'regex==2017.7.11', + 'regex>=2017.4.1,<2017.12.1', 'ftfy>=4.4.2,<5.0.0'], classifiers=[ 'Development Status :: 5 - Production/Stable', From 570964e67f0c7a12e64551cd4b71dca3c40b6ad8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 22 Jul 2017 16:20:19 +0200 Subject: [PATCH 104/195] Update README.rst --- README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 0f3efc146..9d52a6c9d 100644 --- a/README.rst +++ b/README.rst @@ -63,11 +63,12 @@ MIT license. đŸ’Ŧ Where to ask questions ========================== +Please understand that we won't be able to provide individual support via email. We also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. + ====================== === **Bug reports** `GitHub issue tracker`_ **Usage questions** `StackOverflow`_, `Gitter chat`_, `Reddit user group`_ **General discussion** `Gitter chat`_, `Reddit user group`_ -**Commercial support** contact@explosion.ai ====================== === .. _GitHub issue tracker: https://github.com/explosion/spaCy/issues From e349271506b66e4257e8c69e02c664bbb0442fda Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 18:29:30 +0200 Subject: [PATCH 105/195] Increment version --- website/_harp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index cb476541a..37a0b54dd 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -12,7 +12,7 @@ "COMPANY_URL": "https://explosion.ai", "DEMOS_URL": "https://demos.explosion.ai", - "SPACY_VERSION": "1.8", + "SPACY_VERSION": "1.9", "LATEST_NEWS": { "url": "/docs/usage/models", "title": "The first official Spanish model is here!" From 864cefd3b267e08a843703687fcd0b2587c8d080 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 18:29:55 +0200 Subject: [PATCH 106/195] Update README.rst --- README.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 9d52a6c9d..4efd5b1de 100644 --- a/README.rst +++ b/README.rst @@ -9,14 +9,14 @@ Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew, Chinese and Japanese. 
It's commercial open-source software, released under the MIT license. -⭐ī¸ **Test spaCy v2.0.0 alpha and the new models!** `Read the release notes here. `_ +⭐ī¸ **Test spaCy v2.0.0 alpha and the new models!** `Read the release notes. `_ -đŸ’Ģ **Version 1.8 out now!** `Read the release notes here. `_ +đŸ’Ģ **Version 1.9 out now!** `Read the release notes here. `_ .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square :target: https://travis-ci.org/explosion/spaCy :alt: Travis Build Status - + .. image:: https://img.shields.io/appveyor/ci/explosion/spacy/master.svg?style=flat-square :target: https://ci.appveyor.com/project/explosion/spacy :alt: Appveyor Build Status @@ -326,6 +326,7 @@ and ``--model`` are optional and enable additional tests: =========== ============== =========== Version Date Description =========== ============== =========== +`v1.9.0`_ ``2017-07-22`` Spanish model, alpha support for Norwegian & Japanese, and bug fixes `v1.8.2`_ ``2017-04-26`` French model and small improvements `v1.8.1`_ ``2017-04-23`` Saving, loading and training bug fixes `v1.8.0`_ ``2017-04-16`` Better NER training, saving and loading @@ -359,6 +360,7 @@ Version Date Description `v0.93`_ ``2015-09-22`` Bug fixes to word vectors =========== ============== =========== +.. _v1.9.0: https://github.com/explosion/spaCy/releases/tag/v1.9.0 .. _v1.8.2: https://github.com/explosion/spaCy/releases/tag/v1.8.2 .. _v1.8.1: https://github.com/explosion/spaCy/releases/tag/v1.8.1 .. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0 From 7e98a3613c4934709f3358594a928f476e2fa8f2 Mon Sep 17 00:00:00 2001 From: Gideon Dresdner Date: Sun, 6 Aug 2017 13:21:45 +0200 Subject: [PATCH 107/195] improve pipe, tee, izip explanation Use an example from an old issue https://github.com/explosion/spaCy/issues/172#issuecomment-183963403. --- website/docs/usage/processing-text.jade | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/processing-text.jade b/website/docs/usage/processing-text.jade index 4bd6132d2..600654f65 100644 --- a/website/docs/usage/processing-text.jade +++ b/website/docs/usage/processing-text.jade @@ -98,7 +98,8 @@ p | important metadata, e.g. a JSON document. To pair up the metadata | with the processed #[code Doc] object, you should use the tee | function to split the generator in two, and then #[code izip] the - | extra stream to the document stream. + | extra stream to the document stream. 
Here's an + | #[a(href="https://github.com/explosion/spaCy/issues/172#issuecomment-183963403")= "example"] +h(2, "own-annotations") Bringing your own annotations From d3b03f05441de59cfd45b7414d4aab6fd1b32242 Mon Sep 17 00:00:00 2001 From: Delirious Lettuce Date: Sun, 6 Aug 2017 21:31:39 -0600 Subject: [PATCH 108/195] Fix typos: * `auxillary` -> `auxiliary` * `consistute` -> `constitute` * `earlist` -> `earliest` * `prefered` -> `preferred` * `direcory` -> `directory` * `reuseable` -> `reusable` * `idiosyncracies` -> `idiosyncrasies` * `enviroment` -> `environment` * `unecessary` -> `unnecessary` * `yesteday` -> `yesterday` * `resouces` -> `resources` --- spacy/glossary.py | 4 ++-- website/docs/api/_annotation/_pos-tags.jade | 4 ++-- website/docs/api/features.jade | 2 +- website/docs/api/span.jade | 2 +- website/docs/usage/adding-languages.jade | 4 ++-- website/docs/usage/customizing-tokenizer.jade | 2 +- website/docs/usage/index.jade | 2 +- website/docs/usage/models.jade | 2 +- website/docs/usage/pos-tagging.jade | 2 +- website/docs/usage/saving-loading.jade | 2 +- 10 files changed, 13 insertions(+), 13 deletions(-) diff --git a/spacy/glossary.py b/spacy/glossary.py index 4df5264a6..ed1c22c21 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -60,7 +60,7 @@ GLOSSARY = { 'JJR': 'adjective, comparative', 'JJS': 'adjective, superlative', 'LS': 'list item marker', - 'MD': 'verb, modal auxillary', + 'MD': 'verb, modal auxiliary', 'NIL': 'missing tag', 'NN': 'noun, singular or mass', 'NNP': 'noun, proper singular', @@ -91,7 +91,7 @@ GLOSSARY = { 'NFP': 'superfluous punctuation', 'GW': 'additional word in multi-word expression', 'XX': 'unknown', - 'BES': 'auxillary "be"', + 'BES': 'auxiliary "be"', 'HVS': 'forms of "have"', diff --git a/website/docs/api/_annotation/_pos-tags.jade b/website/docs/api/_annotation/_pos-tags.jade index ea3a225bf..51db4f4e2 100644 --- a/website/docs/api/_annotation/_pos-tags.jade +++ b/website/docs/api/_annotation/_pos-tags.jade @@ -21,7 +21,7 @@ p +pos-row("$", "SYM", "SymType=currency", "symbol, currency") +pos-row("ADD", "X", "", "email") +pos-row("AFX", "ADJ", "Hyph=yes", "affix") - +pos-row("BES", "VERB", "", 'auxillary "be"') + +pos-row("BES", "VERB", "", 'auxiliary "be"') +pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating") +pos-row("CD", "NUM", "NumType=card", "cardinal number") +pos-row("DT", "DET", "determiner") @@ -35,7 +35,7 @@ p +pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative") +pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative") +pos-row("LS", "PUNCT", "NumType=ord", "list item marker") - +pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxillary") + +pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxiliary") +pos-row("NFP", "PUNCT", "", "superfluous punctuation") +pos-row("NIL", "", "", "missing tag") +pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass") diff --git a/website/docs/api/features.jade b/website/docs/api/features.jade index 018790145..21481cf65 100644 --- a/website/docs/api/features.jade +++ b/website/docs/api/features.jade @@ -18,7 +18,7 @@ p | consisting of the words to be processed. p - | Each state consists of the words on the stack (if any), which consistute + | Each state consists of the words on the stack (if any), which constitute | the current entity being constructed. We also have the current word, and | the two subsequent words. Finally, we also have the entities previously | built. 
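The processing-text change above defers to an external link for the tee/izip
pattern that keeps metadata aligned with nlp.pipe() output. A minimal sketch
of that pattern; the `records` input here is hypothetical.

    # Sketch: pair each Doc streaming out of nlp.pipe() with its metadata.
    from __future__ import print_function
    from itertools import tee
    try:
        from itertools import izip   # Python 2
    except ImportError:
        izip = zip                   # Python 3
    import spacy

    nlp = spacy.load('en')
    # Hypothetical (text, metadata) pairs, e.g. read from JSON lines.
    records = [('This is a text.', {'id': 1}),
               ('And another one.', {'id': 2})]

    texts, metadata = tee(records)   # split the stream in two
    texts = (text for text, meta in texts)
    metadata = (meta for text, meta in metadata)
    for doc, meta in izip(nlp.pipe(texts), metadata):
        print(meta['id'], len(doc), doc.is_parsed)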
diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index 770ee3e9b..d2d3d0f27 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -222,7 +222,7 @@ p The sentence span that this span is a part of. p | The token within the span that's highest in the parse tree. If there's a - | tie, the earlist is prefered. + | tie, the earliest is preferred. +table(["Name", "Type", "Description"]) +footrow diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 30c4486b0..7d893b4eb 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -28,7 +28,7 @@ p | #[a(href="#word-vectors") word vectors]. +item - | #[strong Set up] a #[a(href="#model-directory") model direcory] and #[strong train] the #[a(href="#train-tagger-parser") tagger and parser]. + | #[strong Set up] a #[a(href="#model-directory") model directory] and #[strong train] the #[a(href="#train-tagger-parser") tagger and parser]. p | For some languages, you may also want to develop a solution for @@ -303,7 +303,7 @@ p p | Because languages can vary in quite arbitrary ways, spaCy avoids | organising the language data into an explicit inheritance hierarchy. - | Instead, reuseable functions and data are collected as atomic pieces in + | Instead, reusable functions and data are collected as atomic pieces in | the #[code spacy.language_data] package. +aside-code("Example"). diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index 354a56c22..ca5be9ef1 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -21,7 +21,7 @@ p +h(2, "special-cases") Adding special case tokenization rules p - | Most domains have at least some idiosyncracies that require custom + | Most domains have at least some idiosyncrasies that require custom | tokenization rules. Here's how to add a special case rule to an existing | #[+api("tokenizer") #[code Tokenizer]] instance: diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index 9ad2fde5f..092c996b3 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -87,7 +87,7 @@ p | The other way to install spaCy is to clone its | #[+a(gh("spaCy")) GitHub repository] and build it from source. That is | the common way if you want to make changes to the code base. You'll need to - | make sure that you have a development enviroment consisting of a Python + | make sure that you have a development environment consisting of a Python | distribution including header files, a compiler, | #[+a("https://pip.pypa.io/en/latest/installing/") pip], | #[+a("https://virtualenv.pypa.io/") virtualenv] and diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 2d0f83663..4951ea211 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -205,7 +205,7 @@ p | is mostly intended as a convenient, interactive wrapper. It performs | compatibility checks and prints detailed error messages and warnings. | However, if you're downloading models as part of an automated build - | process, this only adds an unecessary layer of complexity. If you know + | process, this only adds an unnecessary layer of complexity. If you know | which models your application needs, you should be specifying them directly. 
+aside("Prevent re-downloading models") diff --git a/website/docs/usage/pos-tagging.jade b/website/docs/usage/pos-tagging.jade index cded00b6c..3f22ab43f 100644 --- a/website/docs/usage/pos-tagging.jade +++ b/website/docs/usage/pos-tagging.jade @@ -50,7 +50,7 @@ p +cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres] +row - +cell I read the paper yesteday + +cell I read the paper yesterday +cell read +cell read +cell verb diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 8978cce7a..56b218c29 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -58,7 +58,7 @@ p This command will create a model package directory that should look like this: p | You can also find templates for all files in our - | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. + | #[+a(gh("spacy-dev-resources", "templates/model")) spaCy dev resources]. | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of | #[code [language]_[name]] and #[code [language]_[name]-[version]]. The From 6e9e686568ab1f70d0b517e0d5f2bcbb894eb17a Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 8 Aug 2017 01:27:15 +0900 Subject: [PATCH 109/195] Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Janome for MeCab to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated.
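A minimal usage sketch of what this enables (not part of the diff below; it assumes MeCab and the mecab-python3 bindings are installed, and the calls mirror the code and tests added in this patch):

    from spacy.ja import Japanese

    nlp = Japanese()
    text = u'...'                        # placeholder: any Japanese sentence
    doc = nlp.make_doc(text)
    analysis = [(w.text, w.tag_, w.pos_) for w in doc]
    # w.tag_ holds the raw Unidic tag (plus the extra disambiguation field where needed),
    # w.pos_ holds the Universal Dependencies tag it maps to via TAG_MAP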
-POLM --- spacy/ja/__init__.py | 92 +++++++++++++++++++++++++++++---- spacy/ja/tag_map.py | 97 +++++++++++++++++++++++++++++------ spacy/tests/conftest.py | 5 +- spacy/tests/ja/test_tagger.py | 10 ++++ 4 files changed, 177 insertions(+), 27 deletions(-) create mode 100644 spacy/tests/ja/test_tagger.py diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 1c85ded95..5f49f0b1b 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -5,37 +5,111 @@ from os import path from ..language import Language, BaseDefaults from ..tokenizer import Tokenizer +from ..tagger import Tagger from ..attrs import LANG from ..tokens import Doc from .language_data import * +import re +from collections import namedtuple + +ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) + class JapaneseTokenizer(object): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) try: - from janome.tokenizer import Tokenizer + import MeCab except ImportError: - raise ImportError("The Japanese tokenizer requires the Janome library: " - "https://github.com/mocobeta/janome") - self.tokenizer = Tokenizer() + raise ImportError("The Japanese tokenizer requires the MeCab library: " + "https://github.com/SamuraiT/mecab-python3") + self.tokenizer = MeCab.Tagger() def __call__(self, text): - words = [x.surface for x in self.tokenizer.tokenize(text)] + words = [x.surface for x in detailed_tokens(self.tokenizer, text)] return Doc(self.vocab, words=words, spaces=[False]*len(words)) +def resolve_pos(token): + """If necessary, add a field to the POS tag for UD mapping. + + Under Universal Dependencies, sometimes the same Unidic POS tag can + be mapped differently depending on the literal token or its context + in the sentence. This function adds information to the POS tag to + resolve ambiguous mappings. + """ + + # NOTE: This is a first take. The rules here are crude approximations. + # For many of these, full dependencies are needed to properly resolve + # PoS mappings. + + if token.part_of_speech == 'é€ŖäŊ“芞,*,*,*': + # determiner-likes get DET, otherwise ADJ + if re.match('^[こそあお此å…ļåŊŧ]ぎ', token.surface): + return token.part_of_speech + ',DET' + else: + return token.part_of_speech + ',ADJ' + return token.part_of_speech + +def detailed_tokens(tokenizer, text): + """Format Mecab output into a nice data structure, based on Janome.""" + + node = tokenizer.parseToNode(text) + node = node.next # first node is beginning of sentence and empty, skip it + words = [] + while node.posid != 0: + parts = node.feature.split(',') + pos = ','.join(parts[0:4]) + reading = parts[6] + base = parts[7] + surface = parts[8] + + words.append( ShortUnitWord(surface, base, pos) ) + node = node.next + return words + +class JapaneseTagger(object): + def __init__(self, vocab): + try: + import MeCab + except ImportError: + raise ImportError("The Japanese tagger requires the MeCab library: " + "https://github.com/SamuraiT/mecab-python3") + + self.tagger = Tagger(vocab) + self.tokenizer = MeCab.Tagger() + + def __call__(self, tokens): + # two parts to this: + # 1. get raw JP tags + # 2. add features to tags as necessary for UD + + # TODO: if the text has been tokenized, this info is already available + # How to set the data when tokenizing or save it for the tagger to find? 
+ + dtokens = detailed_tokens(self.tokenizer, tokens.text) + rawtags = list(map(resolve_pos, dtokens)) + self.tagger.tag_from_strings(tokens, rawtags) + class JapaneseDefaults(BaseDefaults): + tag_map = TAG_MAP + @classmethod def create_tokenizer(cls, nlp=None): return JapaneseTokenizer(cls, nlp) + @classmethod + def create_tagger(cls, tokenizer): + return JapaneseTagger(tokenizer.vocab) + class Japanese(Language): lang = 'ja' Defaults = JapaneseDefaults def make_doc(self, text): - words = self.tokenizer(text) - return Doc(self.vocab, words=words, spaces=[False]*len(words)) - - + words = [str(t) for t in self.tokenizer(text)] + doc = Doc(self.vocab, words=words, spaces=[False]*len(words)) + tagger = JapaneseDefaults.create_tagger(self.tokenizer) + tagger(doc) + return doc diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index f5b6b5040..609739c2f 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -3,22 +3,85 @@ from __future__ import unicode_literals from ..symbols import * - TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB} + # Explanation of Unidic tags: + # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf + + # Universal Dependencies Mapping: + # http://universaldependencies.org/ja/overview/morphology.html + # http://universaldependencies.org/ja/pos/all.html + + "č¨˜åˇ,一čˆŦ,*,*":{POS: PUNCT}, # this includes characters used to represent sounds like ドãƒŦミ + "č¨˜åˇ,文字,*,*":{POS: PUNCT}, # this is for Greek and Latin characters used as sumbols, as in math + + "æ„Ÿå‹•čŠž,フã‚Ŗナãƒŧ,*,*": {POS: INTJ}, + "æ„Ÿå‹•čŠž,一čˆŦ,*,*": {POS: INTJ}, + + # this is specifically for unicode full-width space + "įŠēį™Ŋ,*,*,*": {POS: X}, + + "åŊĸįŠļ詞,一čˆŦ,*,*":{POS: ADJ}, + "åŊĸįŠļ詞,ã‚ŋãƒĒ,*,*":{POS: ADJ}, + "åŊĸįŠļ詞,åŠŠå‹•čŠžčĒžåšš,*,*":{POS: ADJ}, + "åŊĸåŽščŠž,一čˆŦ,*,*":{POS: ADJ}, + "åŊĸåŽščŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*":{POS: AUX}, # XXX ADJ if alone, AUX otherwise + + "åŠŠčŠž,æ ŧåŠŠčŠž,*,*":{POS: ADP}, + "åŠŠčŠž,äŋ‚åŠŠčŠž,*,*":{POS: ADP}, + "åŠŠčŠž,įĩ‚åŠŠčŠž,*,*":{POS: PART}, + "åŠŠčŠž,æē–äŊ“åŠŠčŠž,*,*":{POS: SCONJ}, # ぎ as in čĩ°ã‚‹ãŽãŒé€Ÿã„ + "åŠŠčŠž,æŽĨįļšåŠŠčŠž,*,*":{POS: SCONJ}, # verb ending ãĻ + "åŠŠčŠž,å‰¯åŠŠčŠž,*,*":{POS: PART}, # ばかり, つつ after a verb + "åŠŠå‹•čŠž,*,*,*":{POS: AUX}, + "æŽĨįļščŠž,*,*,*":{POS: SCONJ}, # XXX: might need refinement + + "æŽĨ頭辞,*,*,*":{POS: NOUN}, + "æŽĨå°žčžž,åŊĸįŠļ詞įš„,*,*":{POS: ADJ}, # ãŒãĄ, チック + "æŽĨå°žčžž,åŊĸåŽščŠžįš„,*,*":{POS: ADJ}, # -らしい + "æŽĨå°žčžž,å‹•čŠžįš„,*,*":{POS: NOUN}, # -じãŋ + "æŽĨå°žčžž,åčŠžįš„,ã‚ĩ変可čƒŊ,*":{POS: NOUN}, # XXX see åčŠž,æ™Žé€šåčŠž,ã‚ĩ変可čƒŊ,* + "æŽĨå°žčžž,åčŠžįš„,一čˆŦ,*":{POS: NOUN}, + "æŽĨå°žčžž,åčŠžįš„,åŠŠæ•°čŠž,*":{POS: NOUN}, + "æŽĨå°žčžž,åčŠžįš„,å‰¯čŠžå¯čƒŊ,*":{POS: NOUN}, # -垌, -過ぎ + + "äģŖåčŠž,*,*,*":{POS: PRON}, + "å‹•čŠž,一čˆŦ,*,*":{POS: VERB}, + "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*":{POS: AUX}, # XXX VERB if alone, AUX otherwise + "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*,AUX":{POS: AUX}, + "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*,VERB":{POS: VERB}, + "å‰¯čŠž,*,*,*":{POS: ADV}, + + "čŖœåŠŠč¨˜åˇ,īŧĄīŧĄ,一čˆŦ,*":{POS: SYM}, # text art + "čŖœåŠŠč¨˜åˇ,īŧĄīŧĄ,éĄ”æ–‡å­—,*":{POS: SYM}, # kaomoji + "čŖœåŠŠč¨˜åˇ,一čˆŦ,*,*":{POS: SYM}, + "čŖœåŠŠč¨˜åˇ,æ‹Ŧåŧ§é–‹,*,*":{POS: PUNCT}, # open bracket + 
"čŖœåŠŠč¨˜åˇ,æ‹Ŧåŧ§é–‰,*,*":{POS: PUNCT}, # close bracket + "čŖœåŠŠč¨˜åˇ,åĨį‚š,*,*":{POS: PUNCT}, # period or other EOS marker + "čŖœåŠŠč¨˜åˇ,čĒ­į‚š,*,*":{POS: PUNCT}, # comma + + "åčŠž,å›ēæœ‰åčŠž,一čˆŦ,*":{POS: PROPN}, # general proper noun + "åčŠž,å›ēæœ‰åčŠž,äēē名,一čˆŦ":{POS: PROPN}, # person's name + "åčŠž,å›ēæœ‰åčŠž,äēē名,姓":{POS: PROPN}, # surname + "åčŠž,å›ēæœ‰åčŠž,äēē名,名":{POS: PROPN}, # first name + "åčŠž,å›ēæœ‰åčŠž,地名,一čˆŦ":{POS: PROPN}, # place name + "åčŠž,å›ēæœ‰åčŠž,地名,å›Ŋ":{POS: PROPN}, # country name + + "åčŠž,åŠŠå‹•čŠžčĒžåšš,*,*":{POS: AUX}, + "åčŠž,æ•°čŠž,*,*":{POS: NUM}, # includes Chinese numerals + + "åčŠž,æ™Žé€šåčŠž,ã‚ĩ変可čƒŊ,*":{POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun + "åčŠž,æ™Žé€šåčŠž,ã‚ĩ変可čƒŊ,*,NOUN":{POS: NOUN}, + "åčŠž,æ™Žé€šåčŠž,ã‚ĩ変可čƒŊ,*,VERB":{POS: VERB}, + + "åčŠž,æ™Žé€šåčŠž,ã‚ĩ変åŊĸįŠļčŠžå¯čƒŊ,*":{POS: NOUN}, # ex: 下手 + "åčŠž,æ™Žé€šåčŠž,一čˆŦ,*":{POS: NOUN}, + "åčŠž,æ™Žé€šåčŠž,åŊĸįŠļčŠžå¯čƒŊ,*":{POS: NOUN}, # XXX: sometimes ADJ in UDv2 + "åčŠž,æ™Žé€šåčŠž,åŊĸįŠļčŠžå¯čƒŊ,*,NOUN":{POS: NOUN}, + "åčŠž,æ™Žé€šåčŠž,åŊĸįŠļčŠžå¯čƒŊ,*,ADJ":{POS: ADJ}, + "åčŠž,æ™Žé€šåčŠž,åŠŠæ•°čŠžå¯čƒŊ,*":{POS: NOUN}, # counter / unit + "åčŠž,æ™Žé€šåčŠž,å‰¯čŠžå¯čƒŊ,*":{POS: NOUN}, + + "é€ŖäŊ“芞,*,*,*":{POS: ADJ}, # XXX note こぎ、そぎ etc. should be DET + "é€ŖäŊ“芞,*,*,*,ADJ":{POS: ADJ}, + "é€ŖäŊ“芞,*,*,*,DET":{POS: DET}, } diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 6e00b1513..52b9bdd57 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -79,9 +79,12 @@ def fi_tokenizer(): @pytest.fixture def ja_tokenizer(): - janome = pytest.importorskip("janome") + pytest.importorskip("MeCab") return Japanese.Defaults.create_tokenizer() +@pytest.fixture +def japanese(): + return Japanese() @pytest.fixture def sv_tokenizer(): diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py new file mode 100644 index 000000000..43259fb49 --- /dev/null +++ b/spacy/tests/ja/test_tagger.py @@ -0,0 +1,10 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +def test_japanese_tagger(japanese): + doc = japanese.make_doc("ã“ãŽãƒ•ã‚Ąã‚¤ãƒĢãĢは小さãĒテ゚トがå…ĨãŖãĻいるよ") + # note these both have the same raw tag, 'é€ŖäŊ“芞,*,*,*' + assert doc[0].pos_ == "DET" + assert doc[4].pos_ == "ADJ" From e3738aba0dc562cdd87133fdfd58a9741b0c08f2 Mon Sep 17 00:00:00 2001 From: Kevin Marsh Date: Tue, 15 Aug 2017 21:50:09 +0100 Subject: [PATCH 110/195] Fix broken tutorial link on website --- website/docs/usage/_data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 703a185d6..c2ce271aa 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -313,7 +313,7 @@ "author": "Clark Grubb" }, "A very (very) short primer on spacy.io": { - "url": "http://blog.milonimrod.com/2015/10/a-very-very-short-primer-on-spacyio.html", + "url": "https://web.archive.org/web/20161219095416/http://blog.milonimrod.com/2015/10/a-very-very-short-primer-on-spacyio.html", "author": "Nimrod Milo " } }, From 234a8a75917aa01c48e06ed4767d6f47cdfead22 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 21 Aug 2017 00:21:45 +0900 Subject: [PATCH 111/195] =?UTF-8?q?Change=20default=20tag=20for=20?= =?UTF-8?q?=E5=8B=95=E8=A9=9E,=E9=9D=9E=E8=87=AA=E7=AB=8B=E5=8F=AF?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Example 
of this is いる in these sentences: åŊŧはそこãĢいる。# should be VERB åŊŧはåē•ãĢįĢ‹ãŖãĻいる。# should be AUX Unclear which case is more numerous - need to check a large corpus - but in keeping with the other ambiguous tags, this is mapped to the "dominant" or first part of the tag. -POLM --- spacy/ja/tag_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index 609739c2f..8436f07ff 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -46,7 +46,7 @@ TAG_MAP = { "äģŖåčŠž,*,*,*":{POS: PRON}, "å‹•čŠž,一čˆŦ,*,*":{POS: VERB}, - "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*":{POS: AUX}, # XXX VERB if alone, AUX otherwise + "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*":{POS: VERB}, # XXX VERB if alone, AUX otherwise "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*,AUX":{POS: AUX}, "å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*,VERB":{POS: VERB}, "å‰¯čŠž,*,*,*":{POS: ADV}, From c5c3f4c7d9a9c715110040d1e75e08d8a7b8dc20 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 21 Aug 2017 16:08:40 +0200 Subject: [PATCH 112/195] Use more generous .env ignore rule --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2209f5b4a..84ced41f8 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ Profile.prof .python-version __pycache__/ *.py[cod] +.env*/ .env/ .env2/ .env3/ From edc596d9a77cf0281b3641297fd5abd62a74edf2 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 21 Aug 2017 16:11:36 +0200 Subject: [PATCH 113/195] Add missing tokenizer exceptions (resolves #1281) --- spacy/en/tokenizer_exceptions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index d9aa01734..29447314a 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -276,7 +276,10 @@ for verb_data in [ {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, {ORTH: "was", LEMMA: "be"}, - {ORTH: "were", LEMMA: "be"} + {ORTH: "were", LEMMA: "be"}, + {ORTH: "have"}, + {ORTH: "has", LEMMA: "have"}, + {ORTH: "dare"} ]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() From dcff10abe98c844f2f66ff22835c9eb8ea8e7138 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 21 Aug 2017 16:11:47 +0200 Subject: [PATCH 114/195] Add regression test for #1281 --- spacy/tests/regression/test_issue1281.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 spacy/tests/regression/test_issue1281.py diff --git a/spacy/tests/regression/test_issue1281.py b/spacy/tests/regression/test_issue1281.py new file mode 100644 index 000000000..17307b1d6 --- /dev/null +++ b/spacy/tests/regression/test_issue1281.py @@ -0,0 +1,13 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', [ + "She hasn't done the housework.", + "I haven't done it before.", + "you daren't do that"]) +def test_issue1281(en_tokenizer, text): + tokens = en_tokenizer(text) + assert tokens[2].text == "n't" From c435f748d743b1ee407c02c14223679769fa52b2 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 00:01:28 +0900 Subject: [PATCH 115/195] Put Mecab import in utility function --- spacy/ja/__init__.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 5f49f0b1b..c82591f58 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -16,14 +16,21 @@ from collections import namedtuple ShortUnitWord = 
namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) +def try_mecab_import(): + """Mecab is required for Japanese support, so check for it. + + It it's not available blow up and explain how to fix it.""" + try: + import MeCab + return MeCab + except ImportError: + raise ImportError("Japanese support requires MeCab: " + "https://github.com/SamuraiT/mecab-python3") + class JapaneseTokenizer(object): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - try: - import MeCab - except ImportError: - raise ImportError("The Japanese tokenizer requires the MeCab library: " - "https://github.com/SamuraiT/mecab-python3") + MeCab = try_mecab_import() self.tokenizer = MeCab.Tagger() def __call__(self, text): @@ -70,12 +77,7 @@ def detailed_tokens(tokenizer, text): class JapaneseTagger(object): def __init__(self, vocab): - try: - import MeCab - except ImportError: - raise ImportError("The Japanese tagger requires the MeCab library: " - "https://github.com/SamuraiT/mecab-python3") - + MeCab = try_mecab_import() self.tagger = Tagger(vocab) self.tokenizer = MeCab.Tagger() From 53e17296e98ba8db1b9b99fec0a39aaa56d12e5c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 00:01:49 +0900 Subject: [PATCH 116/195] Fix pronoun handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Missed this case earlier. é€ŖäŊ“芞 have three classes for UD purposes: - そぎ -> DET - それ -> PRON - 同じ -> ADJ -POLM --- spacy/ja/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index c82591f58..8cd48ad84 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -51,9 +51,10 @@ def resolve_pos(token): # PoS mappings. if token.part_of_speech == 'é€ŖäŊ“芞,*,*,*': - # determiner-likes get DET, otherwise ADJ if re.match('^[こそあお此å…ļåŊŧ]ぎ', token.surface): return token.part_of_speech + ',DET' + if re.match('^[こそあお此å…ļåŊŧ]', token.surface): + return token.part_of_speech + ',PRON' else: return token.part_of_speech + ',ADJ' return token.part_of_speech From adfd98731655cc3f351e0042353ea850ef7d23c2 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 00:02:55 +0900 Subject: [PATCH 117/195] Update the TAG_MAP --- spacy/ja/__init__.py | 3 --- spacy/ja/tag_map.py | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 8cd48ad84..dfd0bca5b 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -87,9 +87,6 @@ class JapaneseTagger(object): # 1. get raw JP tags # 2. add features to tags as necessary for UD - # TODO: if the text has been tokenized, this info is already available - # How to set the data when tokenizing or save it for the tagger to find? - dtokens = detailed_tokens(self.tokenizer, tokens.text) rawtags = list(map(resolve_pos, dtokens)) self.tagger.tag_from_strings(tokens, rawtags) diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index 8436f07ff..191865ed2 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -81,7 +81,8 @@ TAG_MAP = { "åčŠž,æ™Žé€šåčŠž,åŠŠæ•°čŠžå¯čƒŊ,*":{POS: NOUN}, # counter / unit "åčŠž,æ™Žé€šåčŠž,å‰¯čŠžå¯čƒŊ,*":{POS: NOUN}, - "é€ŖäŊ“芞,*,*,*":{POS: ADJ}, # XXX note こぎ、そぎ etc. 
should be DET + "é€ŖäŊ“芞,*,*,*":{POS: ADJ}, # XXX this has exceptions based on literal token "é€ŖäŊ“芞,*,*,*,ADJ":{POS: ADJ}, + "é€ŖäŊ“芞,*,*,*,PRON":{POS: PRON}, "é€ŖäŊ“芞,*,*,*,DET":{POS: DET}, } From bcf2b9b4f5e12951394bbc2e77daf5a1763ec9e5 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 00:03:11 +0900 Subject: [PATCH 118/195] Update tagger & tokenizer tests Tagger is now parametrized and has two sentences with more tag coverage. The tokenizer tests are updated to reflect differences in tokenization between IPAdic and Unidic. -POLM --- spacy/tests/ja/test_tagger.py | 33 +++++++++++++++++++++++++++----- spacy/tests/ja/test_tokenizer.py | 4 ++-- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py index 43259fb49..629cc795f 100644 --- a/spacy/tests/ja/test_tagger.py +++ b/spacy/tests/ja/test_tagger.py @@ -3,8 +3,31 @@ from __future__ import unicode_literals import pytest -def test_japanese_tagger(japanese): - doc = japanese.make_doc("ã“ãŽãƒ•ã‚Ąã‚¤ãƒĢãĢは小さãĒテ゚トがå…ĨãŖãĻいるよ") - # note these both have the same raw tag, 'é€ŖäŊ“芞,*,*,*' - assert doc[0].pos_ == "DET" - assert doc[4].pos_ == "ADJ" +TAGGER_TESTS = [ + ('あれãĒらそこãĢあるよ', + (('äģŖåčŠž,*,*,*', 'PRON'), + ('åŠŠå‹•čŠž,*,*,*', 'AUX'), + ('äģŖåčŠž,*,*,*', 'PRON'), + ('åŠŠčŠž,æ ŧåŠŠčŠž,*,*', 'ADP'), + ('å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*', 'VERB'), + ('åŠŠčŠž,įĩ‚åŠŠčŠž,*,*', 'PART'))), + ('ã“ãŽãƒ•ã‚Ąã‚¤ãƒĢãĢは小さãĒテ゚トがå…ĨãŖãĻいるよ', + (('é€ŖäŊ“芞,*,*,*,DET', 'DET'), + ('åčŠž,æ™Žé€šåčŠž,ã‚ĩ変可čƒŊ,*', 'NOUN'), + ('åŠŠčŠž,æ ŧåŠŠčŠž,*,*', 'ADP'), + ('åŠŠčŠž,äŋ‚åŠŠčŠž,*,*', 'ADP'), + ('é€ŖäŊ“芞,*,*,*,ADJ', 'ADJ'), + ('åčŠž,æ™Žé€šåčŠž,ã‚ĩ変可čƒŊ,*', 'NOUN'), + ('åŠŠčŠž,æ ŧåŠŠčŠž,*,*', 'ADP'), + ('å‹•čŠž,一čˆŦ,*,*', 'VERB'), + ('åŠŠčŠž,æŽĨįļšåŠŠčŠž,*,*', 'SCONJ'), + ('å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*', 'VERB'), + ('åŠŠčŠž,įĩ‚åŠŠčŠž,*,*', 'PART'))) +] + +@pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS) +def test_japanese_tagger(japanese, text, expected_tags): + tokens = japanese.make_doc(text) + assert len(tokens) == len(expected_tags) + for token, res in zip(tokens, expected_tags): + assert token.tag_ == res[0] and token.pos_ == res[1] diff --git a/spacy/tests/ja/test_tokenizer.py b/spacy/tests/ja/test_tokenizer.py index 58700b353..17411aee2 100644 --- a/spacy/tests/ja/test_tokenizer.py +++ b/spacy/tests/ja/test_tokenizer.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import pytest TOKENIZER_TESTS = [ - ("æ—ĨæœŦčĒžã ã‚ˆ", ['æ—ĨæœŦčĒž', 'だ', 'よ']), + ("æ—ĨæœŦčĒžã ã‚ˆ", ['æ—ĨæœŦ', 'čĒž', 'だ', 'よ']), ("æąäēŦã‚ŋワãƒŧぎčŋ‘くãĢäŊã‚“でいぞす。", ['æąäēŦ', 'ã‚ŋワãƒŧ', 'ぎ', 'čŋ‘く', 'ãĢ', 'äŊã‚“', 'で', 'い', 'ぞす', '。']), ("吞čŧŠã¯įŒĢである。", ['吞čŧŠ', 'は', 'įŒĢ', 'で', 'ある', '。']), - ("月ãĢäģŖわãŖãĻ、おäģ•įŊŽãã‚ˆ!", ['月', 'ãĢ', 'äģŖわãŖ', 'ãĻ', '、', 'おäģ•įŊŽã', 'よ', '!']), + ("月ãĢäģŖわãŖãĻ、おäģ•įŊŽãã‚ˆ!", ['月', 'ãĢ', 'äģŖわãŖ', 'ãĻ', '、', 'お', 'äģ•įŊŽã', 'よ', '!']), ("ã™ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ã‚‚ãŽã†ãĄ", ['すもも', 'も', 'もも', 'も', 'もも', 'ぎ', 'ã†ãĄ']) ] From 95050201ce095e2328be383beec3025a5e64fb0a Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 22 Aug 2017 21:30:59 +0900 Subject: [PATCH 119/195] Add importorskip for Japanese fixture --- spacy/tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 52b9bdd57..5fad6e429 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -84,6 +84,7 @@ def ja_tokenizer(): @pytest.fixture def japanese(): 
+ pytest.importorskip("MeCab") return Japanese() @pytest.fixture From 884ba168a88699bedecf55888b670cbf2040a539 Mon Sep 17 00:00:00 2001 From: Jeffrey Gerard Date: Wed, 23 Aug 2017 21:18:53 -0700 Subject: [PATCH 120/195] Capture more noun chunks --- spacy/syntax/iterators.pyx | 2 +- spacy/tests/parser/test_noun_chunks.py | 30 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index 0fe724622..14dba5f9b 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -9,7 +9,7 @@ def english_noun_chunks(obj): Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'dative', 'appos', 'attr', 'ROOT'] doc = obj.doc # Ensure works on both Doc and Span. np_deps = [doc.vocab.strings[label] for label in labels] diff --git a/spacy/tests/parser/test_noun_chunks.py b/spacy/tests/parser/test_noun_chunks.py index 5e8c7659a..ddebca8b8 100644 --- a/spacy/tests/parser/test_noun_chunks.py +++ b/spacy/tests/parser/test_noun_chunks.py @@ -47,6 +47,36 @@ def test_parser_noun_chunks_pp_chunks(en_tokenizer): assert chunks[1].text_with_ws == "another phrase " +def test_parser_noun_chunks_appositional_modifiers(en_tokenizer): + text = "Sam, my brother, arrived to the house." + heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4] + tags = ['NNP', ',', 'PRP$', 'NN', ',', 'VBD', 'IN', 'DT', 'NN', '.'] + deps = ['nsubj', 'punct', 'poss', 'appos', 'punct', 'ROOT', 'prep', 'det', 'pobj', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) + chunks = list(doc.noun_chunks) + assert len(chunks) == 3 + assert chunks[0].text_with_ws == "Sam " + assert chunks[1].text_with_ws == "my brother " + assert chunks[2].text_with_ws == "the house " + + +def test_parser_noun_chunks_dative(en_tokenizer): + text = "She gave Bob a raise." + heads = [1, 0, -1, 1, -3, -4] + tags = ['PRP', 'VBD', 'NNP', 'DT', 'NN', '.'] + deps = ['nsubj', 'ROOT', 'dative', 'det', 'dobj', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) + chunks = list(doc.noun_chunks) + assert len(chunks) == 3 + assert chunks[0].text_with_ws == "She " + assert chunks[1].text_with_ws == "Bob " + assert chunks[2].text_with_ws == "a raise " + + def test_parser_noun_chunks_standard_de(de_tokenizer): text = "Eine Tasse steht auf dem Tisch." heads = [1, 1, 0, -1, 1, -2, -4] From 8b3e1f7b5b2d29ca3b70e5681daa095574b694be Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 29 Aug 2017 23:58:42 +0900 Subject: [PATCH 121/195] Handle out-of-vocab words Wasn't handling words out of the tokenizer dictionary vocabulary properly. This adds a fix and test for that. 
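Concretely: for a word that isn't in the tokenizer dictionary, MeCab's node.feature string can carry fewer comma-separated fields, so the old parts[6]/parts[7] lookups fail. In sketch form (parse_feature is a hypothetical name used only for illustration; the body mirrors the guard added in the diff below):

    def parse_feature(surface, feature):
        # hypothetical stand-alone helper; same logic as the hunk in detailed_tokens()
        base = surface                        # fall back to the surface form for OOV words
        parts = feature.split(',')
        pos = ','.join(parts[0:4])
        if len(parts) > 6:
            # reading and base form are only present for words in the tokenizer dictionary
            reading = parts[6]
            base = parts[7]
        return pos, base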
-POLM --- spacy/ja/__init__.py | 10 +++++++--- spacy/tests/ja/test_tagger.py | 7 ++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index dfd0bca5b..2f85406c0 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -66,11 +66,15 @@ def detailed_tokens(tokenizer, text): node = node.next # first node is beginning of sentence and empty, skip it words = [] while node.posid != 0: + surface = node.surface + base = surface parts = node.feature.split(',') pos = ','.join(parts[0:4]) - reading = parts[6] - base = parts[7] - surface = parts[8] + + if len(parts) > 6: + # this information is only available for words in the tokenizer dictionary + reading = parts[6] + base = parts[7] words.append( ShortUnitWord(surface, base, pos) ) node = node.next diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py index 629cc795f..85f653836 100644 --- a/spacy/tests/ja/test_tagger.py +++ b/spacy/tests/ja/test_tagger.py @@ -22,7 +22,12 @@ TAGGER_TESTS = [ ('å‹•čŠž,一čˆŦ,*,*', 'VERB'), ('åŠŠčŠž,æŽĨįļšåŠŠčŠž,*,*', 'SCONJ'), ('å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*', 'VERB'), - ('åŠŠčŠž,įĩ‚åŠŠčŠž,*,*', 'PART'))) + ('åŠŠčŠž,įĩ‚åŠŠčŠž,*,*', 'PART'))), + ('プププナãƒŗドãĢčĄŒããŸã„', + (('åčŠž,æ™Žé€šåčŠž,一čˆŦ,*', 'NOUN'), + ('åŠŠčŠž,æ ŧåŠŠčŠž,*,*', 'ADP'), + ('å‹•čŠž,非č‡ĒįĢ‹å¯čƒŊ,*,*', 'VERB'), + ('åŠŠå‹•čŠž,*,*,*', 'AUX'))) ] @pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS) From a6d9fb5bb65066887e5a7e5d44b078e722b2b002 Mon Sep 17 00:00:00 2001 From: Vimos Tan Date: Wed, 30 Aug 2017 14:49:14 +0800 Subject: [PATCH 122/195] fix issue #1292 --- .../tokenizer/test_customized_tokenizer.py | 46 +++++++++++++++++++ spacy/tokenizer.pyx | 3 +- 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/tokenizer/test_customized_tokenizer.py diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py new file mode 100644 index 000000000..97a7db64c --- /dev/null +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ... import load +from ...tokenizer import Tokenizer +from ... import util + +import pytest + + +def test_customized_tokenizer_handles_infixes(): + def custom_tokenizer(nlp_model): + prefix_re = util.compile_prefix_regex(nlp_model.Defaults.prefixes) + suffix_re = util.compile_suffix_regex(nlp_model.Defaults.suffixes) + custom_infixes = ['\.\.\.+', + '(?<=[0-9])-(?=[0-9])', + # '(?<=[0-9]+),(?=[0-9]+)', + '[0-9]+(,[0-9]+)+', + u'[\[\]!&:,()\*—–\/-]'] + + infix_re = util.compile_infix_regex(custom_infixes) + + # infix_re = re.compile(ur'[\[\]!&:,()]') + + tokenizer = Tokenizer(nlp_model.vocab, + nlp_model.Defaults.tokenizer_exceptions, + prefix_re.search, + suffix_re.search, + infix_re.finditer, + token_match=None) + return lambda text: tokenizer(text) + + nlp = load('en', create_make_doc=custom_tokenizer) + + sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion." + context = [word.text for word in nlp(sentence)] + assert context == [u'The', u'8', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', + u'for', + u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] + + # the trailing '-' may cause Assertion Error + sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion." 
+ context = [word.text for word in nlp(sentence)] + assert context == [u'The', u'8', u'-', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', + u'for', + u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 276f0ef20..799e4bdaa 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -312,7 +312,8 @@ cdef class Tokenizer: start = infix_end span = string[start:] - tokens.push_back(self.vocab.get(tokens.mem, span), False) + if span: + tokens.push_back(self.vocab.get(tokens.mem, span), False) cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): lexeme = deref(it) From 9bffcaa73df60794c63f428f5f83f06bd5a271e4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 1 Sep 2017 21:16:56 +0200 Subject: [PATCH 123/195] Update test to make it slightly more direct The `nlp` container should be unnecessary here. If so, we can test the tokenizer class just a little more directly. --- .../tokenizer/test_customized_tokenizer.py | 46 ++++++++----------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py index 97a7db64c..695f8c649 100644 --- a/spacy/tests/tokenizer/test_customized_tokenizer.py +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -1,46 +1,40 @@ # coding: utf-8 from __future__ import unicode_literals -from ... import load +from ...lang.en import English from ...tokenizer import Tokenizer from ... import util import pytest +@pytest.fixture +def tokenizer(en_vocab): + prefix_re = util.compile_prefix_regex(nlp_model.Defaults.prefixes) + suffix_re = util.compile_suffix_regex(nlp_model.Defaults.suffixes) + custom_infixes = ['\.\.\.+', + '(?<=[0-9])-(?=[0-9])', + # '(?<=[0-9]+),(?=[0-9]+)', + '[0-9]+(,[0-9]+)+', + u'[\[\]!&:,()\*—–\/-]'] -def test_customized_tokenizer_handles_infixes(): - def custom_tokenizer(nlp_model): - prefix_re = util.compile_prefix_regex(nlp_model.Defaults.prefixes) - suffix_re = util.compile_suffix_regex(nlp_model.Defaults.suffixes) - custom_infixes = ['\.\.\.+', - '(?<=[0-9])-(?=[0-9])', - # '(?<=[0-9]+),(?=[0-9]+)', - '[0-9]+(,[0-9]+)+', - u'[\[\]!&:,()\*—–\/-]'] - - infix_re = util.compile_infix_regex(custom_infixes) - - # infix_re = re.compile(ur'[\[\]!&:,()]') - - tokenizer = Tokenizer(nlp_model.vocab, - nlp_model.Defaults.tokenizer_exceptions, - prefix_re.search, - suffix_re.search, - infix_re.finditer, - token_match=None) - return lambda text: tokenizer(text) - - nlp = load('en', create_make_doc=custom_tokenizer) + infix_re = util.compile_infix_regex(custom_infixes) + return Tokenizer(en_vocab, + English.Defaults.tokenizer_exceptions, + prefix_re.search, + suffix_re.search, + infix_re.finditer, + token_match=None) +def test_customized_tokenizer_handles_infixes(tokenizer): sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion." - context = [word.text for word in nlp(sentence)] + context = [word.text for word in tokenizer(sentence)] assert context == [u'The', u'8', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', u'for', u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] # the trailing '-' may cause Assertion Error sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion." 
- context = [word.text for word in nlp(sentence)] + context = [word.text for word in tokenizer(sentence)] assert context == [u'The', u'8', u'-', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', u'for', u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] From d61c117081a57f7788e7e709abfd9adcd6e39df8 Mon Sep 17 00:00:00 2001 From: Eric Zhao Date: Sun, 3 Sep 2017 12:16:59 -0700 Subject: [PATCH 124/195] Lowest common ancestor matrix for spans and docs Added functionality for spans and docs to get lowest common ancestor matrix by simply calling: doc.get_lca_matrix() or doc[:3].get_lca_matrix(). Corresponding unit tests were also added under spacy/tests/doc and spacy/tests/spans. Designed to address: https://github.com/explosion/spaCy/issues/969. --- spacy/tests/doc/test_doc_api.py | 7 +++++ spacy/tests/spans/test_span.py | 11 +++++++ spacy/tokens/doc.pyx | 43 +++++++++++++++++++++++++++ spacy/tokens/span.pyx | 52 +++++++++++++++++++++++++++++++++ 4 files changed, 113 insertions(+) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 1bc534ecd..d1a6316d5 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -216,6 +216,13 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors): doc = en_tokenizer(text) assert doc.has_vector +def test_lowest_common_ancestor(en_tokenizer): + tokens = en_tokenizer('the lazy dog slept') + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0]) + lca = doc.get_lca_matrix() + assert(lca[1, 1] == 1) + assert(lca[0, 1] == 2) + assert(lca[1, 2] == 2) def test_parse_tree(en_tokenizer): """Tests doc.print_tree() method.""" diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index d22fa52ae..29aefe5c7 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -54,6 +54,17 @@ def test_spans_span_sent(doc): assert doc[6:7].sent.root.left_edge.text == 'This' +def test_spans_lca_matrix(en_tokenizer): + """Test span's lca matrix generation""" + tokens = en_tokenizer('the lazy dog slept') + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0]) + lca = doc[:2].get_lca_matrix() + assert(lca[0, 0] == 0) + assert(lca[0, 1] == -1) + assert(lca[1, 0] == -1) + assert(lca[1, 1] == 1) + + def test_spans_default_sentiment(en_tokenizer): """Test span.sentiment property's default averaging behaviour""" text = "good stuff bad stuff" diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ca5a3d696..aa888382e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -614,6 +614,49 @@ cdef class Doc: self.is_tagged = bool(TAG in attrs or POS in attrs) return self + + def get_lca_matrix(self): + ''' + Calculates the lowest common ancestor matrix + for a given Spacy doc. + Returns LCA matrix containing the integer index + of the ancestor, or -1 if no common ancestor is + found (ex if span excludes a necessary ancestor). + Apologies about the recursion, but the + impact on performance is negligible given + the natural limitations on the depth of a typical human sentence. 
+ ''' + + def __pairwise_lca(token_j, token_k, lca_matrix): + if lca_matrix[token_j.i][token_k.i] != -2: + return lca_matrix[token_j.i][token_k.i] + elif token_j == token_k: + lca_index = token_j.i + elif token_k.head == token_j: + lca_index = token_j.i + elif token_j.head == token_k: + lca_index = token_k.i + elif (token_j.head == token_j) and (token_k.head == token_k): + lca_index = -1 + else: + lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix) + lca_matrix[token_j.i][token_k.i] = lca_index + lca_matrix[token_k.i][token_j.i] = lca_index + + return lca_index + + lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) + lca_matrix.fill(-2) + for j in range(len(self)): + token_j = self[j] + for k in range(len(self)): + token_k = self[k] + lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix) + lca_matrix[k][j] = lca_matrix[j][k] + + return lca_matrix + + def to_bytes(self): """ Serialize, producing a byte string. diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index d8890addc..ae28f698a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -130,6 +130,58 @@ cdef class Span: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + def get_lca_matrix(self): + ''' + Calculates the lowest common ancestor matrix + for a given Spacy span. + Returns LCA matrix containing the integer index + of the ancestor, or -1 if no common ancestor is + found (ex if span excludes a necessary ancestor). + Apologies about the recursion, but the + impact on performance is negligible given + the natural limitations on the depth of a typical human sentence. + ''' + + def __pairwise_lca(token_j, token_k, lca_matrix, margins): + offset = margins[0] + token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k + token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j + token_j_i = token_j.i - offset + token_k_i = token_k.i - offset + + if lca_matrix[token_j_i][token_k_i] != -2: + return lca_matrix[token_j_i][token_k_i] + elif token_j == token_k: + lca_index = token_j_i + elif token_k_head == token_j: + lca_index = token_j_i + elif token_j_head == token_k: + lca_index = token_k_i + elif (token_j_head == token_j) and (token_k_head == token_k): + lca_index = -1 + else: + lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins) + + lca_matrix[token_j_i][token_k_i] = lca_index + lca_matrix[token_k_i][token_j_i] = lca_index + + return lca_index + + lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) + lca_matrix.fill(-2) + margins = [self.start, self.end] + + for j in range(len(self)): + token_j = self[j] + for k in range(len(self)): + token_k = self[k] + lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins) + lca_matrix[k][j] = lca_matrix[j][k] + + return lca_matrix + + + cpdef int _recalculate_indices(self) except -1: if self.end > self.doc.length \ or self.doc.c[self.start].idx != self.start_char \ From e8a26ebfabec51327b2948fba95d6fa87f77eaa5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 15:43:52 +0200 Subject: [PATCH 125/195] Add efficiency note to new get_lca_matrix() method --- spacy/tokens/doc.pyx | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index aa888382e..aca35a73f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -626,7 +626,14 @@ cdef class Doc: impact on performance is negligible given the natural 
limitations on the depth of a typical human sentence. ''' - + # Efficiency notes: + # + # We can easily improve the performance here by iterating in Cython. + # To loop over the tokens in Cython, the easiest way is: + # for token in doc.c[:doc.c.length]: + # head = token + token.head + # Both token and head will be TokenC* here. The token.head attribute + # is an integer offset. def __pairwise_lca(token_j, token_k, lca_matrix): if lca_matrix[token_j.i][token_k.i] != -2: return lca_matrix[token_j.i][token_k.i] @@ -649,7 +656,7 @@ cdef class Doc: lca_matrix.fill(-2) for j in range(len(self)): token_j = self[j] - for k in range(len(self)): + for k in range(j, len(self)): token_k = self[k] lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix) lca_matrix[k][j] = lca_matrix[j][k] From c68f188eb035ed67e2df905dd5e483f0261a8ace Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 18:59:36 +0200 Subject: [PATCH 126/195] Fix error on test --- spacy/tests/tokenizer/test_customized_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py index 695f8c649..19909ceba 100644 --- a/spacy/tests/tokenizer/test_customized_tokenizer.py +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from ...lang.en import English +from ...en import English from ...tokenizer import Tokenizer from ... import util From 45029a550e128e887fe1a6d826c04923991d98e2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 20:13:13 +0200 Subject: [PATCH 127/195] Fix customized-tokenizer tests --- spacy/tests/tokenizer/test_customized_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py index 19909ceba..855f3386c 100644 --- a/spacy/tests/tokenizer/test_customized_tokenizer.py +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -9,8 +9,8 @@ import pytest @pytest.fixture def tokenizer(en_vocab): - prefix_re = util.compile_prefix_regex(nlp_model.Defaults.prefixes) - suffix_re = util.compile_suffix_regex(nlp_model.Defaults.suffixes) + prefix_re = util.compile_prefix_regex(English.Defaults.prefixes) + suffix_re = util.compile_suffix_regex(English.Defaults.suffixes) custom_infixes = ['\.\.\.+', '(?<=[0-9])-(?=[0-9])', # '(?<=[0-9]+),(?=[0-9]+)', From 7692b8c071af51165c732474978b032ca85f262f Mon Sep 17 00:00:00 2001 From: Yu-chun Huang Date: Tue, 12 Sep 2017 16:23:47 +0800 Subject: [PATCH 128/195] Update __init__.py Set the "cut_all" parameter to False, or jieba will return ALL POSSIBLE word segmentations. --- spacy/zh/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/zh/__init__.py b/spacy/zh/__init__.py index 0f407dec6..bde0054b5 100644 --- a/spacy/zh/__init__.py +++ b/spacy/zh/__init__.py @@ -7,6 +7,6 @@ class Chinese(Language): def make_doc(self, text): import jieba - words = list(jieba.cut(text, cut_all=True)) + words = list(jieba.cut(text, cut_all=False)) words=[x for x in words if x] return Doc(self.vocab, words=words, spaces=[False]*len(words)) From 1f1f35dcd07d419a2aca449c0ef738e098e37b68 Mon Sep 17 00:00:00 2001 From: Yu-chun Huang Date: Tue, 19 Sep 2017 16:57:24 +0800 Subject: [PATCH 129/195] Add Chinese punctuation Add Chinese punctuation. 
--- spacy/language_data/punctuation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index f23b15bbc..fe636fa4b 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -19,11 +19,13 @@ _CURRENCY = r""" _QUOTES = r""" ' '' " ” “ `` ` ‘ ´ ‚ , „ Âģ ÂĢ +「 」 『 』 īŧˆ īŧ‰ 〔 〕 【 】 《 》 〈 〉 """ _PUNCT = r""" â€Ļ , : ; \! \? Âŋ ÂĄ \( \) \[ \] \{ \} < > _ # \* & +。īŧŸ īŧ īŧŒ 、 īŧ› īŧš īŊž """ From 188b439b25dbe020977761cc719efaf452e79423 Mon Sep 17 00:00:00 2001 From: Yu-chun Huang Date: Tue, 19 Sep 2017 16:58:42 +0800 Subject: [PATCH 130/195] Add Chinese punctuation Add Chinese punctuation. --- spacy/language_data/punctuation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index fe636fa4b..6229eff21 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -25,7 +25,7 @@ _QUOTES = r""" _PUNCT = r""" â€Ļ , : ; \! \? Âŋ ÂĄ \( \) \[ \] \{ \} < > _ # \* & -。īŧŸ īŧ īŧŒ 、 īŧ› īŧš īŊž +。 īŧŸ īŧ īŧŒ 、 īŧ› īŧš īŊž """ From 978b24ccd44a80f9ea2f8ae781e9b3a2164f68c4 Mon Sep 17 00:00:00 2001 From: Yam Date: Wed, 20 Sep 2017 23:02:22 +0800 Subject: [PATCH 131/195] Update punctuation.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Chinese, `~` and `——` is hyphens, `¡` is intermittent symbol --- spacy/language_data/punctuation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index 6229eff21..58ec73f2d 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -25,12 +25,12 @@ _QUOTES = r""" _PUNCT = r""" â€Ļ , : ; \! \? Âŋ ÂĄ \( \) \[ \] \{ \} < > _ # \* & -。 īŧŸ īŧ īŧŒ 、 īŧ› īŧš īŊž +。 īŧŸ īŧ īŧŒ 、 īŧ› īŧš īŊž ¡ """ _HYPHENS = r""" -- – — -- --- +- – — -- --- —— ~ """ From 44291f6697e3707c8730153c78cc547fc2e8f9e4 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 20 Sep 2017 23:26:34 +0700 Subject: [PATCH 132/195] add thai --- spacy/__init__.py | 5 +- spacy/th/__init__.py | 30 ++++++++++++ spacy/th/language_data.py | 25 ++++++++++ spacy/th/stop_words.py | 62 ++++++++++++++++++++++++ spacy/th/tag_map.py | 81 ++++++++++++++++++++++++++++++++ spacy/th/tokenizer_exceptions.py | 80 +++++++++++++++++++++++++++++++ 6 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 spacy/th/__init__.py create mode 100644 spacy/th/language_data.py create mode 100644 spacy/th/stop_words.py create mode 100644 spacy/th/tag_map.py create mode 100644 spacy/th/tokenizer_exceptions.py diff --git a/spacy/__init__.py b/spacy/__init__.py index 3afb38cfb..f0d5ea0fc 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -7,12 +7,13 @@ from .cli.info import info from .glossary import explain from .about import __version__ -from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja +from . 
import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja,th _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, - fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese) + fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese, + th.Thai) for _lang in _languages: diff --git a/spacy/th/__init__.py b/spacy/th/__init__.py new file mode 100644 index 000000000..0b6f8cf76 --- /dev/null +++ b/spacy/th/__init__.py @@ -0,0 +1,30 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .language_data import * +from ..language import Language, BaseDefaults +from ..attrs import LANG +from ..tokenizer import Tokenizer +from ..tokens import Doc +class ThaiDefaults(BaseDefaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'th' + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + tag_map = TAG_MAP + stop_words = set(STOP_WORDS) + + +class Thai(Language): + lang = 'th' + Defaults = ThaiDefaults + def make_doc(self, text): + try: + from pythainlp.tokenize import word_tokenize + except ImportError: + raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " + "https://github.com/wannaphongcom/pythainlp/") + words = [x for x in list(word_tokenize(text,"newmm"))] + return Doc(self.vocab, words=words, spaces=[False]*len(words)) + +__all__ = ['Thai'] \ No newline at end of file diff --git a/spacy/th/language_data.py b/spacy/th/language_data.py new file mode 100644 index 000000000..03800ba19 --- /dev/null +++ b/spacy/th/language_data.py @@ -0,0 +1,25 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# import base language data +from .. 
import language_data as base + + +# import util functions +from ..language_data import update_exc, strings_to_exc + + +# import language-specific data from files +#from .tag_map import TAG_MAP +from .tag_map import TAG_MAP +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS + + +TAG_MAP = dict(TAG_MAP) +STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) + +# export __all__ = ["TAG_MAP", "STOP_WORDS"] +__all__ = ["TAG_MAP", "STOP_WORDS","TOKENIZER_EXCEPTIONS"] \ No newline at end of file diff --git a/spacy/th/stop_words.py b/spacy/th/stop_words.py new file mode 100644 index 000000000..e13dec984 --- /dev/null +++ b/spacy/th/stop_words.py @@ -0,0 +1,62 @@ +# encoding: utf8 +from __future__ import unicode_literals + +# data from https://github.com/wannaphongcom/pythainlp/blob/dev/pythainlp/corpus/stopwords-th.txt +# stop words as whitespace-separated list +STOP_WORDS = set(""" +ā¸™ā¸ĩāš‰ ā¸™āšā¸˛ ā¸™ā¸ąāš‰ā¸™ ā¸™ā¸ąā¸ ā¸™ā¸­ā¸ā¸ˆā¸˛ā¸ ā¸—ā¸¸ā¸ ā¸—ā¸ĩāšˆā¸Ēā¸¸ā¸” ā¸—ā¸ĩāšˆ ā¸—āšā¸˛āšƒā¸Ģāš‰ ā¸—āšā¸˛ ā¸—ā¸˛ā¸‡ ā¸—ā¸ąāš‰ā¸‡ā¸™ā¸ĩāš‰ ā¸”ā¸ąā¸‡ ā¸‹ā¸ļāšˆā¸‡ ā¸Šāšˆā¸§ā¸‡ ā¸ˆā¸˛ā¸ ā¸ˆā¸ąā¸” ā¸ˆā¸° ā¸„ā¸ˇā¸­ ā¸„ā¸§ā¸˛ā¸Ą ā¸„ā¸Ŗā¸ąāš‰ā¸‡ ā¸„ā¸‡ ā¸‚ā¸ļāš‰ā¸™ ā¸‚ā¸­ā¸‡ +ā¸‚ā¸­ ā¸Ŗā¸ąā¸š ā¸Ŗā¸°ā¸Ģā¸§āšˆā¸˛ā¸‡ ā¸Ŗā¸§ā¸Ą ā¸ĸā¸ąā¸‡ ā¸Ąā¸ĩ ā¸Ąā¸˛ā¸ ā¸Ąā¸˛ ā¸žā¸Ŗāš‰ā¸­ā¸Ą ā¸žā¸š ā¸œāšˆā¸˛ā¸™ ā¸œā¸Ĩ ā¸šā¸˛ā¸‡ ā¸™āšˆā¸˛ āš€ā¸›ā¸´ā¸”āš€ā¸œā¸ĸ āš€ā¸›ā¸´ā¸” āš€ā¸™ā¸ˇāšˆā¸­ā¸‡ā¸ˆā¸˛ā¸ āš€ā¸”ā¸ĩā¸ĸā¸§ā¸ā¸ąā¸™ āš€ā¸”ā¸ĩā¸ĸā¸§ āš€ā¸Šāšˆā¸™ āš€ā¸‰ā¸žā¸˛ā¸° āš€ā¸‚āš‰ā¸˛ ā¸–āš‰ā¸˛ +ā¸–ā¸šā¸ ā¸–ā¸ļā¸‡ ā¸•āš‰ā¸­ā¸‡ ā¸•āšˆā¸˛ā¸‡āš† ā¸•āšˆā¸˛ā¸‡ ā¸•āšˆā¸­ ā¸•ā¸˛ā¸Ą ā¸•ā¸ąāš‰ā¸‡āšā¸•āšˆ ā¸•ā¸ąāš‰ā¸‡ ā¸”āš‰ā¸˛ā¸™ ā¸”āš‰ā¸§ā¸ĸ ā¸­ā¸ĩā¸ ā¸­ā¸˛ā¸ˆ ā¸­ā¸­ā¸ ā¸­ā¸ĸāšˆā¸˛ā¸‡ ā¸­ā¸°āš„ā¸Ŗ ā¸­ā¸ĸā¸šāšˆ ā¸­ā¸ĸā¸˛ā¸ ā¸Ģā¸˛ā¸ ā¸Ģā¸Ĩā¸˛ā¸ĸ ā¸Ģā¸Ĩā¸ąā¸‡ā¸ˆā¸˛ā¸ āšā¸•āšˆ āš€ā¸­ā¸‡ āš€ā¸Ģāš‡ā¸™ +āš€ā¸Ĩā¸ĸ āš€ā¸Ŗā¸´āšˆā¸Ą āš€ā¸Ŗā¸˛ āš€ā¸Ąā¸ˇāšˆā¸­ āš€ā¸žā¸ˇāšˆā¸­ āš€ā¸žā¸Ŗā¸˛ā¸° āš€ā¸›āš‡ā¸™ā¸ā¸˛ā¸Ŗ āš€ā¸›āš‡ā¸™ ā¸Ģā¸Ĩā¸ąā¸‡ ā¸Ģā¸Ŗā¸ˇā¸­ ā¸Ģā¸™ā¸ļāšˆā¸‡ ā¸Ēāšˆā¸§ā¸™ ā¸Ēāšˆā¸‡ ā¸Ēā¸¸ā¸” ā¸Ēāšā¸˛ā¸Ģā¸Ŗā¸ąā¸š ā¸§āšˆā¸˛ ā¸Ĩā¸‡ ā¸Ŗāšˆā¸§ā¸Ą ā¸Ŗā¸˛ā¸ĸ ā¸‚ā¸“ā¸° ā¸āšˆā¸­ā¸™ ā¸āš‡ ā¸ā¸˛ā¸Ŗ ā¸ā¸ąā¸š ā¸ā¸ąā¸™ +ā¸ā¸§āšˆā¸˛ ā¸ā¸Ĩāšˆā¸˛ā¸§ ā¸ˆā¸ļā¸‡ āš„ā¸§āš‰ āš„ā¸› āš„ā¸”āš‰ āšƒā¸Ģāš‰ āšƒā¸™ āš‚ā¸”ā¸ĸ āšā¸Ģāšˆā¸‡ āšā¸Ĩāš‰ā¸§ āšā¸Ĩā¸° āšā¸Ŗā¸ āšā¸šā¸š āš† ā¸—ā¸ąāš‰ā¸‡ ā¸§ā¸ąā¸™ āš€ā¸‚ā¸˛ āš€ā¸„ā¸ĸ āš„ā¸Ąāšˆ ā¸­ā¸ĸā¸˛ā¸ āš€ā¸ā¸´ā¸™ āš€ā¸ā¸´ā¸™āš† āš€ā¸ā¸ĩāšˆā¸ĸā¸§ā¸ā¸ąā¸™ āš€ā¸ā¸ĩāšˆā¸ĸā¸§ā¸ā¸ąā¸š +āš€ā¸ā¸ĩāšˆā¸ĸā¸§ā¸‚āš‰ā¸­ā¸‡ āš€ā¸ā¸ĩāšˆā¸ĸā¸§āš€ā¸™ā¸ˇāšˆā¸­ā¸‡ āš€ā¸ā¸ĩāšˆā¸ĸā¸§āš† āš€ā¸ā¸ˇā¸­ā¸š āš€ā¸ā¸ˇā¸­ā¸šā¸ˆā¸° āš€ā¸ā¸ˇā¸­ā¸šāš† āšā¸ āšā¸āšˆ āšā¸āš‰āš„ā¸‚ āšƒā¸ā¸Ĩāš‰ āšƒā¸ā¸Ĩāš‰āš† āš„ā¸ā¸Ĩ āš„ā¸ā¸Ĩāš† ā¸‚ā¸“ā¸°āš€ā¸”ā¸ĩā¸ĸā¸§ā¸ā¸ąā¸™ ā¸‚ā¸“ā¸°āšƒā¸” ā¸‚ā¸“ā¸°āšƒā¸”āš† ā¸‚ā¸“ā¸°ā¸—ā¸ĩāšˆ ā¸‚ā¸“ā¸°ā¸™ā¸ąāš‰ā¸™ ā¸‚ā¸“ā¸°ā¸™ā¸ĩāš‰ ā¸‚ā¸“ā¸°ā¸Ģā¸™ā¸ļāšˆā¸‡ ā¸‚ā¸§ā¸˛ā¸‡ +ā¸‚ā¸§ā¸˛ā¸‡āš† ā¸‚ā¸ąāš‰ā¸™ āšƒā¸„ā¸Ŗ āšƒā¸„ā¸Ŗāšˆ āšƒā¸„ā¸Ŗāšˆā¸ˆā¸° āšƒā¸„ā¸Ŗāš† ā¸‡āšˆā¸˛ā¸ĸ ā¸‡āšˆā¸˛ā¸ĸāš† āš„ā¸‡ ā¸ˆā¸‡ ā¸ˆā¸” ā¸ˆā¸™ ā¸ˆā¸™ā¸ā¸Ŗā¸°ā¸—ā¸ąāšˆā¸‡ ā¸ˆā¸™ā¸ā¸§āšˆā¸˛ ā¸ˆā¸™ā¸‚ā¸“ā¸°ā¸™ā¸ĩāš‰ ā¸ˆā¸™ā¸•ā¸Ĩā¸­ā¸” ā¸ˆā¸™ā¸–ā¸ļā¸‡ ā¸ˆā¸™ā¸—ā¸ąāšˆā¸§ ā¸ˆā¸™ā¸šā¸ąā¸”ā¸™ā¸ĩāš‰ ā¸ˆā¸™āš€ā¸Ąā¸ˇāšˆā¸­ ā¸ˆā¸™āšā¸Ąāš‰ ā¸ˆā¸™āšā¸Ąāš‰ā¸™ +ā¸ˆā¸Ŗā¸” ā¸ˆā¸Ŗā¸”ā¸ā¸ąā¸š ā¸ˆā¸Ŗā¸´ā¸‡ ā¸ˆā¸Ŗā¸´ā¸‡ā¸ˆā¸ąā¸‡ ā¸ˆā¸Ŗā¸´ā¸‡āš† ā¸ˆā¸Ŗā¸´ā¸‡āš†ā¸ˆā¸ąā¸‡āš† ā¸ˆā¸§ā¸™ ā¸ˆā¸§ā¸™ā¸ˆā¸° ā¸ˆā¸§ā¸™āš€ā¸ˆā¸ĩā¸ĸā¸™ ā¸ˆā¸§ā¸š ā¸‹ā¸ļāšˆā¸‡ā¸āš‡ ā¸‹ā¸ļāšˆā¸‡ā¸āš‡ā¸„ā¸ˇā¸­ ā¸‹ā¸ļāšˆā¸‡ā¸ā¸ąā¸™ ā¸‹ā¸ļāšˆā¸‡ā¸ā¸ąā¸™āšā¸Ĩā¸°ā¸ā¸ąā¸™ ā¸‹ā¸ļāšˆā¸‡āš„ā¸”āš‰āšā¸āšˆ 
ā¸‹ā¸ļāšˆā¸‡āš† ā¸“ ā¸”āš‰ā¸§ā¸ĸ ā¸”āš‰ā¸§ā¸ĸā¸ā¸ąā¸™ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Šāšˆā¸™ā¸ā¸ąā¸™ ā¸”āš‰ā¸§ā¸ĸā¸—ā¸ĩāšˆ ā¸”āš‰ā¸§ā¸ĸā¸›ā¸Ŗā¸°ā¸ā¸˛ā¸Ŗā¸‰ā¸°ā¸™ā¸ĩāš‰ +ā¸”āš‰ā¸§ā¸ĸāš€ā¸žā¸Ŗā¸˛ā¸° ā¸”āš‰ā¸§ā¸ĸā¸§āšˆā¸˛ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸—ā¸ĩāšˆ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸™ā¸ąāš‰ā¸™ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸™ā¸ĩāš‰ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸āš€ā¸žā¸Ŗā¸˛ā¸° ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸§āšˆā¸˛ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸Ąā¸ˇā¸­ā¸™ā¸ā¸ąā¸™ ā¸”ā¸ąāšˆā¸‡ ā¸”ā¸ąā¸‡ā¸ā¸Ĩāšˆā¸˛ā¸§ ā¸”ā¸ąā¸‡ā¸ā¸ąā¸š ā¸”ā¸ąāšˆā¸‡ā¸ā¸ąā¸š ā¸”ā¸ąā¸‡ā¸ā¸ąā¸šā¸§āšˆā¸˛ ā¸”ā¸ąāšˆā¸‡ā¸ā¸ąā¸šā¸§āšˆā¸˛ ā¸”ā¸ąā¸‡āš€ā¸āšˆā¸˛ +ā¸”ā¸ąāšˆā¸‡āš€ā¸āšˆā¸˛ ā¸”ā¸ąā¸‡āš€ā¸„ā¸ĸ āšƒā¸”āš† āš„ā¸”āš‰ āš„ā¸”āš‰āšā¸āšˆ āš„ā¸”āš‰āšā¸•āšˆ āš„ā¸”āš‰ā¸—ā¸ĩāšˆ āš„ā¸”āš‰ā¸Ąā¸˛ āš„ā¸”āš‰ā¸Ŗā¸ąā¸š ā¸•ā¸™ ā¸•ā¸™āš€ā¸­ā¸‡ ā¸•ā¸™ā¸¯ ā¸•ā¸Ŗā¸‡ ā¸•ā¸Ŗā¸‡āš† ā¸•ā¸Ĩā¸­ā¸” ā¸•ā¸Ĩā¸­ā¸”ā¸ā¸˛ā¸Ĩ ā¸•ā¸Ĩā¸­ā¸”ā¸ā¸˛ā¸Ĩā¸™ā¸˛ā¸™ ā¸•ā¸Ĩā¸­ā¸”ā¸ˆā¸™ ā¸•ā¸Ĩā¸­ā¸”ā¸–ā¸ļā¸‡ ā¸•ā¸Ĩā¸­ā¸”ā¸—ā¸ąāš‰ā¸‡ +ā¸•ā¸Ĩā¸­ā¸”ā¸—ā¸ąāšˆā¸§ ā¸•ā¸Ĩā¸­ā¸”ā¸—ā¸ąāšˆā¸§ā¸–ā¸ļā¸‡ ā¸•ā¸Ĩā¸­ā¸”ā¸—ā¸ąāšˆā¸§ā¸—ā¸ąāš‰ā¸‡ ā¸•ā¸Ĩā¸­ā¸”ā¸›ā¸ĩ ā¸•ā¸Ĩā¸­ā¸”āš„ā¸› ā¸•ā¸Ĩā¸­ā¸”ā¸Ąā¸˛ ā¸•ā¸Ĩā¸­ā¸”ā¸Ŗā¸°ā¸ĸā¸°āš€ā¸§ā¸Ĩā¸˛ ā¸•ā¸Ĩā¸­ā¸”ā¸§ā¸ąā¸™ ā¸•ā¸Ĩā¸­ā¸”āš€ā¸§ā¸Ĩā¸˛ ā¸•ā¸Ĩā¸­ā¸”ā¸¨ā¸ ā¸•āšˆā¸­ ā¸•āšˆā¸­ā¸ā¸ąā¸™ ā¸–ā¸ļā¸‡āšā¸āšˆ ā¸–ā¸ļā¸‡ā¸ˆā¸° ā¸–ā¸ļā¸‡ā¸šā¸ąā¸”ā¸™ā¸ąāš‰ā¸™ ā¸–ā¸ļā¸‡ā¸šā¸ąā¸”ā¸™ā¸ĩāš‰ +ā¸–ā¸ļā¸‡āš€ā¸Ąā¸ˇāšˆā¸­ ā¸–ā¸ļā¸‡āš€ā¸Ąā¸ˇāšˆā¸­āšƒā¸” ā¸–ā¸ļā¸‡āš€ā¸Ąā¸ˇāšˆā¸­āš„ā¸Ŗ ā¸–ā¸ļā¸‡āšā¸Ąāš‰ ā¸–ā¸ļā¸‡āšā¸Ąāš‰ā¸ˆā¸° ā¸–ā¸ļā¸‡āšā¸Ąāš‰ā¸§āšˆā¸˛ ā¸–ā¸ļā¸‡ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ŗ ā¸–ā¸ˇā¸­ ā¸–ā¸ˇā¸­ā¸§āšˆā¸˛ ā¸–ā¸šā¸ā¸•āš‰ā¸­ā¸‡ ā¸–ā¸šā¸āš† āš€ā¸–ā¸­ā¸° āš€ā¸–ā¸´ā¸” ā¸—ā¸Ŗā¸‡ ā¸—ā¸§āšˆā¸˛ ā¸—ā¸ąāš‰ā¸‡ā¸„ā¸™ ā¸—ā¸ąāš‰ā¸‡ā¸•ā¸ąā¸§ ā¸—ā¸ąāš‰ā¸‡ā¸—ā¸ĩ ā¸—ā¸ąāš‰ā¸‡ā¸—ā¸ĩāšˆ ā¸—ā¸ąāš‰ā¸‡ā¸™ā¸ąāš‰ā¸™ ā¸—ā¸ąāš‰ā¸‡ā¸™ā¸ąāš‰ā¸™ā¸”āš‰ā¸§ā¸ĸ ā¸—ā¸ąāš‰ā¸‡ā¸™ā¸ąāš‰ā¸™āš€ā¸žā¸Ŗā¸˛ā¸° +ā¸™ā¸­ā¸ ā¸™ā¸­ā¸ā¸ˆā¸˛ā¸ā¸—ā¸ĩāšˆ ā¸™ā¸­ā¸ā¸ˆā¸˛ā¸ā¸™ā¸ąāš‰ā¸™ ā¸™ā¸­ā¸ā¸ˆā¸˛ā¸ā¸™ā¸ĩāš‰ ā¸™ā¸­ā¸ā¸ˆā¸˛ā¸ā¸§āšˆā¸˛ ā¸™ā¸­ā¸ā¸™ā¸ąāš‰ā¸™ ā¸™ā¸­ā¸āš€ā¸Ģā¸™ā¸ˇā¸­ ā¸™ā¸­ā¸āš€ā¸Ģā¸™ā¸ˇā¸­ā¸ˆā¸˛ā¸ ā¸™āš‰ā¸­ā¸ĸ ā¸™āš‰ā¸­ā¸ĸā¸ā¸§āšˆā¸˛ ā¸™āš‰ā¸­ā¸ĸāš† ā¸™ā¸° ā¸™āšˆā¸° ā¸™ā¸ąā¸āš† ā¸™ā¸ąāšˆā¸™ ā¸™ā¸ąāšˆā¸™āš„ā¸‡ ā¸™ā¸ąāšˆā¸™āš€ā¸›āš‡ā¸™ ā¸™ā¸ąāšˆā¸™āšā¸Ģā¸Ĩā¸° +ā¸™ā¸ąāšˆā¸™āš€ā¸­ā¸‡ ā¸™ā¸ąāš‰ā¸™āš† ā¸™ā¸ąā¸š ā¸™ā¸ąā¸šā¸ˆā¸˛ā¸ā¸™ā¸ąāš‰ā¸™ ā¸™ā¸ąā¸šā¸ˆā¸˛ā¸ā¸™ā¸ĩāš‰ ā¸™ā¸ąā¸šā¸•ā¸ąāš‰ā¸‡āšā¸•āšˆ ā¸™ā¸ąā¸šāšā¸•āšˆ ā¸™ā¸ąā¸šāšā¸•āšˆā¸—ā¸ĩāšˆ ā¸™ā¸ąā¸šāšā¸•āšˆā¸™ā¸ąāš‰ā¸™ āš€ā¸›āš‡ā¸™ā¸•āš‰ā¸™ āš€ā¸›āš‡ā¸™ā¸•āš‰ā¸™āš„ā¸› āš€ā¸›āš‡ā¸™ā¸•āš‰ā¸™ā¸Ąā¸˛ āš€ā¸›āš‡ā¸™āšā¸•āšˆ āš€ā¸›āš‡ā¸™āšā¸•āšˆāš€ā¸žā¸ĩā¸ĸā¸‡ āš€ā¸›āš‡ā¸™ā¸—ā¸ĩ āš€ā¸›āš‡ā¸™ā¸—ā¸ĩāšˆ āš€ā¸›āš‡ā¸™ā¸—ā¸ĩāšˆā¸Ēā¸¸ā¸” āš€ā¸›āš‡ā¸™āš€ā¸žā¸Ŗā¸˛ā¸° +āš€ā¸›āš‡ā¸™āš€ā¸žā¸Ŗā¸˛ā¸°ā¸§āšˆā¸˛ āš€ā¸›āš‡ā¸™āš€ā¸žā¸ĩā¸ĸā¸‡ āš€ā¸›āš‡ā¸™āš€ā¸žā¸ĩā¸ĸā¸‡ā¸§āšˆā¸˛ āš€ā¸›āš‡ā¸™āš€ā¸žā¸ˇāšˆā¸­ āš€ā¸›āš‡ā¸™ā¸­ā¸ąā¸™ āš€ā¸›āš‡ā¸™ā¸­ā¸ąā¸™ā¸Ąā¸˛ā¸ āš€ā¸›āš‡ā¸™ā¸­ā¸ąā¸™ā¸§āšˆā¸˛ āš€ā¸›āš‡ā¸™ā¸­ā¸ąā¸™āš† āš€ā¸›āš‡ā¸™ā¸­ā¸˛ā¸—ā¸´ āš€ā¸›āš‡ā¸™āš† āš€ā¸›ā¸Ĩā¸ĩāšˆā¸ĸā¸™ āš€ā¸›ā¸Ĩā¸ĩāšˆā¸ĸā¸™āšā¸›ā¸Ĩā¸‡ āš€ā¸›ā¸´ā¸” āš€ā¸›ā¸´ā¸”āš€ā¸œā¸ĸ āš„ā¸›āšˆ ā¸œāšˆā¸˛ā¸™ ā¸œāšˆā¸˛ā¸™āš† +ā¸œā¸´ā¸” ā¸œā¸´ā¸”āš† ā¸œā¸šāš‰ āš€ā¸žā¸ĩā¸ĸā¸‡āš€ā¸žā¸ˇāšˆā¸­ āš€ā¸žā¸ĩā¸ĸā¸‡āš„ā¸Ŗ āš€ā¸žā¸ĩā¸ĸā¸‡āš„ā¸Ģā¸™ āš€ā¸žā¸ˇāšˆā¸­ā¸—ā¸ĩāšˆ āš€ā¸žā¸ˇāšˆā¸­ā¸—ā¸ĩāšˆā¸ˆā¸° āš€ā¸žā¸ˇāšˆā¸­ā¸§āšˆā¸˛ āš€ā¸žā¸ˇāšˆā¸­āšƒā¸Ģāš‰ ā¸ ā¸˛ā¸„ ā¸ ā¸˛ā¸„ā¸¯ ā¸ ā¸˛ā¸ĸ ā¸ ā¸˛ā¸ĸāšƒā¸•āš‰ ā¸ ā¸˛ā¸ĸā¸™ā¸­ā¸ ā¸ ā¸˛ā¸ĸāšƒā¸™ ā¸ ā¸˛ā¸ĸā¸ ā¸˛ā¸„ ā¸ ā¸˛ā¸ĸā¸ ā¸˛ā¸„ā¸Ģā¸™āš‰ā¸˛ ā¸ ā¸˛ā¸ĸā¸Ģā¸™āš‰ā¸˛ ā¸ ā¸˛ā¸ĸā¸Ģā¸Ĩā¸ąā¸‡ +ā¸Ąā¸­ā¸‡ ā¸Ąā¸­ā¸‡ā¸§āšˆā¸˛ ā¸Ąā¸ąā¸ ā¸Ąā¸ąā¸ā¸ˆā¸° ā¸Ąā¸ąā¸™ ā¸Ąā¸ąā¸™āš† ā¸Ąā¸ąāš‰ā¸ĸ ā¸Ąā¸ąāš‰ā¸ĸā¸™ā¸° 
ā¸Ąā¸ąāš‰ā¸ĸā¸™ā¸ąāšˆā¸™ ā¸Ąā¸ąāš‰ā¸ĸāš€ā¸™ā¸ĩāšˆā¸ĸ ā¸Ąā¸ąāš‰ā¸ĸā¸Ĩāšˆā¸° ā¸ĸā¸ˇā¸™ā¸™ā¸˛ā¸™ ā¸ĸā¸ˇā¸™ā¸ĸā¸‡ ā¸ĸā¸ˇā¸™ā¸ĸā¸ąā¸™ ā¸ĸā¸ˇā¸™ā¸ĸā¸˛ā¸§ āš€ā¸ĸā¸­ā¸° āš€ā¸ĸā¸­ā¸°āšā¸ĸā¸° āš€ā¸ĸā¸­ā¸°āš† āšā¸ĸā¸° āšā¸ĸā¸°āš† ā¸Ŗā¸§ā¸” ā¸Ŗā¸§ā¸”āš€ā¸Ŗāš‡ā¸§ ā¸Ŗāšˆā¸§ā¸Ą ā¸Ŗā¸§ā¸Ąā¸ā¸ąā¸™ ā¸Ŗāšˆā¸§ā¸Ąā¸ā¸ąā¸™ +ā¸Ŗā¸§ā¸Ąā¸”āš‰ā¸§ā¸ĸ ā¸Ŗāšˆā¸§ā¸Ąā¸”āš‰ā¸§ā¸ĸ ā¸Ŗā¸§ā¸Ąā¸–ā¸ļā¸‡ ā¸Ŗā¸§ā¸Ąā¸—ā¸ąāš‰ā¸‡ ā¸Ŗāšˆā¸§ā¸Ąā¸Ąā¸ˇā¸­ ā¸Ŗā¸§ā¸Ąāš† ā¸Ŗā¸°ā¸ĸā¸° ā¸Ŗā¸°ā¸ĸā¸°āš† ā¸Ŗā¸°ā¸Ģā¸§āšˆā¸˛ā¸‡ ā¸Ŗā¸ąā¸šā¸Ŗā¸­ā¸‡ ā¸Ŗā¸ļ ā¸Ŗā¸ļā¸§āšˆā¸˛ ā¸Ŗā¸ˇā¸­ ā¸Ŗā¸ˇā¸­ā¸§āšˆā¸˛ ā¸Ēā¸´āš‰ā¸™ā¸ā¸˛ā¸Ĩā¸™ā¸˛ā¸™ ā¸Ēā¸ˇā¸šāš€ā¸™ā¸ˇāšˆā¸­ā¸‡ ā¸Ēā¸¸ā¸”āš† ā¸Ēā¸šāšˆ ā¸Ēā¸šā¸‡ ā¸Ēā¸šā¸‡ā¸ā¸§āšˆā¸˛ ā¸Ēā¸šā¸‡ā¸Ēāšˆā¸‡ ā¸Ēā¸šā¸‡ā¸Ēā¸¸ā¸” ā¸Ēā¸šā¸‡āš† āš€ā¸Ēā¸Ąā¸ˇā¸­ā¸™ā¸ā¸ąā¸š +āš€ā¸Ēā¸Ąā¸ˇā¸­ā¸™ā¸§āšˆā¸˛ āš€ā¸Ēā¸Ŗāš‡ā¸ˆ āš€ā¸Ēā¸Ŗāš‡ā¸ˆā¸ā¸ąā¸™ āš€ā¸Ēā¸Ŗāš‡ā¸ˆāšā¸Ĩāš‰ā¸§ āš€ā¸Ēā¸Ŗāš‡ā¸ˆā¸Ēā¸Ąā¸šā¸šā¸Ŗā¸“āšŒ āš€ā¸Ēā¸Ŗāš‡ā¸ˆā¸Ēā¸´āš‰ā¸™ āš€ā¸Ēā¸ĩā¸ĸ āš€ā¸Ēā¸ĩā¸ĸā¸āšˆā¸­ā¸™ āš€ā¸Ēā¸ĩā¸ĸā¸ˆā¸™ āš€ā¸Ēā¸ĩā¸ĸā¸ˆā¸™ā¸ā¸Ŗā¸°ā¸—ā¸ąāšˆā¸‡ āš€ā¸Ēā¸ĩā¸ĸā¸ˆā¸™ā¸–ā¸ļā¸‡ āš€ā¸Ēā¸ĩā¸ĸā¸”āš‰ā¸§ā¸ĸ āš€ā¸Ēā¸ĩā¸ĸā¸™ā¸ąāšˆā¸™ āš€ā¸Ēā¸ĩā¸ĸā¸™ā¸ąāšˆā¸™āš€ā¸­ā¸‡ āš€ā¸Ēā¸ĩā¸ĸā¸™ā¸ĩāšˆ āš€ā¸Ēā¸ĩā¸ĸā¸™ā¸ĩāšˆā¸ā¸Ŗā¸°āš„ā¸Ŗ āš€ā¸Ēā¸ĩā¸ĸā¸ĸā¸´āšˆā¸‡ +āš€ā¸Ēā¸ĩā¸ĸā¸ĸā¸´āšˆā¸‡ā¸™ā¸ąā¸ āš€ā¸Ēā¸ĩā¸ĸāšā¸Ĩāš‰ā¸§ āšƒā¸Ģā¸āšˆāš† āšƒā¸Ģāš‰ā¸”ā¸ĩ āšƒā¸Ģāš‰āšā¸”āšˆ āšƒā¸Ģāš‰āš„ā¸› āšƒā¸Ģā¸Ąāšˆ āšƒā¸Ģāš‰ā¸Ąā¸˛ āšƒā¸Ģā¸Ąāšˆāš† āš„ā¸Ģā¸™ āš„ā¸Ģā¸™āš† ā¸­ā¸”ā¸ĩā¸• ā¸­ā¸™ā¸ļāšˆā¸‡ ā¸­ā¸ĸāšˆā¸˛ā¸‡ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš€ā¸Šāšˆā¸™ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸”ā¸ĩ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš€ā¸”ā¸ĩā¸ĸā¸§ ā¸­ā¸ĸāšˆā¸˛ā¸‡āšƒā¸” ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸—ā¸ĩāšˆ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸™āš‰ā¸­ā¸ĸ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸™ā¸ąāš‰ā¸™ +ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸™ā¸ĩāš‰ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš‚ā¸™āš‰ā¸™ ā¸āš‡ā¸„ā¸ˇā¸­ ā¸āš‡āšā¸„āšˆ ā¸āš‡ā¸ˆā¸° ā¸āš‡ā¸”ā¸ĩ ā¸āš‡āš„ā¸”āš‰ ā¸āš‡ā¸•āšˆā¸­āš€ā¸Ąā¸ˇāšˆā¸­ ā¸āš‡ā¸•ā¸˛ā¸Ą ā¸āš‡ā¸•ā¸˛ā¸Ąāšā¸•āšˆ ā¸āš‡ā¸•ā¸˛ā¸Ąā¸—ā¸ĩ ā¸āš‡āšā¸Ĩāš‰ā¸§āšā¸•āšˆ ā¸ā¸Ŗā¸°ā¸—ā¸ąāšˆā¸‡ ā¸ā¸Ŗā¸°ā¸—ā¸ŗ ā¸ā¸Ŗā¸°ā¸™ā¸ąāš‰ā¸™ ā¸ā¸Ŗā¸°ā¸œā¸Ą ā¸ā¸Ĩā¸ąā¸š ā¸ā¸Ĩāšˆā¸˛ā¸§ā¸„ā¸ˇā¸­ ā¸ā¸Ĩā¸¸āšˆā¸Ą ā¸ā¸Ĩā¸¸āšˆā¸Ąā¸āš‰ā¸­ā¸™ +ā¸ā¸Ĩā¸¸āšˆā¸Ąāš† ā¸ā¸§āš‰ā¸˛ā¸‡ ā¸ā¸§āš‰ā¸˛ā¸‡ā¸‚ā¸§ā¸˛ā¸‡ ā¸ā¸§āš‰ā¸˛ā¸‡āš† ā¸āšˆā¸­ā¸™ā¸Ģā¸™āš‰ā¸˛ ā¸āšˆā¸­ā¸™ā¸Ģā¸™āš‰ā¸˛ā¸™ā¸ĩāš‰ ā¸āšˆā¸­ā¸™āš† ā¸ā¸ąā¸™ā¸”ā¸ĩā¸ā¸§āšˆā¸˛ ā¸ā¸ąā¸™ā¸”ā¸ĩāš„ā¸Ģā¸Ą ā¸ā¸ąā¸™āš€ā¸–ā¸­ā¸° ā¸ā¸ąā¸™ā¸™ā¸° ā¸ā¸ąā¸™āšā¸Ĩā¸°ā¸ā¸ąā¸™ ā¸ā¸ąā¸™āš„ā¸Ģā¸Ą ā¸ā¸ąā¸™āš€ā¸­ā¸‡ ā¸ā¸ŗā¸Ĩā¸ąā¸‡ ā¸ā¸ŗā¸Ĩā¸ąā¸‡ā¸ˆā¸° ā¸ā¸ŗā¸Ģā¸™ā¸” ā¸ā¸š āš€ā¸āš‡ā¸š +āš€ā¸ā¸´ā¸” āš€ā¸ā¸ĩāšˆā¸ĸā¸§ā¸‚āš‰ā¸­ā¸‡ āšā¸āšˆ āšā¸āš‰āš„ā¸‚ āšƒā¸ā¸Ĩāš‰ āšƒā¸ā¸Ĩāš‰āš† ā¸‚āš‰ā¸˛ ā¸‚āš‰ā¸˛ā¸‡ ā¸‚āš‰ā¸˛ā¸‡āš€ā¸„ā¸ĩā¸ĸā¸‡ ā¸‚āš‰ā¸˛ā¸‡ā¸•āš‰ā¸™ ā¸‚āš‰ā¸˛ā¸‡ā¸šā¸™ ā¸‚āš‰ā¸˛ā¸‡ā¸Ĩāšˆā¸˛ā¸‡ ā¸‚āš‰ā¸˛ā¸‡āš† ā¸‚ā¸˛ā¸” ā¸‚āš‰ā¸˛ā¸žāš€ā¸ˆāš‰ā¸˛ ā¸‚āš‰ā¸˛ā¸¯ āš€ā¸‚āš‰ā¸˛āšƒā¸ˆ āš€ā¸‚ā¸ĩā¸ĸā¸™ ā¸„ā¸‡ā¸ˆā¸° ā¸„ā¸‡ā¸­ā¸ĸā¸šāšˆ ā¸„ā¸Ŗā¸š ā¸„ā¸Ŗā¸šā¸„ā¸Ŗā¸ąā¸™ ā¸„ā¸Ŗā¸šā¸–āš‰ā¸§ā¸™ +ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸ā¸Ŗā¸°ā¸™ā¸ąāš‰ā¸™ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸āšˆā¸­ā¸™ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸„ā¸Ŗā¸˛ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸„ā¸Ŗā¸˛ā¸§ ā¸„ā¸Ŗā¸ąāš‰ā¸‡āšƒā¸” ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸—ā¸ĩāšˆ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸™ā¸ąāš‰ā¸™ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸™ā¸ĩāš‰ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸Ĩā¸° ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸Ģā¸™ā¸ļāšˆā¸‡ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸Ģā¸Ĩā¸ąā¸‡ ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸Ģā¸Ĩā¸ąā¸‡ā¸Ēā¸¸ā¸” ā¸„ā¸Ŗā¸ąāš‰ā¸‡āš„ā¸Ģā¸™ ā¸„ā¸Ŗā¸ąāš‰ā¸‡āš† ā¸„ā¸Ŗā¸ąā¸™ ā¸„ā¸Ŗā¸ąā¸š ā¸„ā¸Ŗā¸˛ ā¸„ā¸Ŗā¸˛āšƒā¸” ā¸„ā¸Ŗā¸˛ā¸—ā¸ĩāšˆ ā¸„ā¸Ŗā¸˛ā¸™ā¸ąāš‰ā¸™ ā¸„ā¸Ŗā¸˛ā¸™ā¸ĩāš‰ ā¸„ā¸Ŗā¸˛ā¸Ģā¸™ā¸ļāšˆā¸‡ +ā¸„ā¸Ŗā¸˛āš„ā¸Ģā¸™ ā¸„ā¸Ŗā¸˛ā¸§ ā¸„ā¸Ŗā¸˛ā¸§ā¸āšˆā¸­ā¸™ ā¸„ā¸Ŗā¸˛ā¸§āšƒā¸” ā¸„ā¸Ŗā¸˛ā¸§ā¸—ā¸ĩāšˆ ā¸„ā¸Ŗā¸˛ā¸§ā¸™ā¸ąāš‰ā¸™ ā¸„ā¸Ŗā¸˛ā¸§ā¸™ā¸ĩāš‰ ā¸„ā¸Ŗā¸˛ā¸§āš‚ā¸™āš‰ā¸™ ā¸„ā¸Ŗā¸˛ā¸§ā¸Ĩā¸° 
ā¸„ā¸Ŗā¸˛ā¸§ā¸Ģā¸™āš‰ā¸˛ ā¸„ā¸Ŗā¸˛ā¸§ā¸Ģā¸™ā¸ļāšˆā¸‡ ā¸„ā¸Ŗā¸˛ā¸§ā¸Ģā¸Ĩā¸ąā¸‡ ā¸„ā¸Ŗā¸˛ā¸§āš„ā¸Ģā¸™ ā¸„ā¸Ŗā¸˛ā¸§āš† ā¸„ā¸Ĩāš‰ā¸˛ā¸ĸ ā¸„ā¸Ĩāš‰ā¸˛ā¸ĸā¸ā¸ąā¸™ ā¸„ā¸Ĩāš‰ā¸˛ā¸ĸā¸ā¸ąā¸™ā¸ā¸ąā¸š +ā¸„ā¸Ĩāš‰ā¸˛ā¸ĸā¸ā¸ąā¸š ā¸„ā¸Ĩāš‰ā¸˛ā¸ĸā¸ā¸ąā¸šā¸§āšˆā¸˛ ā¸„ā¸Ĩāš‰ā¸˛ā¸ĸā¸§āšˆā¸˛ ā¸„ā¸§ā¸Ŗ ā¸„āšˆā¸­ā¸™ ā¸„āšˆā¸­ā¸™ā¸‚āš‰ā¸˛ā¸‡ ā¸„āšˆā¸­ā¸™ā¸‚āš‰ā¸˛ā¸‡ā¸ˆā¸° ā¸„āšˆā¸­ā¸ĸāš„ā¸›ā¸—ā¸˛ā¸‡ ā¸„āšˆā¸­ā¸™ā¸Ąā¸˛ā¸—ā¸˛ā¸‡ ā¸„āšˆā¸­ā¸ĸ ā¸„āšˆā¸­ā¸ĸāš† ā¸„ā¸° ā¸„āšˆā¸° ā¸„ā¸ŗ ā¸„ā¸´ā¸” ā¸„ā¸´ā¸”ā¸§āšˆā¸˛ ā¸„ā¸¸ā¸“ ā¸„ā¸¸ā¸“āš† +āš€ā¸„ā¸ĸāš† āšā¸„āšˆ āšā¸„āšˆā¸ˆā¸° āšā¸„āšˆā¸™ā¸ąāš‰ā¸™ āšā¸„āšˆā¸™ā¸ĩāš‰ āšā¸„āšˆāš€ā¸žā¸ĩā¸ĸā¸‡ āšā¸„āšˆā¸§āšˆā¸˛ āšā¸„āšˆāš„ā¸Ģā¸™ āšƒā¸„ā¸Ŗāšˆ āšƒā¸„ā¸Ŗāšˆā¸ˆā¸° ā¸‡āšˆā¸˛ā¸ĸ ā¸‡āšˆā¸˛ā¸ĸāš† ā¸ˆā¸™ā¸ā¸§āšˆā¸˛ ā¸ˆā¸™āšā¸Ąāš‰ ā¸ˆā¸™āšā¸Ąāš‰ā¸™ ā¸ˆā¸ąā¸‡āš† ā¸ˆā¸§ā¸šā¸ā¸ąā¸š ā¸ˆā¸§ā¸šā¸ˆā¸™ ā¸ˆāš‰ā¸° ā¸ˆāšŠā¸° ā¸ˆā¸°āš„ā¸”āš‰ ā¸ˆā¸ąā¸‡ ā¸ˆā¸ąā¸”ā¸ā¸˛ā¸Ŗ ā¸ˆā¸ąā¸”ā¸‡ā¸˛ā¸™ ā¸ˆā¸ąā¸”āšā¸ˆā¸‡ +ā¸ˆā¸ąā¸”ā¸•ā¸ąāš‰ā¸‡ ā¸ˆā¸ąā¸”ā¸—ā¸ŗ ā¸ˆā¸ąā¸”ā¸Ģā¸˛ ā¸ˆā¸ąā¸”āšƒā¸Ģāš‰ ā¸ˆā¸ąā¸š ā¸ˆāš‰ā¸˛ ā¸ˆāš‹ā¸˛ ā¸ˆā¸˛ā¸ā¸™ā¸ąāš‰ā¸™ ā¸ˆā¸˛ā¸ā¸™ā¸ĩāš‰ ā¸ˆā¸˛ā¸ā¸™ā¸ĩāš‰āš„ā¸› ā¸ˆā¸ŗ ā¸ˆā¸ŗāš€ā¸›āš‡ā¸™ ā¸ˆā¸ŗā¸žā¸§ā¸ ā¸ˆā¸ļā¸‡ā¸ˆā¸° ā¸ˆā¸ļā¸‡āš€ā¸›āš‡ā¸™ ā¸ˆā¸šāšˆāš† ā¸‰ā¸°ā¸™ā¸ąāš‰ā¸™ ā¸‰ā¸°ā¸™ā¸ĩāš‰ ā¸‰ā¸ąā¸™ āš€ā¸‰ā¸āš€ā¸Šāšˆā¸™ āš€ā¸‰ā¸ĸ āš€ā¸‰ā¸ĸāš† āš„ā¸‰ā¸™ ā¸Šāšˆā¸§ā¸‡ā¸āšˆā¸­ā¸™ +ā¸Šāšˆā¸§ā¸‡ā¸•āšˆā¸­āš„ā¸› ā¸Šāšˆā¸§ā¸‡ā¸–ā¸ąā¸”āš„ā¸› ā¸Šāšˆā¸§ā¸‡ā¸—āš‰ā¸˛ā¸ĸ ā¸Šāšˆā¸§ā¸‡ā¸—ā¸ĩāšˆ ā¸Šāšˆā¸§ā¸‡ā¸™ā¸ąāš‰ā¸™ ā¸Šāšˆā¸§ā¸‡ā¸™ā¸ĩāš‰ ā¸Šāšˆā¸§ā¸‡ā¸Ŗā¸°ā¸Ģā¸§āšˆā¸˛ā¸‡ ā¸Šāšˆā¸§ā¸‡āšā¸Ŗā¸ ā¸Šāšˆā¸§ā¸‡ā¸Ģā¸™āš‰ā¸˛ ā¸Šāšˆā¸§ā¸‡ā¸Ģā¸Ĩā¸ąā¸‡ ā¸Šāšˆā¸§ā¸‡āš† ā¸Šāšˆā¸§ā¸ĸ ā¸Šāš‰ā¸˛ ā¸Šāš‰ā¸˛ā¸™ā¸˛ā¸™ ā¸Šā¸˛ā¸§ ā¸Šāš‰ā¸˛āš† āš€ā¸Šāšˆā¸™ā¸āšˆā¸­ā¸™ āš€ā¸Šāšˆā¸™ā¸ā¸ąā¸™ āš€ā¸Šāšˆā¸™āš€ā¸„ā¸ĸ +āš€ā¸Šāšˆā¸™ā¸”ā¸ąā¸‡ āš€ā¸Šāšˆā¸™ā¸”ā¸ąā¸‡ā¸āšˆā¸­ā¸™ āš€ā¸Šāšˆā¸™ā¸”ā¸ąā¸‡āš€ā¸āšˆā¸˛ āš€ā¸Šāšˆā¸™ā¸”ā¸ąā¸‡ā¸—ā¸ĩāšˆ āš€ā¸Šāšˆā¸™ā¸”ā¸ąā¸‡ā¸§āšˆā¸˛ āš€ā¸Šāšˆā¸™āš€ā¸”ā¸ĩā¸ĸā¸§ā¸ā¸ąā¸™ āš€ā¸Šāšˆā¸™āš€ā¸”ā¸ĩā¸ĸā¸§ā¸ā¸ąā¸š āš€ā¸Šāšˆā¸™āšƒā¸” āš€ā¸Šāšˆā¸™ā¸—ā¸ĩāšˆ āš€ā¸Šāšˆā¸™ā¸—ā¸ĩāšˆāš€ā¸„ā¸ĸ āš€ā¸Šāšˆā¸™ā¸—ā¸ĩāšˆā¸§āšˆā¸˛ āš€ā¸Šāšˆā¸™ā¸™ā¸ąāš‰ā¸™ āš€ā¸Šāšˆā¸™ā¸™ā¸ąāš‰ā¸™āš€ā¸­ā¸‡ āš€ā¸Šāšˆā¸™ā¸™ā¸ĩāš‰ āš€ā¸Šāšˆā¸™āš€ā¸Ąā¸ˇāšˆā¸­ āš€ā¸Šāšˆā¸™āš„ā¸Ŗ āš€ā¸Šā¸ˇāšˆā¸­ +āš€ā¸Šā¸ˇāšˆā¸­ā¸–ā¸ˇā¸­ āš€ā¸Šā¸ˇāšˆā¸­ā¸Ąā¸ąāšˆā¸™ āš€ā¸Šā¸ˇāšˆā¸­ā¸§āšˆā¸˛ āšƒā¸Šāšˆ āšƒā¸Šāšˆāš„ā¸Ģā¸Ą āšƒā¸Šāš‰ ā¸‹ā¸° ā¸‹ā¸°ā¸āšˆā¸­ā¸™ ā¸‹ā¸°ā¸ˆā¸™ ā¸‹ā¸°ā¸ˆā¸™ā¸ā¸Ŗā¸°ā¸—ā¸ąāšˆā¸‡ ā¸‹ā¸°ā¸ˆā¸™ā¸–ā¸ļā¸‡ ā¸‹ā¸ļāšˆā¸‡āš„ā¸”āš‰āšā¸āšˆ ā¸”āš‰ā¸§ā¸ĸā¸ā¸ąā¸™ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Šāšˆā¸™ā¸ā¸ąā¸™ ā¸”āš‰ā¸§ā¸ĸā¸—ā¸ĩāšˆ ā¸”āš‰ā¸§ā¸ĸāš€ā¸žā¸Ŗā¸˛ā¸° ā¸”āš‰ā¸§ā¸ĸā¸§āšˆā¸˛ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸—ā¸ĩāšˆ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸™ā¸ąāš‰ā¸™ +ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸™ā¸ĩāš‰ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸āš€ā¸žā¸Ŗā¸˛ā¸° ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸•ā¸¸ā¸§āšˆā¸˛ ā¸”āš‰ā¸§ā¸ĸāš€ā¸Ģā¸Ąā¸ˇā¸­ā¸™ā¸ā¸ąā¸™ ā¸”ā¸ąā¸‡ā¸ā¸Ĩāšˆā¸˛ā¸§ ā¸”ā¸ąā¸‡ā¸ā¸ąā¸šā¸§āšˆā¸˛ ā¸”ā¸ąāšˆā¸‡ā¸ā¸ąā¸šā¸§āšˆā¸˛ ā¸”ā¸ąā¸‡āš€ā¸āšˆā¸˛ ā¸”ā¸ąāšˆā¸‡āš€ā¸āšˆā¸˛ ā¸”ā¸ąāšˆā¸‡āš€ā¸„ā¸ĸ ā¸•āšˆā¸˛ā¸‡ā¸āš‡ ā¸•āšˆā¸˛ā¸‡ā¸Ģā¸˛ā¸ ā¸•ā¸˛ā¸Ąā¸”āš‰ā¸§ā¸ĸ ā¸•ā¸˛ā¸Ąāšā¸•āšˆ ā¸•ā¸˛ā¸Ąā¸—ā¸ĩāšˆ +ā¸•ā¸˛ā¸Ąāš† āš€ā¸•āš‡ā¸Ąāš„ā¸›ā¸”āš‰ā¸§ā¸ĸ āš€ā¸•āš‡ā¸Ąāš„ā¸›ā¸Ģā¸Ąā¸” āš€ā¸•āš‡ā¸Ąāš† āšā¸•āšˆā¸āš‡ āšā¸•āšˆā¸āšˆā¸­ā¸™ āšā¸•āšˆā¸ˆā¸° āšā¸•āšˆāš€ā¸”ā¸´ā¸Ą āšā¸•āšˆā¸•āš‰ā¸­ā¸‡ āšā¸•āšˆā¸–āš‰ā¸˛ āšā¸•āšˆā¸—ā¸§āšˆā¸˛ āšā¸•āšˆā¸—ā¸ĩāšˆ āšā¸•āšˆā¸™ā¸ąāš‰ā¸™ āšā¸•āšˆāš€ā¸žā¸ĩā¸ĸā¸‡ āšā¸•āšˆāš€ā¸Ąā¸ˇāšˆā¸­ āšā¸•āšˆāš„ā¸Ŗ āšā¸•āšˆā¸Ĩā¸° āšā¸•āšˆā¸§āšˆā¸˛ āšā¸•āšˆāš„ā¸Ģā¸™ āšā¸•āšˆā¸­ā¸ĸāšˆā¸˛ā¸‡āšƒā¸” āš‚ā¸• +āš‚ā¸•āš† āšƒā¸•āš‰ ā¸–āš‰ā¸˛ā¸ˆā¸° ā¸–āš‰ā¸˛ā¸Ģā¸˛ā¸ ā¸–ā¸ļā¸‡āšā¸āšˆ 
ā¸–ā¸ļā¸‡āšā¸Ąāš‰ ā¸–ā¸ļā¸‡āšā¸Ąāš‰ā¸ˆā¸° ā¸–ā¸ļā¸‡āšā¸Ąāš‰ā¸§āšˆā¸˛ ā¸–ā¸ļā¸‡ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ŗ ā¸–ā¸ˇā¸­ā¸§āšˆā¸˛ ā¸–ā¸šā¸ā¸•āš‰ā¸­ā¸‡ ā¸—ā¸§āšˆā¸˛ ā¸—ā¸ąāš‰ā¸‡ā¸™ā¸ąāš‰ā¸™ā¸”āš‰ā¸§ā¸ĸ ā¸—ā¸ąāš‰ā¸‡ā¸›ā¸§ā¸‡ ā¸—ā¸ąāš‰ā¸‡āš€ā¸›āš‡ā¸™ ā¸—ā¸ąāš‰ā¸‡ā¸Ąā¸§ā¸Ĩ ā¸—ā¸ąāš‰ā¸‡ā¸Ēā¸´āš‰ā¸™ ā¸—ā¸ąāš‰ā¸‡ā¸Ģā¸Ąā¸” ā¸—ā¸ąāš‰ā¸‡ā¸Ģā¸Ĩā¸˛ā¸ĸ ā¸—ā¸ąāš‰ā¸‡āš† ā¸—ā¸ąā¸™ +ā¸—ā¸ąā¸™āšƒā¸”ā¸™ā¸ąāš‰ā¸™ ā¸—ā¸ąā¸™ā¸—ā¸ĩ ā¸—ā¸ąā¸™ā¸—ā¸ĩā¸—ā¸ąā¸™āšƒā¸” ā¸—ā¸ąāšˆā¸§ ā¸—ā¸ŗāš„ā¸Ą ā¸—ā¸ŗāš„ā¸Ŗ ā¸—ā¸ŗāšƒā¸Ģāš‰ ā¸—ā¸ŗāš† ā¸—ā¸ĩ ā¸—ā¸ĩāšˆā¸ˆā¸Ŗā¸´ā¸‡ ā¸—ā¸ĩāšˆā¸‹ā¸ļāšˆā¸‡ ā¸—ā¸ĩāš€ā¸”ā¸ĩā¸ĸā¸§ ā¸—ā¸ĩāšƒā¸” ā¸—ā¸ĩāšˆāšƒā¸” ā¸—ā¸ĩāšˆāš„ā¸”āš‰ ā¸—ā¸ĩāš€ā¸–ā¸­ā¸° ā¸—ā¸ĩāšˆāšā¸—āš‰ ā¸—ā¸ĩāšˆāšā¸—āš‰ā¸ˆā¸Ŗā¸´ā¸‡ ā¸—ā¸ĩāšˆā¸™ā¸ąāš‰ā¸™ ā¸—ā¸ĩāšˆā¸™ā¸ĩāš‰ ā¸—ā¸ĩāš„ā¸Ŗ ā¸—ā¸ĩā¸Ĩā¸° ā¸—ā¸ĩāšˆā¸Ĩā¸° +ā¸—ā¸ĩāšˆāšā¸Ĩāš‰ā¸§ ā¸—ā¸ĩāšˆā¸§āšˆā¸˛ ā¸—ā¸ĩāšˆāšā¸Ģāšˆā¸‡ā¸™ā¸ąāš‰ā¸™ ā¸—ā¸ĩāšˆāš„ā¸Ģā¸™ ā¸—ā¸ĩāš† ā¸—ā¸ĩāšˆāš† ā¸—ā¸¸ā¸ā¸„ā¸™ ā¸—ā¸¸ā¸ā¸„ā¸Ŗā¸ąāš‰ā¸‡ ā¸—ā¸¸ā¸ā¸„ā¸Ŗā¸˛ ā¸—ā¸¸ā¸ā¸„ā¸Ŗā¸˛ā¸§ ā¸—ā¸¸ā¸ā¸Šā¸´āš‰ā¸™ ā¸—ā¸¸ā¸ā¸•ā¸ąā¸§ ā¸—ā¸¸ā¸ā¸—ā¸˛ā¸‡ ā¸—ā¸¸ā¸ā¸—ā¸ĩ ā¸—ā¸¸ā¸ā¸—ā¸ĩāšˆ ā¸—ā¸¸ā¸āš€ā¸Ąā¸ˇāšˆā¸­ ā¸—ā¸¸ā¸ā¸§ā¸ąā¸™ ā¸—ā¸¸ā¸ā¸§ā¸ąā¸™ā¸™ā¸ĩāš‰ ā¸—ā¸¸ā¸ā¸Ēā¸´āšˆā¸‡ ā¸—ā¸¸ā¸ā¸Ģā¸™ ā¸—ā¸¸ā¸āšā¸Ģāšˆā¸‡ ā¸—ā¸¸ā¸ā¸­ā¸ĸāšˆā¸˛ā¸‡ +ā¸—ā¸¸ā¸ā¸­ā¸ąā¸™ ā¸—ā¸¸ā¸āš† āš€ā¸—āšˆā¸˛ āš€ā¸—āšˆā¸˛ā¸ā¸ąā¸™ āš€ā¸—āšˆā¸˛ā¸ā¸ąā¸š āš€ā¸—āšˆā¸˛āšƒā¸” āš€ā¸—āšˆā¸˛ā¸—ā¸ĩāšˆ āš€ā¸—āšˆā¸˛ā¸™ā¸ąāš‰ā¸™ āš€ā¸—āšˆā¸˛ā¸™ā¸ĩāš‰ āš€ā¸—āšˆā¸˛āš„ā¸Ŗ āš€ā¸—āšˆā¸˛āš„ā¸Ģā¸Ŗāšˆ āšā¸—āš‰ āšā¸—āš‰ā¸ˆā¸Ŗā¸´ā¸‡ āš€ā¸˜ā¸­ ā¸™ā¸­ā¸ā¸ˆā¸˛ā¸ā¸§āšˆā¸˛ ā¸™āš‰ā¸­ā¸ĸ ā¸™āš‰ā¸­ā¸ĸā¸ā¸§āšˆā¸˛ ā¸™āš‰ā¸­ā¸ĸāš† ā¸™āšˆā¸° ā¸™ā¸ąāš‰ā¸™āš„ā¸§ ā¸™ā¸ąā¸šāšā¸•āšˆā¸™ā¸ĩāš‰ ā¸™ā¸˛ā¸‡ +ā¸™ā¸˛ā¸‡ā¸Ēā¸˛ā¸§ ā¸™āšˆā¸˛ā¸ˆā¸° ā¸™ā¸˛ā¸™ ā¸™ā¸˛ā¸™āš† ā¸™ā¸˛ā¸ĸ ā¸™ā¸ŗ ā¸™ā¸ŗā¸žā¸˛ ā¸™ā¸ŗā¸Ąā¸˛ ā¸™ā¸´ā¸” ā¸™ā¸´ā¸”ā¸Ģā¸™āšˆā¸­ā¸ĸ ā¸™ā¸´ā¸”āš† ā¸™ā¸ĩāšˆ ā¸™ā¸ĩāšˆāš„ā¸‡ ā¸™ā¸ĩāšˆā¸™ā¸˛ ā¸™ā¸ĩāšˆāšā¸™āšˆā¸° ā¸™ā¸ĩāšˆāšā¸Ģā¸Ĩā¸° ā¸™ā¸ĩāš‰āšā¸Ģā¸Ĩāšˆ ā¸™ā¸ĩāšˆāš€ā¸­ā¸‡ ā¸™ā¸ĩāš‰āš€ā¸­ā¸‡ ā¸™ā¸šāšˆā¸™ ā¸™ā¸šāš‰ā¸™ āš€ā¸™āš‰ā¸™ āš€ā¸™ā¸ĩāšˆā¸ĸ +āš€ā¸™ā¸ĩāšˆā¸ĸāš€ā¸­ā¸‡ āšƒā¸™ā¸Šāšˆā¸§ā¸‡ āšƒā¸™ā¸—ā¸ĩāšˆ āšƒā¸™āš€ā¸Ąā¸ˇāšˆā¸­ āšƒā¸™ā¸Ŗā¸°ā¸Ģā¸§āšˆā¸˛ā¸‡ ā¸šā¸™ ā¸šā¸­ā¸ ā¸šā¸­ā¸āšā¸Ĩāš‰ā¸§ ā¸šā¸­ā¸ā¸§āšˆā¸˛ ā¸šāšˆā¸­ā¸ĸ ā¸šāšˆā¸­ā¸ĸā¸ā¸§āšˆā¸˛ ā¸šāšˆā¸­ā¸ĸā¸„ā¸Ŗā¸ąāš‰ā¸‡ ā¸šāšˆā¸­ā¸ĸāš† ā¸šā¸ąā¸”ā¸”ā¸Ĩ ā¸šā¸ąā¸”āš€ā¸”ā¸ĩāš‹ā¸ĸā¸§ā¸™ā¸ĩāš‰ ā¸šā¸ąā¸”ā¸™ā¸ąāš‰ā¸™ ā¸šā¸ąā¸”ā¸™ā¸ĩāš‰ ā¸šāš‰ā¸˛ā¸‡ ā¸šā¸˛ā¸‡ā¸ā¸§āšˆā¸˛ +ā¸šā¸˛ā¸‡ā¸‚ā¸“ā¸° ā¸šā¸˛ā¸‡ā¸„ā¸Ŗā¸ąāš‰ā¸‡ ā¸šā¸˛ā¸‡ā¸„ā¸Ŗā¸˛ ā¸šā¸˛ā¸‡ā¸„ā¸Ŗā¸˛ā¸§ ā¸šā¸˛ā¸‡ā¸—ā¸ĩ ā¸šā¸˛ā¸‡ā¸—ā¸ĩāšˆ ā¸šā¸˛ā¸‡āšā¸Ģāšˆā¸‡ ā¸šā¸˛ā¸‡āš† ā¸›ā¸ā¸´ā¸šā¸ąā¸•ā¸´ ā¸›ā¸Ŗā¸°ā¸ā¸­ā¸š ā¸›ā¸Ŗā¸°ā¸ā¸˛ā¸Ŗ ā¸›ā¸Ŗā¸°ā¸ā¸˛ā¸Ŗā¸‰ā¸°ā¸™ā¸ĩāš‰ ā¸›ā¸Ŗā¸°ā¸ā¸˛ā¸Ŗāšƒā¸” ā¸›ā¸Ŗā¸°ā¸ā¸˛ā¸Ŗā¸Ģā¸™ā¸ļāšˆā¸‡ ā¸›ā¸Ŗā¸°ā¸Ąā¸˛ā¸“ ā¸›ā¸Ŗā¸°ā¸Ēā¸š ā¸›ā¸Ŗā¸ąā¸š +ā¸›ā¸Ŗā¸˛ā¸ā¸ ā¸›ā¸Ŗā¸˛ā¸ā¸ā¸§āšˆā¸˛ ā¸›ā¸ąā¸ˆā¸ˆā¸¸ā¸šā¸ąā¸™ ā¸›ā¸´ā¸” āš€ā¸›āš‡ā¸™ā¸”āš‰ā¸§ā¸ĸ āš€ā¸›āš‡ā¸™ā¸”ā¸ąā¸‡ āš€ā¸›āš‡ā¸™ā¸•āš‰ā¸™ āš€ā¸›āš‡ā¸™āšā¸•āšˆ āš€ā¸›āš‡ā¸™āš€ā¸žā¸ˇāšˆā¸­ āš€ā¸›āš‡ā¸™ā¸­ā¸ąā¸™ āš€ā¸›āš‡ā¸™ā¸­ā¸ąā¸™ā¸Ąā¸˛ā¸ āš€ā¸›āš‡ā¸™ā¸­ā¸˛ā¸—ā¸´ ā¸œāšˆā¸˛ā¸™āš† ā¸œā¸šāš‰ ā¸œā¸šāš‰āšƒā¸” āš€ā¸œā¸ˇāšˆā¸­ āš€ā¸œā¸ˇāšˆā¸­ā¸ˆā¸° āš€ā¸œā¸ˇāšˆā¸­ā¸—ā¸ĩāšˆ āš€ā¸œā¸ˇāšˆā¸­ā¸§āšˆā¸˛ ā¸āšˆā¸˛ā¸ĸ +ā¸āšˆā¸˛ā¸ĸāšƒā¸” ā¸žā¸šā¸§āšˆā¸˛ ā¸žā¸ĸā¸˛ā¸ĸā¸˛ā¸Ą ā¸žā¸Ŗāš‰ā¸­ā¸Ąā¸ā¸ąā¸™ ā¸žā¸Ŗāš‰ā¸­ā¸Ąā¸ā¸ąā¸š ā¸žā¸Ŗāš‰ā¸­ā¸Ąā¸”āš‰ā¸§ā¸ĸ ā¸žā¸Ŗāš‰ā¸­ā¸Ąā¸—ā¸ąāš‰ā¸‡ ā¸žā¸Ŗāš‰ā¸­ā¸Ąā¸—ā¸ĩāšˆ ā¸žā¸Ŗāš‰ā¸­ā¸Ąāš€ā¸žā¸ĩā¸ĸā¸‡ ā¸žā¸§ā¸ ā¸žā¸§ā¸ā¸ā¸ąā¸™ ā¸žā¸§ā¸ā¸ā¸š ā¸žā¸§ā¸āšā¸ ā¸žā¸§ā¸āš€ā¸‚ā¸˛ ā¸žā¸§ā¸ā¸„ā¸¸ā¸“ ā¸žā¸§ā¸ā¸‰ā¸ąā¸™ ā¸žā¸§ā¸ā¸—āšˆā¸˛ā¸™ +ā¸žā¸§ā¸ā¸—ā¸ĩāšˆ ā¸žā¸§ā¸āš€ā¸˜ā¸­ 
ā¸žā¸§ā¸ā¸™ā¸ąāš‰ā¸™ ā¸žā¸§ā¸ā¸™ā¸ĩāš‰ ā¸žā¸§ā¸ā¸™ā¸šāš‰ā¸™ ā¸žā¸§ā¸āš‚ā¸™āš‰ā¸™ ā¸žā¸§ā¸ā¸Ąā¸ąā¸™ ā¸žā¸§ā¸ā¸Ąā¸ļā¸‡ ā¸žā¸­ ā¸žā¸­ā¸ā¸ąā¸™ ā¸žā¸­ā¸„ā¸§ā¸Ŗ ā¸žā¸­ā¸ˆā¸° ā¸žā¸­ā¸”ā¸ĩ ā¸žā¸­ā¸•ā¸ąā¸§ ā¸žā¸­ā¸—ā¸ĩ ā¸žā¸­ā¸—ā¸ĩāšˆ ā¸žā¸­āš€ā¸žā¸ĩā¸ĸā¸‡ ā¸žā¸­āšā¸Ĩāš‰ā¸§ ā¸žā¸­ā¸Ēā¸Ą ā¸žā¸­ā¸Ēā¸Ąā¸„ā¸§ā¸Ŗ +ā¸žā¸­āš€ā¸Ģā¸Ąā¸˛ā¸° ā¸žā¸­āš† ā¸žā¸˛ ā¸žā¸ļā¸‡ ā¸žā¸ļāšˆā¸‡ ā¸žā¸ˇāš‰ā¸™āš† ā¸žā¸šā¸” āš€ā¸žā¸Ŗā¸˛ā¸°ā¸‰ā¸°ā¸™ā¸ąāš‰ā¸™ āš€ā¸žā¸Ŗā¸˛ā¸°ā¸§āšˆā¸˛ āš€ā¸žā¸´āšˆā¸‡ āš€ā¸žā¸´āšˆā¸‡ā¸ˆā¸° āš€ā¸žā¸´āšˆā¸Ą āš€ā¸žā¸´āšˆā¸Ąāš€ā¸•ā¸´ā¸Ą āš€ā¸žā¸ĩā¸ĸā¸‡ āš€ā¸žā¸ĩā¸ĸā¸‡āšā¸„āšˆ āš€ā¸žā¸ĩā¸ĸā¸‡āšƒā¸” āš€ā¸žā¸ĩā¸ĸā¸‡āšā¸•āšˆ āš€ā¸žā¸ĩā¸ĸā¸‡ā¸žā¸­ āš€ā¸žā¸ĩā¸ĸā¸‡āš€ā¸žā¸Ŗā¸˛ā¸° +āš€ā¸žā¸ˇāšˆā¸­ā¸§āšˆā¸˛ āš€ā¸žā¸ˇāšˆā¸­āšƒā¸Ģāš‰ ā¸ ā¸˛ā¸ĸāšƒā¸•āš‰ ā¸Ąā¸­ā¸‡ā¸§āšˆā¸˛ ā¸Ąā¸ąāšŠā¸ĸ ā¸Ąā¸˛ā¸ā¸ā¸§āšˆā¸˛ ā¸Ąā¸˛ā¸ā¸Ąā¸˛ā¸ĸ ā¸Ąā¸´ ā¸Ąā¸´ā¸‰ā¸°ā¸™ā¸ąāš‰ā¸™ ā¸Ąā¸´āšƒā¸Šāšˆ ā¸Ąā¸´āš„ā¸”āš‰ ā¸Ąā¸ĩāšā¸•āšˆ ā¸Ąā¸ļā¸‡ ā¸Ąā¸¸āšˆā¸‡ ā¸Ąā¸¸āšˆā¸‡āš€ā¸™āš‰ā¸™ ā¸Ąā¸¸āšˆā¸‡ā¸Ģā¸Ąā¸˛ā¸ĸ āš€ā¸Ąā¸ˇāšˆā¸­ā¸āšˆā¸­ā¸™ āš€ā¸Ąā¸ˇāšˆā¸­ā¸„ā¸Ŗā¸ąāš‰ā¸‡ āš€ā¸Ąā¸ˇāšˆā¸­ā¸„ā¸Ŗā¸ąāš‰ā¸‡ā¸āšˆā¸­ā¸™ +āš€ā¸Ąā¸ˇāšˆā¸­ā¸„ā¸Ŗā¸˛ā¸§ā¸āšˆā¸­ā¸™ āš€ā¸Ąā¸ˇāšˆā¸­ā¸„ā¸Ŗā¸˛ā¸§ā¸—ā¸ĩāšˆ āš€ā¸Ąā¸ˇāšˆā¸­ā¸„ā¸Ŗā¸˛ā¸§ āš€ā¸Ąā¸ˇāšˆā¸­ā¸„ā¸ˇā¸™ āš€ā¸Ąā¸ˇāšˆā¸­āš€ā¸Šāš‰ā¸˛ āš€ā¸Ąā¸ˇāšˆā¸­āšƒā¸” āš€ā¸Ąā¸ˇāšˆā¸­ā¸™ā¸ąāš‰ā¸™ āš€ā¸Ąā¸ˇāšˆā¸­ā¸™ā¸ĩāš‰ āš€ā¸Ąā¸ˇāšˆā¸­āš€ā¸ĸāš‡ā¸™ āš€ā¸Ąā¸ˇāšˆā¸­āš„ā¸Ŗ āš€ā¸Ąā¸ˇāšˆā¸­ā¸§ā¸ąā¸™ā¸§ā¸˛ā¸™ āš€ā¸Ąā¸ˇāšˆā¸­ā¸§ā¸˛ā¸™ āš€ā¸Ąā¸ˇāšˆā¸­āš„ā¸Ģā¸Ŗāšˆ āšā¸Ąāš‰ āšā¸Ąāš‰ā¸ā¸Ŗā¸°ā¸—ā¸ąāšˆā¸‡ āšā¸Ąāš‰āšā¸•āšˆ āšā¸Ąāš‰ā¸™ā¸§āšˆā¸˛ āšā¸Ąāš‰ā¸§āšˆā¸˛ +āš„ā¸Ąāšˆā¸„āšˆā¸­ā¸ĸ āš„ā¸Ąāšˆā¸„āšˆā¸­ā¸ĸā¸ˆā¸° āš„ā¸Ąāšˆā¸„āšˆā¸­ā¸ĸāš€ā¸›āš‡ā¸™ āš„ā¸Ąāšˆāšƒā¸Šāšˆ āš„ā¸Ąāšˆāš€ā¸›āš‡ā¸™āš„ā¸Ŗ āš„ā¸Ąāšˆā¸§āšˆā¸˛ ā¸ĸā¸ ā¸ĸā¸āšƒā¸Ģāš‰ ā¸ĸā¸­ā¸Ą ā¸ĸā¸­ā¸Ąā¸Ŗā¸ąā¸š ā¸ĸāšˆā¸­ā¸Ą ā¸ĸāšˆā¸­ā¸ĸ ā¸ĸā¸ąā¸‡ā¸„ā¸‡ ā¸ĸā¸ąā¸‡ā¸‡ā¸ąāš‰ā¸™ ā¸ĸā¸ąā¸‡ā¸‡ā¸ĩāš‰ ā¸ĸā¸ąā¸‡āš‚ā¸‡āš‰ā¸™ ā¸ĸā¸ąā¸‡āš„ā¸‡ ā¸ĸā¸ąā¸‡ā¸ˆā¸° ā¸ĸā¸ąā¸‡āšā¸•āšˆ ā¸ĸā¸˛ā¸ +ā¸ĸā¸˛ā¸§ ā¸ĸā¸˛ā¸§ā¸™ā¸˛ā¸™ ā¸ĸā¸´āšˆā¸‡ ā¸ĸā¸´āšˆā¸‡ā¸ā¸§āšˆā¸˛ ā¸ĸā¸´āšˆā¸‡ā¸‚ā¸ļāš‰ā¸™ ā¸ĸā¸´āšˆā¸‡ā¸‚ā¸ļāš‰ā¸™āš„ā¸› ā¸ĸā¸´āšˆā¸‡ā¸ˆā¸™ ā¸ĸā¸´āšˆā¸‡ā¸ˆā¸° ā¸ĸā¸´āšˆā¸‡ā¸™ā¸ąā¸ ā¸ĸā¸´āšˆā¸‡āš€ā¸Ąā¸ˇāšˆā¸­ ā¸ĸā¸´āšˆā¸‡āšā¸Ĩāš‰ā¸§ ā¸ĸā¸´āšˆā¸‡āšƒā¸Ģā¸āšˆ ā¸Ŗāšˆā¸§ā¸Ąā¸ā¸ąā¸™ ā¸Ŗā¸§ā¸Ąā¸”āš‰ā¸§ā¸ĸ ā¸Ŗāšˆā¸§ā¸Ąā¸”āš‰ā¸§ā¸ĸ ā¸Ŗā¸ˇā¸­ā¸§āšˆā¸˛ āš€ā¸Ŗāš‡ā¸§ āš€ā¸Ŗāš‡ā¸§āš† āš€ā¸Ŗā¸˛āš† āš€ā¸Ŗā¸ĩā¸ĸā¸ āš€ā¸Ŗā¸ĩā¸ĸā¸š āš€ā¸Ŗā¸ˇāšˆā¸­ā¸ĸ +āš€ā¸Ŗā¸ˇāšˆā¸­ā¸ĸāš† āš„ā¸Ŗ ā¸Ĩāš‰ā¸§ā¸™ ā¸Ĩāš‰ā¸§ā¸™ā¸ˆā¸™ ā¸Ĩāš‰ā¸§ā¸™āšā¸•āšˆ ā¸Ĩā¸° ā¸Ĩāšˆā¸˛ā¸Ēā¸¸ā¸” āš€ā¸Ĩāš‡ā¸ āš€ā¸Ĩāš‡ā¸ā¸™āš‰ā¸­ā¸ĸ āš€ā¸Ĩāš‡ā¸āš† āš€ā¸Ĩāšˆā¸˛ā¸§āšˆā¸˛ āšā¸Ĩāš‰ā¸§ā¸ā¸ąā¸™ āšā¸Ĩāš‰ā¸§āšā¸•āšˆ āšā¸Ĩāš‰ā¸§āš€ā¸Ēā¸Ŗāš‡ā¸ˆ ā¸§ā¸ąā¸™āšƒā¸” ā¸§ā¸ąā¸™ā¸™ā¸ąāš‰ā¸™ ā¸§ā¸ąā¸™ā¸™ā¸ĩāš‰ ā¸§ā¸ąā¸™āš„ā¸Ģā¸™ ā¸Ēā¸šā¸˛ā¸ĸ ā¸Ēā¸Ąā¸ąā¸ĸ ā¸Ēā¸Ąā¸ąā¸ĸā¸āšˆā¸­ā¸™ +ā¸Ēā¸Ąā¸ąā¸ĸā¸™ā¸ąāš‰ā¸™ ā¸Ēā¸Ąā¸ąā¸ĸā¸™ā¸ĩāš‰ ā¸Ēā¸Ąā¸ąā¸ĸāš‚ā¸™āš‰ā¸™ ā¸Ēāšˆā¸§ā¸™āš€ā¸ā¸´ā¸™ ā¸Ēāšˆā¸§ā¸™ā¸”āš‰ā¸­ā¸ĸ ā¸Ēāšˆā¸§ā¸™ā¸”ā¸ĩ ā¸Ēāšˆā¸§ā¸™āšƒā¸” ā¸Ēāšˆā¸§ā¸™ā¸—ā¸ĩāšˆ ā¸Ēāšˆā¸§ā¸™ā¸™āš‰ā¸­ā¸ĸ ā¸Ēāšˆā¸§ā¸™ā¸™ā¸ąāš‰ā¸™ ā¸Ēāšˆā¸§ā¸™ā¸Ąā¸˛ā¸ ā¸Ēāšˆā¸§ā¸™āšƒā¸Ģā¸āšˆ ā¸Ēā¸ąāš‰ā¸™ ā¸Ēā¸ąāš‰ā¸™āš† ā¸Ēā¸˛ā¸Ąā¸˛ā¸Ŗā¸– ā¸Ēā¸ŗā¸„ā¸ąā¸ ā¸Ēā¸´āšˆā¸‡ +ā¸Ēā¸´āšˆā¸‡āšƒā¸” ā¸Ēā¸´āšˆā¸‡ā¸™ā¸ąāš‰ā¸™ ā¸Ēā¸´āšˆā¸‡ā¸™ā¸ĩāš‰ ā¸Ēā¸´āšˆā¸‡āš„ā¸Ģā¸™ ā¸Ēā¸´āš‰ā¸™ āš€ā¸Ēā¸Ŗāš‡ā¸ˆāšā¸Ĩāš‰ā¸§ āš€ā¸Ēā¸ĩā¸ĸā¸”āš‰ā¸§ā¸ĸ āš€ā¸Ēā¸ĩā¸ĸāšā¸Ĩāš‰ā¸§ āšā¸Ēā¸”ā¸‡ āšā¸Ēā¸”ā¸‡ā¸§āšˆā¸˛ ā¸Ģā¸™ ā¸Ģā¸™ā¸­ ā¸Ģā¸™ā¸­ā¸ĸ ā¸Ģā¸™āšˆā¸­ā¸ĸ ā¸Ģā¸Ąā¸” ā¸Ģā¸Ąā¸”ā¸ā¸ąā¸™ ā¸Ģā¸Ąā¸”ā¸Ēā¸´āš‰ā¸™ ā¸Ģā¸Ŗā¸ˇā¸­āš„ā¸‡ ā¸Ģā¸Ŗā¸ˇā¸­āš€ā¸›ā¸Ĩāšˆā¸˛ ā¸Ģā¸Ŗā¸ˇā¸­āš„ā¸Ąāšˆ ā¸Ģā¸Ŗā¸ˇā¸­ā¸ĸā¸ąā¸‡ +ā¸Ģā¸Ŗā¸ˇā¸­āš„ā¸Ŗ ā¸Ģā¸˛ā¸āšā¸Ąāš‰ 
ā¸Ģā¸˛ā¸āšā¸Ąāš‰ā¸™ ā¸Ģā¸˛ā¸āšā¸Ąāš‰ā¸™ā¸§āšˆā¸˛ ā¸Ģā¸˛ā¸ā¸§āšˆā¸˛ ā¸Ģā¸˛ā¸„ā¸§ā¸˛ā¸Ą ā¸Ģā¸˛āšƒā¸Šāšˆ ā¸Ģā¸˛ā¸Ŗā¸ˇā¸­ āš€ā¸Ģā¸•ā¸¸ āš€ā¸Ģā¸•ā¸¸ā¸œā¸Ĩ āš€ā¸Ģā¸•ā¸¸ā¸™ā¸ąāš‰ā¸™ āš€ā¸Ģā¸•ā¸¸ā¸™ā¸ĩāš‰ āš€ā¸Ģā¸•ā¸¸āš„ā¸Ŗ āš€ā¸Ģāš‡ā¸™āšā¸āšˆ āš€ā¸Ģāš‡ā¸™ā¸„ā¸§ā¸Ŗ āš€ā¸Ģāš‡ā¸™ā¸ˆā¸° āš€ā¸Ģāš‡ā¸™ā¸§āšˆā¸˛ āš€ā¸Ģā¸Ĩā¸ˇā¸­ āš€ā¸Ģā¸Ĩā¸ˇā¸­āš€ā¸ā¸´ā¸™ āš€ā¸Ģā¸Ĩāšˆā¸˛ +āš€ā¸Ģā¸Ĩāšˆā¸˛ā¸™ā¸ąāš‰ā¸™ āš€ā¸Ģā¸Ĩāšˆā¸˛ā¸™ā¸ĩāš‰ āšā¸Ģāšˆā¸‡āšƒā¸” āšā¸Ģāšˆā¸‡ā¸™ā¸ąāš‰ā¸™ āšā¸Ģāšˆā¸‡ā¸™ā¸ĩāš‰ āšā¸Ģāšˆā¸‡āš‚ā¸™āš‰ā¸™ āšā¸Ģāšˆā¸‡āš„ā¸Ģā¸™ āšā¸Ģā¸Ĩā¸° āšƒā¸Ģāš‰āšā¸āšˆ āšƒā¸Ģā¸āšˆ āšƒā¸Ģā¸āšˆāš‚ā¸• ā¸­ā¸ĸāšˆā¸˛ā¸‡āš€ā¸Šāšˆā¸™ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸”ā¸ĩ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš€ā¸”ā¸ĩā¸ĸā¸§ ā¸­ā¸ĸāšˆā¸˛ā¸‡āšƒā¸” ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸—ā¸ĩāšˆ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸™āš‰ā¸­ā¸ĸ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸™ā¸ąāš‰ā¸™ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸™ā¸ĩāš‰ +ā¸­ā¸ĸāšˆā¸˛ā¸‡āš‚ā¸™āš‰ā¸™ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸Ąā¸˛ā¸ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸ĸā¸´āšˆā¸‡ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ŗ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ŗā¸āš‡ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ŗā¸āš‡āš„ā¸”āš‰ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ŗāš€ā¸Ēā¸ĩā¸ĸ ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸Ĩā¸° ā¸­ā¸ĸāšˆā¸˛ā¸‡ā¸Ģā¸™ā¸ļāšˆā¸‡ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš„ā¸Ģā¸™ ā¸­ā¸ĸāšˆā¸˛ā¸‡āš† ā¸­ā¸ąā¸™ ā¸­ā¸ąā¸™ā¸ˆā¸° ā¸­ā¸ąā¸™āšƒā¸” ā¸­ā¸ąā¸™āš„ā¸”āš‰āšā¸āšˆ ā¸­ā¸ąā¸™ā¸—ā¸ĩāšˆ +ā¸­ā¸ąā¸™ā¸—ā¸ĩāšˆā¸ˆā¸Ŗā¸´ā¸‡ ā¸­ā¸ąā¸™ā¸—ā¸ĩāšˆā¸ˆā¸° ā¸­ā¸ąā¸™āš€ā¸™ā¸ˇāšˆā¸­ā¸‡ā¸Ąā¸˛ā¸ˆā¸˛ā¸ ā¸­ā¸ąā¸™ā¸Ĩā¸° ā¸­ā¸ąā¸™āš„ā¸Ģā¸™ ā¸­ā¸ąā¸™āš† ā¸­ā¸˛ā¸ˆā¸ˆā¸° ā¸­ā¸˛ā¸ˆāš€ā¸›āš‡ā¸™ ā¸­ā¸˛ā¸ˆāš€ā¸›āš‡ā¸™ā¸”āš‰ā¸§ā¸ĸ ā¸­ā¸ˇāšˆā¸™ ā¸­ā¸ˇāšˆā¸™āš† āš€ā¸­āš‡ā¸‡ āš€ā¸­ā¸˛ ā¸¯ ā¸¯ā¸Ĩ ā¸¯ā¸Ĩā¸¯ +""".split()) \ No newline at end of file diff --git a/spacy/th/tag_map.py b/spacy/th/tag_map.py new file mode 100644 index 000000000..e225f7289 --- /dev/null +++ b/spacy/th/tag_map.py @@ -0,0 +1,81 @@ +# encoding: utf8 +# data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1) +from __future__ import unicode_literals + +from ..symbols import * + +TAG_MAP = { + #NOUN + "NOUN": {POS: NOUN}, + "NCMN": {POS: NOUN}, + "NTTL": {POS: NOUN}, + "CNIT": {POS: NOUN}, + "CLTV": {POS: NOUN}, + "CMTR": {POS: NOUN}, + "CFQC": {POS: NOUN}, + "CVBL": {POS: NOUN}, + #PRON + "PRON": {POS: PRON}, + "NPRP": {POS: PRON}, + # ADJ + "ADJ": {POS: ADJ}, + "NONM": {POS: ADJ}, + "VATT": {POS: ADJ}, + "DONM": {POS: ADJ}, + # ADV + "ADV": {POS: ADV}, + "ADVN": {POS: ADV}, + "ADVI": {POS: ADV}, + "ADVP": {POS: ADV}, + "ADVS": {POS: ADV}, + # INT + "INT": {POS: INTJ}, + # PRON + "PROPN": {POS: PROPN}, + "PPRS": {POS: PROPN}, + "PDMN": {POS: PROPN}, + "PNTR": {POS: PROPN}, + # DET + "DET": {POS: DET}, + "DDAN": {POS: DET}, + "DDAC": {POS: DET}, + "DDBQ": {POS: DET}, + "DDAQ": {POS: DET}, + "DIAC": {POS: DET}, + "DIBQ": {POS: DET}, + "DIAQ": {POS: DET}, + "DCNM": {POS: DET}, + # NUM + "NUM": {POS: NUM}, + "NCNM": {POS: NUM}, + "NLBL": {POS: NUM}, + "DCNM": {POS: NUM}, + # AUX + "AUX": {POS: AUX}, + "XVBM": {POS: AUX}, + "XVAM": {POS: AUX}, + "XVMM": {POS: AUX}, + "XVBB": {POS: AUX}, + "XVAE": {POS: AUX}, + # ADP + "ADP": {POS: ADP}, + "RPRE": {POS: ADP}, + # CCONJ + "CCONJ": {POS: CCONJ}, + "JCRG": {POS: CCONJ}, + # SCONJ + "SCONJ": {POS: SCONJ}, + "PREL": {POS: SCONJ}, + "JSBR": {POS: SCONJ}, + "JCMP": {POS: SCONJ}, + # PART + "PART": {POS: PART}, + "FIXN": {POS: PART}, + "FIXV": {POS: PART}, + "EAFF": {POS: PART}, + "AITT": {POS: PART}, + "NEG": {POS: PART}, + # PUNCT + "PUNCT": {POS: PUNCT}, + "PUNC": {POS: PUNCT} +} \ No newline at end of file diff --git a/spacy/th/tokenizer_exceptions.py b/spacy/th/tokenizer_exceptions.py new file mode 100644 index 000000000..7e3967aed --- 
/dev/null +++ b/spacy/th/tokenizer_exceptions.py @@ -0,0 +1,80 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + + +TOKENIZER_EXCEPTIONS = { + "ā¸Ą.ā¸„.": [ + {ORTH: "ā¸Ą.ā¸„.", LEMMA: "ā¸Ąā¸ā¸Ŗā¸˛ā¸„ā¸Ą"} + ], + "ā¸.ā¸ž.": [ + {ORTH: "ā¸.ā¸ž.", LEMMA: "ā¸ā¸¸ā¸Ąā¸ ā¸˛ā¸žā¸ąā¸™ā¸˜āšŒ"} + ], + "ā¸Ąā¸ĩ.ā¸„.": [ + {ORTH: "ā¸Ąā¸ĩ.ā¸„.", LEMMA: "ā¸Ąā¸ĩā¸™ā¸˛ā¸„ā¸Ą"} + ], + "āš€ā¸Ą.ā¸ĸ.": [ + {ORTH: "āš€ā¸Ą.ā¸ĸ.", LEMMA: "āš€ā¸Ąā¸Šā¸˛ā¸ĸā¸™"} + ], + "ā¸ž.ā¸„.": [ + {ORTH: "ā¸ž.ā¸„.", LEMMA: "ā¸žā¸¤ā¸Šā¸ ā¸˛ā¸„ā¸Ą"} + ], + "ā¸Ąā¸´.ā¸ĸ.": [ + {ORTH: "ā¸Ąā¸´.ā¸ĸ.", LEMMA: "ā¸Ąā¸´ā¸–ā¸¸ā¸™ā¸˛ā¸ĸā¸™"} + ], + "ā¸.ā¸„.": [ + {ORTH: "ā¸.ā¸„.", LEMMA: "ā¸ā¸Ŗā¸ā¸Žā¸˛ā¸„ā¸Ą"} + ], + "ā¸Ē.ā¸„.": [ + {ORTH: "ā¸Ē.ā¸„.", LEMMA: "ā¸Ēā¸´ā¸‡ā¸Ģā¸˛ā¸„ā¸Ą"} + ], + "ā¸.ā¸ĸ.": [ + {ORTH: "ā¸.ā¸ĸ.", LEMMA: "ā¸ā¸ąā¸™ā¸ĸā¸˛ā¸ĸā¸™"} + ], + "ā¸•.ā¸„.": [ + {ORTH: "ā¸•.ā¸„.", LEMMA: "ā¸•ā¸¸ā¸Ĩā¸˛ā¸„ā¸Ą"} + ], + "ā¸ž.ā¸ĸ.": [ + {ORTH: "ā¸ž.ā¸ĸ.", LEMMA: "ā¸žā¸¤ā¸¨ā¸ˆā¸´ā¸ā¸˛ā¸ĸā¸™"} + ], + "ā¸˜.ā¸„.": [ + {ORTH: "ā¸˜.ā¸„.", LEMMA: "ā¸˜ā¸ąā¸™ā¸§ā¸˛ā¸„ā¸Ą"} + ] +} + + +# exceptions mapped to a single token containing only ORTH property +# example: {"string": [{ORTH: "string"}]} +# converted using strings_to_exc() util +''' +ORTH_ONLY = [ + "a.", + "b.", + "c.", + "d.", + "e.", + "f.", + "g.", + "h.", + "i.", + "j.", + "k.", + "l.", + "m.", + "n.", + "o.", + "p.", + "q.", + "r.", + "s.", + "t.", + "u.", + "v.", + "w.", + "x.", + "y.", + "z." +] +''' \ No newline at end of file From 39bb5690f0e1398b75407f70e89f88da4f9c3738 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 21 Sep 2017 00:36:02 +0700 Subject: [PATCH 133/195] update th --- spacy/th/__init__.py | 4 +--- spacy/th/tokenizer_exceptions.py | 37 +------------------------------- 2 files changed, 2 insertions(+), 39 deletions(-) diff --git a/spacy/th/__init__.py b/spacy/th/__init__.py index 0b6f8cf76..0ed5268c6 100644 --- a/spacy/th/__init__.py +++ b/spacy/th/__init__.py @@ -25,6 +25,4 @@ class Thai(Language): raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " "https://github.com/wannaphongcom/pythainlp/") words = [x for x in list(word_tokenize(text,"newmm"))] - return Doc(self.vocab, words=words, spaces=[False]*len(words)) - -__all__ = ['Thai'] \ No newline at end of file + return Doc(self.vocab, words=words, spaces=[False]*len(words)) \ No newline at end of file diff --git a/spacy/th/tokenizer_exceptions.py b/spacy/th/tokenizer_exceptions.py index 7e3967aed..0f933f1c1 100644 --- a/spacy/th/tokenizer_exceptions.py +++ b/spacy/th/tokenizer_exceptions.py @@ -42,39 +42,4 @@ TOKENIZER_EXCEPTIONS = { "ā¸˜.ā¸„.": [ {ORTH: "ā¸˜.ā¸„.", LEMMA: "ā¸˜ā¸ąā¸™ā¸§ā¸˛ā¸„ā¸Ą"} ] -} - - -# exceptions mapped to a single token containing only ORTH property -# example: {"string": [{ORTH: "string"}]} -# converted using strings_to_exc() util -''' -ORTH_ONLY = [ - "a.", - "b.", - "c.", - "d.", - "e.", - "f.", - "g.", - "h.", - "i.", - "j.", - "k.", - "l.", - "m.", - "n.", - "o.", - "p.", - "q.", - "r.", - "s.", - "t.", - "u.", - "v.", - "w.", - "x.", - "y.", - "z." 
-] -''' \ No newline at end of file +} \ No newline at end of file From 1abf472068ef700c66da4dc0f4beadb3ccd7c718 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 21 Sep 2017 12:56:58 +0700 Subject: [PATCH 134/195] add th test --- spacy/tests/conftest.py | 6 ++++++ spacy/tests/th/test_tokenizer.py | 13 +++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 spacy/tests/th/test_tokenizer.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 6e00b1513..c9652b08d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -15,6 +15,7 @@ from ..fi import Finnish from ..bn import Bengali from ..he import Hebrew from ..nb import Norwegian +from ..th import Thai from ..tokens import Doc @@ -101,6 +102,11 @@ def he_tokenizer(): def nb_tokenizer(): return Norwegian.Defaults.create_tokenizer() +@pytest.fixture +def th_tokenizer(): + pythainlp = pytest.importorskip("pythainlp") + return Thai.Defaults.create_tokenizer() + @pytest.fixture def stringstore(): return StringStore() diff --git a/spacy/tests/th/test_tokenizer.py b/spacy/tests/th/test_tokenizer.py new file mode 100644 index 000000000..851c6f067 --- /dev/null +++ b/spacy/tests/th/test_tokenizer.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +TOKENIZER_TESTS = [ + ("ā¸„ā¸¸ā¸“ā¸Ŗā¸ąā¸ā¸œā¸Ąāš„ā¸Ģā¸Ą", ['ā¸„ā¸¸ā¸“', 'ā¸Ŗā¸ąā¸', 'ā¸œā¸Ą', 'āš„ā¸Ģā¸Ą']) +] + +@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) +def test_thai_tokenizer(th_tokenizer, text, expected_tokens): + tokens = [token.text for token in th_tokenizer(text)] + assert tokens == expected_tokens From 425c09488d1370d217b46521e2942b4b04a4e254 Mon Sep 17 00:00:00 2001 From: Yam Date: Fri, 22 Sep 2017 08:56:34 +0800 Subject: [PATCH 135/195] Update word-vectors-similarities.jade add ``` import spacy nlp = spacy.load('en') ``` --- website/docs/usage/word-vectors-similarities.jade | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/docs/usage/word-vectors-similarities.jade index 3cc0a67a8..3fd6326d1 100644 --- a/website/docs/usage/word-vectors-similarities.jade +++ b/website/docs/usage/word-vectors-similarities.jade @@ -21,10 +21,12 @@ p +code. 
import numpy + import spacy + nlp = spacy.load('en') apples, and_, oranges = nlp(u'apples and oranges') print(apples.vector.shape) - # (1,) + # (300,) apples.similarity(oranges) p From 923c4c2fb2863858c18d262de53746f42c9aa6ae Mon Sep 17 00:00:00 2001 From: Yam Date: Fri, 22 Sep 2017 09:50:46 +0800 Subject: [PATCH 136/195] Update punctuation.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add `……` --- spacy/language_data/punctuation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index 58ec73f2d..3b5307496 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -36,7 +36,7 @@ _HYPHENS = r""" LIST_ELLIPSES = [ r'\.\.+', - "…" + "… ……" ] From 6f450306c3429d19472e7ae25bcbcd7f8b835e2d Mon Sep 17 00:00:00 2001 From: Yam Date: Fri, 22 Sep 2017 10:53:22 +0800 Subject: [PATCH 137/195] Update customizing-tokenizer.jade update some codes: - `me` -> `-PRON` - `TAG` -> `POS` - `create_tokenizer` function --- website/docs/usage/customizing-tokenizer.jade | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index ca5be9ef1..c7f717380 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -40,7 +40,9 @@ p { ORTH: u'me'}]) assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] - assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that'] + # Pronoun lemma is returned as -PRON- + # More details please see: https://spacy.io/docs/usage/troubleshooting#pron-lemma + assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that'] p | The special case doesn't have to match an entire whitespace-delimited @@ -57,7 +59,7 @@ p +code.
nlp.tokenizer.add_special_case(u'...gimme...?', [{ - ORTH: u'...gimme...?', LEMMA: u'give', POS: u'VB'}]) + ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]) assert len(nlp(u'...gimme...?')) == 1 p From b6ebedd09c03648c8bd3a448bd15ab87ce1631e4 Mon Sep 17 00:00:00 2001 From: Jeffrey Gerard Date: Mon, 25 Sep 2017 13:13:25 -0700 Subject: [PATCH 139/195] Document Tokenizer(token_match) and clarify tokenizer_pseudo_code Closes #835 In the `tokenizer_pseudo_code` I put the `special_cases` kwarg before `find_prefix` because this now matches the order the args are used in the pseudocode, and it also matches spacy's actual code. --- website/docs/usage/customizing-tokenizer.jade | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index c2f840a27..173521a33 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -87,8 +87,8 @@ p | algorithm in Python, optimized for readability rather than performance: +code. - def tokenizer_pseudo_code(text, find_prefix, find_suffix, - find_infixes, special_cases): + def tokenizer_pseudo_code(text, special_cases, + find_prefix, find_suffix, find_infixes): tokens = [] for substring in text.split(' '): suffixes = [] @@ -140,7 +140,7 @@ p p | Let's imagine you wanted to create a tokenizer for a new language. There - | are four things you would need to define: + | are five things you would need to define: +list("numbers") +item @@ -162,6 +162,11 @@ p | A function #[code infixes_finditer], to handle non-whitespace | separators, such as hyphens etc. + +item + | (Optional) A boolean function #[code token_match] matching strings + | that should never be split, overriding the previous rules. + | Useful for things like URLs or numbers. + p | You shouldn't usually need to create a #[code Tokenizer] subclass. 
| Standard usage is to use #[code re.compile()] to build a regular @@ -175,11 +180,15 @@ p prefix_re = re.compile(r'''[\[\("']''') suffix_re = re.compile(r'''[\]\)"']''') infix_re = re.compile(r'''[-~]''') + simple_url_re = re.compile(r'''^https?://''') def create_tokenizer(nlp): - return Tokenizer(nlp.vocab, rules={}, + return Tokenizer(nlp.vocab, + rules={}, prefix_search=prefix_re.search, suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer) + infix_finditer=infix_re.finditer, + token_match=simple_url_re.match + ) nlp = spacy.load('en', create_make_doc=create_tokenizer) From 259ed027af0e4584956b7d00c37a3beb9d5b8d98 Mon Sep 17 00:00:00 2001 From: Vincent Genty Date: Tue, 26 Sep 2017 15:46:04 +0200 Subject: [PATCH 140/195] Fixed NER model loading bug --- spacy/syntax/parser.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index b9de1e114..48edb6d22 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -147,6 +147,9 @@ cdef class Parser: # TODO: remove this shim when we don't have to support older data if 'labels' in cfg and 'actions' not in cfg: cfg['actions'] = cfg.pop('labels') + # Convert string keys to int + if cfg.get('actions'): + cfg['actions'] = {int(action_name): labels for action_name, labels in cfg['actions'].items()} # TODO: remove this shim when we don't have to support older data for action_name, labels in dict(cfg.get('actions', {})).items(): # We need this to be sorted From a9362f1c73fd7197548f6d32ed997600d15f9ff2 Mon Sep 17 00:00:00 2001 From: Ondrej Kokes Date: Wed, 4 Oct 2017 12:55:07 +0200 Subject: [PATCH 141/195] Fixing links to SyntaxNet --- website/docs/api/index.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/index.jade b/website/docs/api/index.jade index 24f3d4458..7e3f1a906 100644 --- a/website/docs/api/index.jade +++ b/website/docs/api/index.jade @@ -6,7 +6,7 @@ include ../../_includes/_mixins p | Here's a quick comparison of the functionalities offered by spaCy, - | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet], + | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet], | #[+a("http://www.nltk.org/py-modindex.html") NLTK] and | #[+a("http://stanfordnlp.github.io/CoreNLP/") CoreNLP]. @@ -107,7 +107,7 @@ p p | In 2016, Google released their - | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet] + | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet] | library, setting a new state of the art for syntactic dependency parsing | accuracy. SyntaxNet's algorithm is very similar to spaCy's. 
The main | difference is that SyntaxNet uses a neural network while spaCy uses a @@ -129,7 +129,7 @@ p +cell=data +row - +cell #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") Parsey McParseface] + +cell #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") Parsey McParseface] each data in [ 94.15, 89.08, 94.77 ] +cell=data From e81a608173e78b10da5984cf0d2632de29f407f1 Mon Sep 17 00:00:00 2001 From: Orion Montoya Date: Thu, 5 Oct 2017 10:47:48 -0400 Subject: [PATCH 142/195] Regression test for lemmatizer exceptions -- demonstrate issue #1387 --- spacy/tests/regression/test_issue1387.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 spacy/tests/regression/test_issue1387.py diff --git a/spacy/tests/regression/test_issue1387.py b/spacy/tests/regression/test_issue1387.py new file mode 100644 index 000000000..c5f01d145 --- /dev/null +++ b/spacy/tests/regression/test_issue1387.py @@ -0,0 +1,22 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ...symbols import POS, VERB, VerbForm_part +from ...vocab import Vocab +from ...lemmatizer import Lemmatizer +from ..util import get_doc + +import pytest + +def test_issue1387(): + tag_map = {'VBG': {POS: VERB, VerbForm_part: True}} + index = {"verb": ("cope","cop")} + exc = {"verb": {"coping": ("cope",)}} + rules = {"verb": [["ing", ""]]} + lemmatizer = Lemmatizer(index, exc, rules) + vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) + doc = get_doc(vocab, ["coping"]) + doc[0].tag_ = 'VBG' + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" + From ffb50d21a043a1028a7a8ac3f354483ec100fce6 Mon Sep 17 00:00:00 2001 From: Orion Montoya Date: Thu, 5 Oct 2017 10:49:02 -0400 Subject: [PATCH 143/195] Lemmatizer honors exceptions: Fix #1387 --- spacy/lemmatizer.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d7541c56b..1112bcee3 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -78,15 +78,16 @@ def lemmatize(string, index, exceptions, rules): # forms.append(string) forms.extend(exceptions.get(string, [])) oov_forms = [] - for old, new in rules: - if string.endswith(old): - form = string[:len(string) - len(old)] + new - if not form: - pass - elif form in index or not form.isalpha(): - forms.append(form) - else: - oov_forms.append(form) + if not forms: + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) if not forms: forms.extend(oov_forms) if not forms: From b0d271809dab5146fdc45cfcfab2e467b8a9347e Mon Sep 17 00:00:00 2001 From: Orion Montoya Date: Thu, 5 Oct 2017 10:49:28 -0400 Subject: [PATCH 144/195] Unit test for lemmatizer exceptions -- copied from regression test for #1387 --- spacy/tests/tagger/test_lemmatizer.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index 5db0d0b2c..91ed7d2f1 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -47,3 +47,20 @@ def test_tagger_lemmatizer_lemma_assignment(EN): assert all(t.lemma_ == '' for t in doc) EN.tagger(doc) assert all(t.lemma_ != '' for t in doc) + + +from ...symbols import POS, VERB, VerbForm_part +from ...vocab import Vocab +from ...lemmatizer import Lemmatizer +from ..util import get_doc +def 
test_tagger_lemmatizer_exceptions(): + index = {"verb": ("cope","cop")} + exc = {"verb": {"coping": ("cope",)}} + rules = {"verb": [["ing", ""]]} + tag_map = {'VBG': {POS: VERB, VerbForm_part: True}} + lemmatizer = Lemmatizer(index, exc, rules) + vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) + doc = get_doc(vocab, ["coping"]) + doc[0].tag_ = 'VBG' + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" From e77d8886f7bad951341060fee328eaa7ab4e927e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Oct 2017 22:22:04 +0200 Subject: [PATCH 145/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 995f6901f..97c53c3d2 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -26,7 +26,9 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Ines Montani, [@ines](https://github.com/ines) * J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading) * Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan) +* Jim Geovedi, [@geovedi](https://github.com/geovedi) * Jim Regan, [@jimregan](https://github.com/jimregan) +* Jeffrey Gerard, [@IamJeffG](https://github.com/IamJeffG) * Jordan Suchow, [@suchow](https://github.com/suchow) * Josh Reeter, [@jreeter](https://github.com/jreeter) * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks) @@ -41,6 +43,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Michael Wallin, [@wallinm1](https://github.com/wallinm1) * Miguel Almeida, [@mamoit](https://github.com/mamoit) * Oleg Zd, [@olegzd](https://github.com/olegzd) +* Paul O'Leary McCann, [@polm](https://github.com/polm) * Pokey Rule, [@pokey](https://github.com/pokey) * RaphaÃĢl Bournhonesque, [@raphael0202](https://github.com/raphael0202) * Rob van Nieuwpoort, [@RvanNieuwpoort](https://github.com/RvanNieuwpoort) @@ -51,11 +54,15 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Swier, [@swierh](https://github.com/swierh) * Thomas Tanon, [@Tpt](https://github.com/Tpt) * Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues) +* Vimos Tan, [@Vimos](https://github.com/Vimos) * Vsevolod Solovyov, [@vsolovyov](https://github.com/vsolovyov) * Wah Loon Keng, [@kengz](https://github.com/kengz) +* Wannaphong Phatthiyaphaibun, [@wannaphongcom](https://github.com/wannaphongcom) * Willem van Hage, [@wrvhage](https://github.com/wrvhage) * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker) +* Yam, [@hscspring](https://github.com/hscspring) * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang) * Yasuaki Uechi, [@uetchy](https://github.com/uetchy) +* Yu-chun Huang, [@galaxyh](https://github.com/galaxyh) * Yubing Dong, [@tomtung](https://github.com/tomtung) * Yuval Pinter, [@yuvalpinter](https://github.com/yuvalpinter) From e04e11070f78ea827ddce40e62ee9ce8c7f38489 Mon Sep 17 00:00:00 2001 From: Orion Montoya Date: Thu, 5 Oct 2017 17:45:45 -0400 Subject: [PATCH 146/195] Contributor agreement for Orion Montoya @mdcclv --- .github/contributors/mdcclv.md | 106 +++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/mdcclv.md diff --git a/.github/contributors/mdcclv.md b/.github/contributors/mdcclv.md new file mode 100644 index 000000000..14ebfae26 --- /dev/null +++ b/.github/contributors/mdcclv.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on 
the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------------------- | +| Name | Orion Montoya | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 04-10-2017 | +| GitHub username | mdcclv | +| Website (optional) | http://www.mdcclv.com/ | From 763b54cbc38120f63c308b4d519c9fb2cb2408ae Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 6 Oct 2017 16:30:44 +0700 Subject: [PATCH 147/195] Update adding-languages.jade Fixed misspellings --- website/docs/usage/adding-languages.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 7d893b4eb..02dfb79ca 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -525,13 +525,13 @@ p | └── oov_prob # optional ├── pos/ # optional | ├── model # via nlp.tagger.model.dump(path) - | └── config.json # via Langage.train + | └── config.json # via Language.train ├── deps/ # optional | ├── model # via nlp.parser.model.dump(path) - | └── config.json # via Langage.train + | └── config.json # via Language.train └── ner/ # optional ├── model # via nlp.entity.model.dump(path) - └── config.json # via Langage.train + └── config.json # via Language.train p | This creates a spaCy data directory with a vocabulary model, ready to be From e89689a31d69180b9ee22603b488a3594a8383dc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 6 Oct 2017 18:02:40 +0200 Subject: [PATCH 148/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 97c53c3d2..9e210bd4c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -43,6 +43,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Michael Wallin, [@wallinm1](https://github.com/wallinm1) * Miguel Almeida, [@mamoit](https://github.com/mamoit) * Oleg Zd, [@olegzd](https://github.com/olegzd) +* Orion Montoya, [@mdcclv](https://github.com/mdcclv) * Paul O'Leary McCann, [@polm](https://github.com/polm) * Pokey Rule, [@pokey](https://github.com/pokey) * RaphaÃĢl Bournhonesque, 
[@raphael0202](https://github.com/raphael0202) From efe0800f91dd35d114cbcdf64845bdafa34de9f5 Mon Sep 17 00:00:00 2001 From: Yam Date: Mon, 9 Oct 2017 21:39:15 -0500 Subject: [PATCH 149/195] Update training.jade fix several changes --- website/docs/usage/training.jade | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index 8a5c111bd..3a15ae2a1 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -33,12 +33,14 @@ p from spacy.vocab import Vocab from spacy.pipeline import EntityRecognizer from spacy.tokens import Doc + from spacy.gold import GoldParse vocab = Vocab() entity = EntityRecognizer(vocab, entity_types=['PERSON', 'LOC']) doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?']) - entity.update(doc, ['O', 'O', 'B-PERSON', 'L-PERSON', 'O']) + gold = GoldParse(doc, entities=['O', 'O', 'B-PERSON', 'L-PERSON', 'O']) + entity.update(doc, gold) entity.model.end_training() @@ -65,13 +67,14 @@ p.o-inline-list from spacy.vocab import Vocab from spacy.pipeline import DependencyParser from spacy.tokens import Doc + from spacy.gold import GoldParse vocab = Vocab() parser = DependencyParser(vocab, labels=['nsubj', 'compound', 'dobj', 'punct']) doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?']) - parser.update(doc, [(1, 'nsubj'), (1, 'ROOT'), (3, 'compound'), (1, 'dobj'), - (1, 'punct')]) + gold = GoldParse(doc, [1,1,3,1,1], ['nsubj', 'ROOT', 'compound', 'dobj', 'punct']) + parser.update(doc, gold) parser.model.end_training() @@ -120,7 +123,7 @@ p +code. from spacy.vocab import Vocab - from spacy.pipeline import Tagger + from spacy.tagger import Tagger from spacy.tagger import P2_orth, P1_orth from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth From 3452d6ce521943fb0bb02f59d3d9e3a1bac218c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Wed, 11 Oct 2017 11:24:00 +0200 Subject: [PATCH 150/195] Resolve issue #1078 by simplifying URL pattern - avoid catastrophic backtracking - reduce character range of host name, domain name and TLD identifier --- spacy/language_data/tokenizer_exceptions.py | 6 +++--- spacy/tests/tokenizer/test_urls.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py index b84adb2c4..9d5187d83 100644 --- a/spacy/language_data/tokenizer_exceptions.py +++ b/spacy/language_data/tokenizer_exceptions.py @@ -32,11 +32,11 @@ _URL_PATTERN = ( r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" r"|" # host name - r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)" + r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)" # domain name - r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*" + r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*" # TLD identifier - r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" + r"(?:\.(?:[a-z]{2,}))" r")" # port number r"(?::\d{2,5})?" 
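To see what this simplification accepts, the rewritten host name / domain name / TLD fragment can be exercised on its own. The following is a minimal sketch, assuming only Python 3 and the standard-library re module; it is not spaCy's full _URL_PATTERN, which as the surrounding context shows also carries an IP-address alternative and port handling. The sample host names are taken from URLs in the accompanying tests.

    import re

    # Just the three rewritten pieces from the hunk above, joined into one pattern.
    host_fragment = re.compile(
        r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)"    # host name
        r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*"  # domain name
        r"(?:\.(?:[a-z]{2,}))"              # TLD identifier
    )

    for host in ["code.google.com", "a.b-c.de", "a.b--c.de"]:
        # All three should print True with the simplified pattern.
        print(host, bool(host_fragment.fullmatch(host)))

The narrower [a-z0-9] classes give up the internationalised-domain coverage of the old \u00a1-\uffff ranges (the corresponding URLs are marked as expected failures in the test changes that follow), in exchange for removing the nested (?:...-*)* quantifiers that allowed catastrophic backtracking.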
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 959067110..3bb6521f1 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -33,13 +33,10 @@ URLS_SHOULD_MATCH = [ "http://userid:password@example.com/", "http://142.42.1.1/", "http://142.42.1.1:8080/", - "http://⌘.ws", - "http://⌘.ws/", "http://foo.com/blah_(wikipedia)#cite-1", "http://foo.com/blah_(wikipedia)_blah#cite-1", "http://foo.com/unicode_(âœĒ)_in_parens", "http://foo.com/(something)?after=parens", - "http://â˜ē.damowmow.com/", "http://code.google.com/events/#&product=browser", "http://j.mp", "ftp://foo.bar/baz", @@ -49,14 +46,17 @@ URLS_SHOULD_MATCH = [ "http://a.b-c.de", "http://223.255.255.254", "http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014 - "http://âœĒdf.ws/123", - "http://➡.ws/䨚", - "http://Ų…ØĢاŲ„.ØĨØŽØĒØ¨Ø§Øą", - "http://䞋子.æĩ‹č¯•", - "http://ā¤‰ā¤Ļā¤žā¤šā¤°ā¤Ŗ.ā¤Ēā¤°āĨ€ā¤•āĨā¤ˇā¤ž", pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"), pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"), + pytest.mark.xfail("http://⌘.ws"), + pytest.mark.xfail("http://⌘.ws/"), + pytest.mark.xfail("http://â˜ē.damowmow.com/"), + pytest.mark.xfail("http://âœĒdf.ws/123"), + pytest.mark.xfail("http://➡.ws/䨚"), + pytest.mark.xfail("http://Ų…ØĢاŲ„.ØĨØŽØĒØ¨Ø§Øą"), + pytest.mark.xfail("http://䞋子.æĩ‹č¯•"), + pytest.mark.xfail("http://ā¤‰ā¤Ļā¤žā¤šā¤°ā¤Ŗ.ā¤Ēā¤°āĨ€ā¤•āĨā¤ˇā¤ž"), ] URLS_SHOULD_NOT_MATCH = [ @@ -83,7 +83,6 @@ URLS_SHOULD_NOT_MATCH = [ "http://foo.bar/foo(bar)baz quux", "ftps://foo.bar/", "http://-error-.invalid/", - "http://-a.b.co", "http://a.b-.co", "http://0.0.0.0", "http://10.1.1.0", @@ -99,6 +98,7 @@ URLS_SHOULD_NOT_MATCH = [ pytest.mark.xfail("foo.com"), pytest.mark.xfail("http://1.1.1.1.1"), pytest.mark.xfail("http://www.foo.bar./"), + pytest.mark.xfail("http://-a.b.co"), ] From 2a78f4d6345084fda788a7f94beff963026b0e83 Mon Sep 17 00:00:00 2001 From: yuukos Date: Thu, 12 Oct 2017 22:23:19 +0700 Subject: [PATCH 151/195] updated .gitignore file added excluding PyCharm's idea directory --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 84ced41f8..ecd8ed39f 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,7 @@ Desktop.ini # Other *.tgz + + +# JetBrains PyCharm +.idea/ \ No newline at end of file From 7b9491679ffa235ce6cc3f8d3f94b00c14d40655 Mon Sep 17 00:00:00 2001 From: yuukos Date: Thu, 12 Oct 2017 22:24:20 +0700 Subject: [PATCH 152/195] added russian language support --- spacy/ru/__init__.py | 56 ++++++++++++++++++++++++++++++++ spacy/ru/language_data.py | 18 ++++++++++ spacy/ru/stop_words.py | 54 ++++++++++++++++++++++++++++++ spacy/ru/tokenizer_exceptions.py | 29 +++++++++++++++++ 4 files changed, 157 insertions(+) create mode 100644 spacy/ru/__init__.py create mode 100644 spacy/ru/language_data.py create mode 100644 spacy/ru/stop_words.py create mode 100644 spacy/ru/tokenizer_exceptions.py diff --git a/spacy/ru/__init__.py b/spacy/ru/__init__.py new file mode 100644 index 000000000..d8f38e199 --- /dev/null +++ b/spacy/ru/__init__.py @@ -0,0 +1,56 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +from ..language import Language +from ..attrs import LANG +from ..tokens import Doc +from .language_data import * + + +class RussianTokenizer(object): + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Russian tokenizer requires the pymorphy2 
library: " + "try to fix it with " + "pip install pymorphy2==0.8") + + _morph = MorphAnalyzer() + + def __init__(self, spacy_tokenizer, cls, nlp=None): + self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp) + self._spacy_tokenizer = spacy_tokenizer + + def __call__(self, text): + words = [self._normalize(RussianTokenizer._get_word(token)) + for token in self._spacy_tokenizer(text)] + + return Doc(self.vocab, words, [False] * len(words)) + + @staticmethod + def _get_word(token): + return token.lemma_ if len(token.lemma_) > 0 else token.text + + @classmethod + def _normalize(cls, word): + return cls._morph.parse(word)[0].normal_form + + +class RussianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ru' + + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS + + @classmethod + def create_tokenizer(cls, nlp=None): + tokenizer = super(RussianDefaults, cls).create_tokenizer(nlp) + return RussianTokenizer(tokenizer, cls, nlp) + + +class Russian(Language): + lang = 'ru' + + Defaults = RussianDefaults diff --git a/spacy/ru/language_data.py b/spacy/ru/language_data.py new file mode 100644 index 000000000..75ca41b65 --- /dev/null +++ b/spacy/ru/language_data.py @@ -0,0 +1,18 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from .. import language_data as base +from ..language_data import update_exc, strings_to_exc + +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS + + +STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) + + +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) + + +__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"] \ No newline at end of file diff --git a/spacy/ru/stop_words.py b/spacy/ru/stop_words.py new file mode 100644 index 000000000..ddb28af86 --- /dev/null +++ b/spacy/ru/stop_words.py @@ -0,0 +1,54 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +Đ° + +ĐąŅƒĐ´ĐĩĐŧ ĐąŅƒĐ´ĐĩŅ‚ ĐąŅƒĐ´ĐĩŅ‚Đĩ ĐąŅƒĐ´ĐĩŅˆŅŒ ĐąŅƒĐ´Ņƒ ĐąŅƒĐ´ŅƒŅ‚ ĐąŅƒĐ´ŅƒŅ‡Đ¸ ĐąŅƒĐ´ŅŒ ĐąŅƒĐ´ŅŒŅ‚Đĩ ĐąŅ‹ ĐąŅ‹Đģ ĐąŅ‹ĐģĐ° ĐąŅ‹Đģи ĐąŅ‹ĐģĐž +ĐąŅ‹Ņ‚ŅŒ + +в ваĐŧ ваĐŧи ваŅ вĐĩŅŅŒ вО вОŅ‚ вŅĐĩ вŅŅ‘ вŅĐĩĐŗĐž вŅĐĩĐš вŅĐĩĐŧ вŅŅ‘Đŧ вŅĐĩĐŧи вŅĐĩĐŧŅƒ вŅĐĩŅ… вŅĐĩŅŽ +вŅĐĩŅ вŅŅŽ вŅŅ вŅ‹ + +Đ´Đ° Đ´ĐģŅ Đ´Đž + +ĐĩĐŗĐž ĐĩдиĐŧ ĐĩĐ´ŅŅ‚ ĐĩĐĩ ĐĩŅ‘ ĐĩĐš ĐĩĐģ ĐĩĐģĐ° ĐĩĐŧ ĐĩĐŧŅƒ ĐĩĐŧŅŠ ĐĩŅĐģи ĐĩŅŅ‚ ĐĩŅŅ‚ŅŒ ĐĩŅˆŅŒ ĐĩŅ‰Đĩ ĐĩŅ‰Ņ‘ ĐĩŅŽ + +ĐļĐĩ + +Са + +и иС иĐģи иĐŧ иĐŧи иĐŧŅŠ иŅ… + +Đē ĐēĐ°Đē ĐēĐĩĐŧ ĐēĐž ĐēĐžĐŗĐ´Đ° ĐēĐžĐŗĐž ĐēĐžĐŧ ĐēĐžĐŧŅƒ ĐēĐžĐŧŅŒŅ ĐēĐžŅ‚ĐžŅ€Đ°Ņ ĐēĐžŅ‚ĐžŅ€ĐžĐŗĐž ĐēĐžŅ‚ĐžŅ€ĐžĐĩ ĐēĐžŅ‚ĐžŅ€ĐžĐš ĐēĐžŅ‚ĐžŅ€ĐžĐŧ +ĐēĐžŅ‚ĐžŅ€ĐžĐŧŅƒ ĐēĐžŅ‚ĐžŅ€ĐžŅŽ ĐēĐžŅ‚ĐžŅ€ŅƒŅŽ ĐēĐžŅ‚ĐžŅ€Ņ‹Đĩ ĐēĐžŅ‚ĐžŅ€Ņ‹Đš ĐēĐžŅ‚ĐžŅ€Ņ‹Đŧ ĐēĐžŅ‚ĐžŅ€Ņ‹Đŧи ĐēĐžŅ‚ĐžŅ€Ņ‹Ņ… ĐēŅ‚Đž + +ĐŧĐĩĐŊŅ ĐŧĐŊĐĩ ĐŧĐŊОК ĐŧĐŊĐžŅŽ ĐŧĐžĐŗ ĐŧĐžĐŗи ĐŧĐžĐŗиŅ‚Đĩ ĐŧĐžĐŗĐģĐ° ĐŧĐžĐŗĐģи ĐŧĐžĐŗĐģĐž ĐŧĐžĐŗŅƒ ĐŧĐžĐŗŅƒŅ‚ ĐŧĐžĐĩ ĐŧĐžŅ‘ ĐŧĐžĐĩĐŗĐž +ĐŧĐžĐĩĐš ĐŧĐžĐĩĐŧ ĐŧĐžŅ‘Đŧ ĐŧĐžĐĩĐŧŅƒ ĐŧĐžĐĩŅŽ ĐŧĐžĐļĐĩĐŧ ĐŧĐžĐļĐĩŅ‚ ĐŧĐžĐļĐĩŅ‚Đĩ ĐŧĐžĐļĐĩŅˆŅŒ ĐŧОи ĐŧОК ĐŧОиĐŧ ĐŧОиĐŧи ĐŧОиŅ… +ĐŧĐžŅ‡ŅŒ ĐŧĐžŅŽ ĐŧĐžŅ ĐŧŅ‹ + +ĐŊĐ° ĐŊĐ°Đŧ ĐŊĐ°Đŧи ĐŊĐ°Ņ ĐŊĐ°ŅĐ° ĐŊĐ°Ņˆ ĐŊĐ°ŅˆĐ° ĐŊĐ°ŅˆĐĩ ĐŊĐ°ŅˆĐĩĐŗĐž ĐŊĐ°ŅˆĐĩĐš ĐŊĐ°ŅˆĐĩĐŧ ĐŊĐ°ŅˆĐĩĐŧŅƒ ĐŊĐ°ŅˆĐĩŅŽ ĐŊĐ°ŅˆĐ¸ ĐŊĐ°ŅˆĐ¸Đŧ +ĐŊĐ°ŅˆĐ¸Đŧи ĐŊĐ°ŅˆĐ¸Ņ… ĐŊĐ°ŅˆŅƒ ĐŊĐĩ ĐŊĐĩĐŗĐž ĐŊĐĩĐĩ ĐŊĐĩŅ‘ ĐŊĐĩĐš ĐŊĐĩĐŧ ĐŊŅ‘Đŧ ĐŊĐĩĐŧŅƒ ĐŊĐĩŅ‚ ĐŊĐĩŅŽ ĐŊиĐŧ ĐŊиĐŧи ĐŊиŅ… ĐŊĐž + +Đž Ой ОдиĐŊ ОдĐŊĐ° ОдĐŊи ОдĐŊиĐŧ ОдĐŊиĐŧи ОдĐŊиŅ… ОдĐŊĐž ОдĐŊĐžĐŗĐž ОдĐŊОК ОдĐŊĐžĐŧ ОдĐŊĐžĐŧŅƒ ОдĐŊĐžŅŽ +ОдĐŊŅƒ ĐžĐŊ ĐžĐŊĐ° ĐžĐŊĐĩ ĐžĐŊи ĐžĐŊĐž ĐžŅ‚ + +ĐŋĐž ĐŋŅ€Đ¸ + +Ņ ŅĐ°Đŧ 
ŅĐ°ĐŧĐ° ŅĐ°Đŧи ŅĐ°ĐŧиĐŧ ŅĐ°ĐŧиĐŧи ŅĐ°ĐŧиŅ… ŅĐ°ĐŧĐž ŅĐ°ĐŧĐžĐŗĐž ŅĐ°ĐŧĐžĐŧ ŅĐ°ĐŧĐžĐŧŅƒ ŅĐ°ĐŧŅƒ ŅĐ˛ĐžĐĩ ŅĐ˛ĐžŅ‘ +ŅĐ˛ĐžĐĩĐŗĐž ŅĐ˛ĐžĐĩĐš ŅĐ˛ĐžĐĩĐŧ ŅĐ˛ĐžŅ‘Đŧ ŅĐ˛ĐžĐĩĐŧŅƒ ŅĐ˛ĐžĐĩŅŽ ŅĐ˛ĐžĐ¸ ŅĐ˛ĐžĐš ŅĐ˛ĐžĐ¸Đŧ ŅĐ˛ĐžĐ¸Đŧи ŅĐ˛ĐžĐ¸Ņ… ŅĐ˛ĐžŅŽ ŅĐ˛ĐžŅ +ŅĐĩĐąĐĩ ŅĐĩĐąŅ ŅĐžĐąĐžĐš ŅĐžĐąĐžŅŽ + +Ņ‚Đ° Ņ‚Đ°Đē Ņ‚Đ°ĐēĐ°Ņ Ņ‚Đ°ĐēиĐĩ Ņ‚Đ°ĐēиĐŧ Ņ‚Đ°ĐēиĐŧи Ņ‚Đ°ĐēиŅ… Ņ‚Đ°ĐēĐžĐŗĐž Ņ‚Đ°ĐēĐžĐĩ Ņ‚Đ°ĐēОК Ņ‚Đ°ĐēĐžĐŧ Ņ‚Đ°ĐēĐžĐŧŅƒ Ņ‚Đ°ĐēĐžŅŽ +Ņ‚Đ°ĐēŅƒŅŽ Ņ‚Đĩ Ņ‚ĐĩĐąĐĩ Ņ‚ĐĩĐąŅ Ņ‚ĐĩĐŧ Ņ‚ĐĩĐŧи Ņ‚ĐĩŅ… Ņ‚Đž Ņ‚ОйОК Ņ‚ОйОŅŽ Ņ‚ĐžĐŗĐž Ņ‚ОК Ņ‚ĐžĐģŅŒĐēĐž Ņ‚ĐžĐŧ Ņ‚ĐžĐŧĐ°Ņ… Ņ‚ĐžĐŧŅƒ +Ņ‚ĐžŅ‚ Ņ‚ĐžŅŽ Ņ‚Ņƒ Ņ‚Ņ‹ + +Ņƒ ŅƒĐļĐĩ + +Ņ‡ĐĩĐŗĐž Ņ‡ĐĩĐŧ Ņ‡Ņ‘Đŧ Ņ‡ĐĩĐŧŅƒ Ņ‡Ņ‚Đž Ņ‡Ņ‚ОйŅ‹ + +ŅŅ‚Đ° ŅŅ‚и ŅŅ‚иĐŧ ŅŅ‚иĐŧи ŅŅ‚иŅ… ŅŅ‚Đž ŅŅ‚ĐžĐŗĐž ŅŅ‚ОК ŅŅ‚ĐžĐŧ ŅŅ‚ĐžĐŧŅƒ ŅŅ‚ĐžŅ‚ ŅŅ‚ĐžŅŽ ŅŅ‚Ņƒ + +Ņ +""".split()) \ No newline at end of file diff --git a/spacy/ru/tokenizer_exceptions.py b/spacy/ru/tokenizer_exceptions.py new file mode 100644 index 000000000..8df57a402 --- /dev/null +++ b/spacy/ru/tokenizer_exceptions.py @@ -0,0 +1,29 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * + + +TOKENIZER_EXCEPTIONS = { + "ПĐŊ.": [ + {ORTH: "ПĐŊ.", LEMMA: "ПоĐŊĐĩĐ´ĐĩĐģŅŒĐŊиĐē"} + ], + "ВŅ‚.": [ + {ORTH: "ВŅ‚.", LEMMA: "ВŅ‚ĐžŅ€ĐŊиĐē"} + ], + "ĐĄŅ€.": [ + {ORTH: "ĐĄŅ€.", LEMMA: "ĐĄŅ€ĐĩĐ´Đ°"} + ], + "ЧŅ‚.": [ + {ORTH: "ЧŅ‚.", LEMMA: "ЧĐĩŅ‚вĐĩŅ€Đŗ"} + ], + "ПŅ‚.": [ + {ORTH: "ПŅ‚.", LEMMA: "ПŅŅ‚ĐŊиŅ†Đ°"} + ], + "ĐĄĐą.": [ + {ORTH: "ĐĄĐą.", LEMMA: "ĐĄŅƒĐąĐąĐžŅ‚Đ°"} + ], + "ВŅ.": [ + {ORTH: "ВŅ.", LEMMA: "ВоŅĐēŅ€ĐĩŅĐĩĐŊŅŒĐĩ"} + ], +} \ No newline at end of file From f81dd284eb2e8c09c55a4fc37abb3e00e278f0a8 Mon Sep 17 00:00:00 2001 From: yuukos Date: Thu, 12 Oct 2017 22:28:34 +0700 Subject: [PATCH 153/195] updated spacy/__init__.py registered russian language via set_lang_class --- spacy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index f0d5ea0fc..1e5faf504 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -7,13 +7,13 @@ from .cli.info import info from .glossary import explain from .about import __version__ -from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja,th +from . 
import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja,th, ru _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese, - th.Thai) + th.Thai, ru.Russian) for _lang in _languages: From 622b6d627078f5a5bc14ebb2840a64ec3db5d118 Mon Sep 17 00:00:00 2001 From: yuukos Date: Fri, 13 Oct 2017 13:57:29 +0700 Subject: [PATCH 154/195] updated Russian tokenizer moved the trying to import pymorph into __init__ --- spacy/ru/__init__.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/spacy/ru/__init__.py b/spacy/ru/__init__.py index d8f38e199..12b480a8a 100644 --- a/spacy/ru/__init__.py +++ b/spacy/ru/__init__.py @@ -8,17 +8,19 @@ from .language_data import * class RussianTokenizer(object): - try: - from pymorphy2 import MorphAnalyzer - except ImportError: - raise ImportError( - "The Russian tokenizer requires the pymorphy2 library: " - "try to fix it with " - "pip install pymorphy2==0.8") - - _morph = MorphAnalyzer() + _morph = None def __init__(self, spacy_tokenizer, cls, nlp=None): + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Russian tokenizer requires the pymorphy2 library: " + "try to fix it with " + "pip install pymorphy2==0.8") + + RussianTokenizer._morph = RussianTokenizer._create_morph(MorphAnalyzer) + self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp) self._spacy_tokenizer = spacy_tokenizer @@ -36,6 +38,12 @@ class RussianTokenizer(object): def _normalize(cls, word): return cls._morph.parse(word)[0].normal_form + @classmethod + def _create_morph(cls, morph_analyzer_class): + if not cls._morph: + cls._morph = morph_analyzer_class() + return cls._morph + class RussianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) From a229b6e0ded3b1255fd77e00c197fa35c9030e5b Mon Sep 17 00:00:00 2001 From: yuukos Date: Fri, 13 Oct 2017 14:04:37 +0700 Subject: [PATCH 155/195] added tests for Russian language added tests of creating Russian Language instance and Russian tokenizer --- spacy/tests/conftest.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 90b947702..718a8265c 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -16,7 +16,7 @@ from ..bn import Bengali from ..he import Hebrew from ..nb import Norwegian from ..th import Thai - +from ..ru import Russian from ..tokens import Doc from ..strings import StringStore @@ -30,7 +30,7 @@ import pytest # These languages get run through generic tokenizer tests LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, - Swedish, Hungarian, Finnish, Bengali, Norwegian] + Swedish, Hungarian, Finnish, Bengali, Norwegian, Russian] @pytest.fixture(params=LANGUAGES) @@ -53,6 +53,7 @@ def en_vocab(): def en_parser(): return English.Defaults.create_parser() + @pytest.fixture def es_tokenizer(): return Spanish.Defaults.create_tokenizer() @@ -83,11 +84,13 @@ def ja_tokenizer(): pytest.importorskip("MeCab") return Japanese.Defaults.create_tokenizer() + @pytest.fixture def japanese(): pytest.importorskip("MeCab") return Japanese() + @pytest.fixture def sv_tokenizer(): return Swedish.Defaults.create_tokenizer() @@ -102,15 +105,30 @@ def bn_tokenizer(): def he_tokenizer(): return Hebrew.Defaults.create_tokenizer() + @pytest.fixture def nb_tokenizer(): 
return Norwegian.Defaults.create_tokenizer() + @pytest.fixture def th_tokenizer(): pythainlp = pytest.importorskip("pythainlp") return Thai.Defaults.create_tokenizer() + +@pytest.fixture +def ru_tokenizer(): + pytest.importorskip("pymorphy2") + return Russian.Defaults.create_tokenizer() + + +@pytest.fixture +def russian(): + pytest.importorskip("pymorphy2") + return Russian() + + @pytest.fixture def stringstore(): return StringStore() @@ -118,7 +136,7 @@ def stringstore(): @pytest.fixture def en_entityrecognizer(): - return English.Defaults.create_entity() + return English.Defaults.create_entity() @pytest.fixture @@ -130,6 +148,7 @@ def lemmatizer(): def text_file(): return StringIO() + @pytest.fixture def text_file_b(): return BytesIO() @@ -149,11 +168,11 @@ def DE(): def pytest_addoption(parser): parser.addoption("--models", action="store_true", - help="include tests that require full models") + help="include tests that require full models") parser.addoption("--vectors", action="store_true", - help="include word vectors tests") + help="include word vectors tests") parser.addoption("--slow", action="store_true", - help="include slow tests") + help="include slow tests") def pytest_runtest_setup(item): From 6fb9d75bd2a9ed049300b4237bec23d7a09e6845 Mon Sep 17 00:00:00 2001 From: yuukos Date: Fri, 13 Oct 2017 15:51:03 +0700 Subject: [PATCH 156/195] fixed test with creating tokenizer --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 718a8265c..de0facf49 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -30,7 +30,7 @@ import pytest # These languages get run through generic tokenizer tests LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, - Swedish, Hungarian, Finnish, Bengali, Norwegian, Russian] + Swedish, Hungarian, Finnish, Bengali, Norwegian] @pytest.fixture(params=LANGUAGES) From ce00405afc176bd02363a7d703c3e61ef52fb851 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 13 Oct 2017 21:00:15 +0700 Subject: [PATCH 157/195] Create yuukos.md --- .github/contributors/yuukos.md | 106 +++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/yuukos.md diff --git a/.github/contributors/yuukos.md b/.github/contributors/yuukos.md new file mode 100644 index 000000000..aecafeecb --- /dev/null +++ b/.github/contributors/yuukos.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. 
+ +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Alexey Kim | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 13-12-2017 | +| GitHub username | yuukos | +| Website (optional) | | From 95836abee1c311bb95d291d0357f29b9f4e98e1c Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 13 Oct 2017 21:02:19 +0700 Subject: [PATCH 158/195] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 9e210bd4c..edd1ed30d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -3,6 +3,7 @@ This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work! * Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer) +* Alexey Kim, [@yuukos](https://github.com/yuukos) * Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman) * Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv) * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th) From a31d33be06b3a2c933bb1b0d4859778616065cb8 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sat, 14 Oct 2017 19:28:04 +0900 Subject: [PATCH 159/195] Contributor agreement --- .github/contributors/polm.md | 106 +++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/polm.md diff --git a/.github/contributors/polm.md b/.github/contributors/polm.md new file mode 100644 index 000000000..a2aa0cb65 --- /dev/null +++ b/.github/contributors/polm.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Paul McCann | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-14 | +| GitHub username | polm | +| Website (optional) | http://dampfkraft.com| From 43eedf73f2aaf506e158115dfb328fb60bd91943 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 15 Oct 2017 23:33:25 +0900 Subject: [PATCH 160/195] [ja] Stash tokenizer output for speed Before this commit, the Mecab tokenizer had to be called twice when creating a Doc- once during tokenization and once during tagging. This creates a JapaneseDoc wrapper class for Doc that stashes the parsed tokenizer output to remove redundant processing. -POLM --- spacy/ja/__init__.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 2f85406c0..b2ec281f7 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -16,6 +16,13 @@ from collections import namedtuple ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) +class JapaneseDoc(Doc): + def __init__(self, detailed_tokens, vocab, words=None, spaces=None, orths_and_spaces=None): + super(JapaneseDoc, self).__init__(vocab, words, spaces, orths_and_spaces) + # This saves tokenizer output so mecab doesn't have to be called again + # when determining POS tags. + self.detailed_tokens = detailed_tokens + def try_mecab_import(): """Mecab is required for Japanese support, so check for it. @@ -34,8 +41,9 @@ class JapaneseTokenizer(object): self.tokenizer = MeCab.Tagger() def __call__(self, text): - words = [x.surface for x in detailed_tokens(self.tokenizer, text)] - return Doc(self.vocab, words=words, spaces=[False]*len(words)) + dtokens = detailed_tokens(self.tokenizer, text) + words = [x.surface for x in dtokens] + return JapaneseDoc(dtokens, self.vocab, words=words, spaces=[False]*len(words)) def resolve_pos(token): """If necessary, add a field to the POS tag for UD mapping. @@ -91,7 +99,7 @@ class JapaneseTagger(object): # 1. get raw JP tags # 2. add features to tags as necessary for UD - dtokens = detailed_tokens(self.tokenizer, tokens.text) + dtokens = tokens.detailed_tokens rawtags = list(map(resolve_pos, dtokens)) self.tagger.tag_from_strings(tokens, rawtags) @@ -112,8 +120,7 @@ class Japanese(Language): Defaults = JapaneseDefaults def make_doc(self, text): - words = [str(t) for t in self.tokenizer(text)] - doc = Doc(self.vocab, words=words, spaces=[False]*len(words)) + jdoc = self.tokenizer(text) tagger = JapaneseDefaults.create_tagger(self.tokenizer) - tagger(doc) - return doc + tagger(jdoc) + return jdoc From 71ae8013ec5e981c9b44699afd82162c6f6c625b Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 16 Oct 2017 00:24:34 +0900 Subject: [PATCH 161/195] [ja] Use user_details instead of a wrapper class Instead of using a JapaneseDoc wrapper class to store Mecab output, stash it in `user_data`. 
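A minimal sketch of the hand-off this enables (illustration only, not code from this patch): the tokenizer-side step parks its detailed analyses on the Doc, and the tagger-side step later reads them back instead of re-running MeCab. The key name mirrors DETAILS_KEY in spacy/ja/__init__.py; the empty Vocab and the sample words are assumptions made only to keep the snippet runnable, and MeCab itself is not required to run it.

    # coding: utf8
    from __future__ import unicode_literals
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    DETAILS_KEY = 'mecab_details'  # the key both sides agree on

    words = ['日本語', 'です']
    doc = Doc(Vocab(), words=words, spaces=[False] * len(words))

    # Tokenizer side: stash the per-token analyses on the Doc it returns.
    doc.user_data[DETAILS_KEY] = [{'surface': w} for w in words]

    # Tagger side: read them back later without calling the tokenizer again.
    print([t['surface'] for t in doc.user_data[DETAILS_KEY]])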
-POLM --- spacy/ja/__init__.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index b2ec281f7..26e39a593 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -16,12 +16,7 @@ from collections import namedtuple ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) -class JapaneseDoc(Doc): - def __init__(self, detailed_tokens, vocab, words=None, spaces=None, orths_and_spaces=None): - super(JapaneseDoc, self).__init__(vocab, words, spaces, orths_and_spaces) - # This saves tokenizer output so mecab doesn't have to be called again - # when determining POS tags. - self.detailed_tokens = detailed_tokens +DETAILS_KEY = 'mecab_details' def try_mecab_import(): """Mecab is required for Japanese support, so check for it. @@ -43,7 +38,10 @@ class JapaneseTokenizer(object): def __call__(self, text): dtokens = detailed_tokens(self.tokenizer, text) words = [x.surface for x in dtokens] - return JapaneseDoc(dtokens, self.vocab, words=words, spaces=[False]*len(words)) + doc = Doc(self.vocab, words=words, spaces=[False]*len(words)) + # stash details tokens for tagger to use + doc.user_data[DETAILS_KEY] = dtokens + return doc def resolve_pos(token): """If necessary, add a field to the POS tag for UD mapping. @@ -99,7 +97,7 @@ class JapaneseTagger(object): # 1. get raw JP tags # 2. add features to tags as necessary for UD - dtokens = tokens.detailed_tokens + dtokens = tokens.user_data[DETAILS_KEY] rawtags = list(map(resolve_pos, dtokens)) self.tagger.tag_from_strings(tokens, rawtags) From 241d19a3e6f78918bc8296d574a1e65e4ce9381f Mon Sep 17 00:00:00 2001 From: yuukos Date: Mon, 16 Oct 2017 13:37:05 +0700 Subject: [PATCH 162/195] fixed Russian Tokenizer - added trailing space flags for tokens --- spacy/ru/__init__.py | 20 +++++++++++++++++--- spacy/ru/language_data.py | 2 +- spacy/ru/stop_words.py | 2 +- spacy/ru/tokenizer_exceptions.py | 3 ++- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/spacy/ru/__init__.py b/spacy/ru/__init__.py index 12b480a8a..8789cd6e5 100644 --- a/spacy/ru/__init__.py +++ b/spacy/ru/__init__.py @@ -25,15 +25,29 @@ class RussianTokenizer(object): self._spacy_tokenizer = spacy_tokenizer def __call__(self, text): - words = [self._normalize(RussianTokenizer._get_word(token)) - for token in self._spacy_tokenizer(text)] + get_norm = RussianTokenizer._get_norm + has_space = RussianTokenizer._has_space - return Doc(self.vocab, words, [False] * len(words)) + words_with_space_flags = [(get_norm(token), has_space(token, text)) + for token in self._spacy_tokenizer(text)] + + words, spaces = map(lambda s: list(s), zip(*words_with_space_flags)) + + return Doc(self.vocab, words, spaces) @staticmethod def _get_word(token): return token.lemma_ if len(token.lemma_) > 0 else token.text + @staticmethod + def _has_space(token, text): + pos_after_token = token.idx + len(token.text) + return pos_after_token < len(text) and text[pos_after_token] == ' ' + + @classmethod + def _get_norm(cls, token): + return cls._normalize(cls._get_word(token)) + @classmethod def _normalize(cls, word): return cls._morph.parse(word)[0].normal_form diff --git a/spacy/ru/language_data.py b/spacy/ru/language_data.py index 75ca41b65..d33d388fd 100644 --- a/spacy/ru/language_data.py +++ b/spacy/ru/language_data.py @@ -15,4 +15,4 @@ TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) -__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"] \ No 
newline at end of file +__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"] diff --git a/spacy/ru/stop_words.py b/spacy/ru/stop_words.py index ddb28af86..2d89b7726 100644 --- a/spacy/ru/stop_words.py +++ b/spacy/ru/stop_words.py @@ -51,4 +51,4 @@ STOP_WORDS = set(""" ŅŅ‚Đ° ŅŅ‚и ŅŅ‚иĐŧ ŅŅ‚иĐŧи ŅŅ‚иŅ… ŅŅ‚Đž ŅŅ‚ĐžĐŗĐž ŅŅ‚ОК ŅŅ‚ĐžĐŧ ŅŅ‚ĐžĐŧŅƒ ŅŅ‚ĐžŅ‚ ŅŅ‚ĐžŅŽ ŅŅ‚Ņƒ Ņ -""".split()) \ No newline at end of file +""".split()) diff --git a/spacy/ru/tokenizer_exceptions.py b/spacy/ru/tokenizer_exceptions.py index 8df57a402..f444f3df6 100644 --- a/spacy/ru/tokenizer_exceptions.py +++ b/spacy/ru/tokenizer_exceptions.py @@ -26,4 +26,5 @@ TOKENIZER_EXCEPTIONS = { "ВŅ.": [ {ORTH: "ВŅ.", LEMMA: "ВоŅĐēŅ€ĐĩŅĐĩĐŊŅŒĐĩ"} ], -} \ No newline at end of file +} + From b47b4e2654f69498b68c06a0b6464db4e924d268 Mon Sep 17 00:00:00 2001 From: Ramanan Balakrishnan Date: Wed, 18 Oct 2017 14:43:47 +0530 Subject: [PATCH 163/195] Support single value for attribute list in doc.to_scalar conversion --- spacy/tokens/doc.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index aca35a73f..9a644b86d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -496,13 +496,19 @@ cdef class Doc: cdef int i, j cdef attr_id_t feature cdef np.ndarray[attr_t, ndim=2] output + cdef np.ndarray[attr_t, ndim=1] output_1D # Make an array from the attributes --- otherwise our inner loop is Python # dict iteration. + if( type(py_attr_ids) is not list ): + py_attr_ids = [ py_attr_ids ] cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32) output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) for i in range(self.length): for j, feature in enumerate(attr_ids): output[i, j] = get_token_attr(&self.c[i], feature) + if( len(attr_ids) == 1 ): + output_1D = output.reshape((self.length)) + return output_1D return output def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): From 8bd9b05fdc212e55b7714bb20594d8bb51657ba9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:13:36 +0200 Subject: [PATCH 164/195] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8a9ab517b..7cc47296c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -70,7 +70,7 @@ The [spaCy developer resources](https://github.com/explosion/spacy-dev-resources ### Contributor agreement -If you've made a substantial contribution to spaCy, you should fill in the [spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that your contribution can be used across the project. If you agree to be bound by the terms of the agreement, fill in the [template]((.github/CONTRIBUTOR_AGREEMENT.md)) and include it with your pull request, or sumit it separately to [`.github/contributors/`](/.github/contributors). The name of the file should be your GitHub username, with the extension `.md`. For example, the user +If you've made a substantial contribution to spaCy, you should fill in the [spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that your contribution can be used across the project. If you agree to be bound by the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md) and include it with your pull request, or sumit it separately to [`.github/contributors/`](/.github/contributors). The name of the file should be your GitHub username, with the extension `.md`. 
For example, the user example_user would create the file `.github/contributors/example_user.md`. From 5a4b5b362c27f1948187915e2349c35db8a5d64c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:29:10 +0200 Subject: [PATCH 165/195] Create shuvanon.md --- .github/contributors/shuvanon.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/shuvanon.md diff --git a/.github/contributors/shuvanon.md b/.github/contributors/shuvanon.md new file mode 100644 index 000000000..c915d48bf --- /dev/null +++ b/.github/contributors/shuvanon.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Shuvanon Razik | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 3/12/2017 | +| GitHub username | shuvanon | +| Website (optional) | | From e787045cf55db5b68d878a291793b6e3786d6633 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 18 Oct 2017 14:31:57 +0200 Subject: [PATCH 166/195] Revert "filled up CONTRIBUTOR_AGREEMENT.md" This reverts commit 8a2d22222dec5cf910df5a378cbcd9ea2ab53ec4. --- .github/CONTRIBUTOR_AGREEMENT.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index c915d48bf..668b9dba2 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [x] I am signing on behalf of myself as an individual and no other person + * [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect my contributions. 
@@ -98,9 +98,9 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | -| Name | Shuvanon Razik | +| Name | | | Company name (if applicable) | | | Title or role (if applicable) | | -| Date | 3/12/2017 | -| GitHub username | shuvanon | +| Date | | +| GitHub username | | | Website (optional) | | From 9162ecb43ff2883f271da2a7d5cab17615288ac3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:36:19 +0200 Subject: [PATCH 167/195] Update CONTRIBUTOR_AGREEMENT.md --- .github/CONTRIBUTOR_AGREEMENT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index 668b9dba2..f34603065 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -88,7 +88,7 @@ U.S. Federal law. Any choice of law rules will not apply. mark both statements: * [ ] I am signing on behalf of myself as an individual and no other person - or entity, including my employer, has or will have rights with respect my + or entity, including my employer, has or will have rights with respect to my contributions. * [ ] I am signing on behalf of my employer or a legal entity and I have the From 0b239ee6461a77f41d50aab64040b4f97f5949a5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:37:08 +0200 Subject: [PATCH 168/195] Create ines.md --- .github/contributors/ines.md | 106 +++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/ines.md diff --git a/.github/contributors/ines.md b/.github/contributors/ines.md new file mode 100644 index 000000000..5cd57b07e --- /dev/null +++ b/.github/contributors/ines.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ines Montani | +| Company name (if applicable) | Explosion AI | +| Title or role (if applicable) | Founder | +| Date | 2017/10/18 | +| GitHub username | ines | +| Website (optional) | https://explosion.ai | From 3357588b9fb6156cfcd48e3b9e556e413b5b9e27 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:41:31 +0200 Subject: [PATCH 169/195] Create honnibal.md --- .github/contributors/honnibal.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/honnibal.md diff --git a/.github/contributors/honnibal.md b/.github/contributors/honnibal.md new file mode 100644 index 000000000..3a700b7dd --- /dev/null +++ b/.github/contributors/honnibal.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Matthew Honnibal | +| Company name (if applicable) | Explosion AI | +| Title or role (if applicable) | Founder | +| Date | 2017-10-18 | +| GitHub username | honnibal | +| Website (optional) | https://explosion.ai | From e7b78370d99a59a80119ae1641b97ebbbb60088b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Oct 2017 14:41:38 +0200 Subject: [PATCH 170/195] Add note on origin of manually moved agreement See 8a2d22222dec5cf910df5a378cbcd9ea2ab53ec4 --- .github/contributors/shuvanon.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/contributors/shuvanon.md b/.github/contributors/shuvanon.md index c915d48bf..82d02d8d2 100644 --- a/.github/contributors/shuvanon.md +++ b/.github/contributors/shuvanon.md @@ -1,3 +1,5 @@ + + # spaCy contributor agreement This spaCy Contributor Agreement (**"SCA"**) is based on the From f39fc34c95746d0f2ec8ad8105e76bdecc8aed33 Mon Sep 17 00:00:00 2001 From: demfier Date: Wed, 18 Oct 2017 22:32:58 +0530 Subject: [PATCH 171/195] Add minor update in README --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 0f3efc146..0fd807388 100644 --- a/README.rst +++ b/README.rst @@ -16,7 +16,7 @@ MIT license. .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square :target: https://travis-ci.org/explosion/spaCy :alt: Travis Build Status - + .. image:: https://img.shields.io/appveyor/ci/explosion/spacy/master.svg?style=flat-square :target: https://ci.appveyor.com/project/explosion/spacy :alt: Appveyor Build Status @@ -100,7 +100,7 @@ Top Performance * Fastest in the world: <50ms per document. No faster system has ever been announced. -* Accuracy within 1% of the current state of the art on all tasks performed +* Accuracy within 1% of the current state-of-the-art on all tasks performed (parsing, named entity recognition, part-of-speech tagging). The only more accurate systems are an order of magnitude slower or more. @@ -254,7 +254,7 @@ details. pip install -r requirements.txt pip install -e . -Compared to regular install via pip `requirements.txt `_ +Compared to a regular install via pip, `requirements.txt `_ additionally installs developer dependencies such as Cython. Instead of the above verbose commands, you can also use the following From 772c8035f779ad2043c6ef3a3c2db8bebbebd9cd Mon Sep 17 00:00:00 2001 From: demfier Date: Wed, 18 Oct 2017 23:12:24 +0530 Subject: [PATCH 172/195] Sign SCA --- .github/contributors/demfier.md | 106 ++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/demfier.md diff --git a/.github/contributors/demfier.md b/.github/contributors/demfier.md new file mode 100644 index 000000000..1a730fc78 --- /dev/null +++ b/.github/contributors/demfier.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Gaurav Sahu | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-18 | +| GitHub username | demfier | +| Website (optional) | | From 44c61fde25af968b69b1f171e5681caceb16baed Mon Sep 17 00:00:00 2001 From: John Haley Date: Thu, 19 Oct 2017 08:56:28 -0700 Subject: [PATCH 173/195] Fix Keras install in keras_parikeh_entailment The master branch of Keras doesn't work with this example anymore so this pins Keras to version 1.2.2 for this example. --- examples/keras_parikh_entailment/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/keras_parikh_entailment/README.md b/examples/keras_parikh_entailment/README.md index adc80ce89..25324c095 100644 --- a/examples/keras_parikh_entailment/README.md +++ b/examples/keras_parikh_entailment/README.md @@ -47,13 +47,13 @@ First, install [Keras](https://keras.io/), [spaCy](https://spacy.io) and the spa English models (about 1GB of data): ```bash -pip install https://github.com/fchollet/keras/archive/master.zip +pip install https://github.com/fchollet/keras/archive/1.2.2.zip pip install spacy python -m spacy.en.download ``` ⚠ī¸ **Important:** In order for the example to run, you'll need to install Keras from -the master branch (and not via `pip install keras`). For more info on this, see +the 1.2.2 release (and not via `pip install keras`). For more info on this, see [#727](https://github.com/explosion/spaCy/issues/727). You'll also want to get Keras working on your GPU. This will depend on your From 989814c4b6690c29e1d6a0f1ffa79a8579960b9c Mon Sep 17 00:00:00 2001 From: John Haley Date: Thu, 19 Oct 2017 09:11:16 -0700 Subject: [PATCH 174/195] Create johnhaley81.md --- .github/contributors/johnhaley81.md | 106 ++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/johnhaley81.md diff --git a/.github/contributors/johnhaley81.md b/.github/contributors/johnhaley81.md new file mode 100644 index 000000000..277b3126c --- /dev/null +++ b/.github/contributors/johnhaley81.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | John Haley | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 19/10/2017 | +| GitHub username | johnhaley81 | +| Website (optional) | | From 5941aa96a12771ec3ca500c4df68b7cea0c25af1 Mon Sep 17 00:00:00 2001 From: Ramanan Balakrishnan Date: Wed, 18 Oct 2017 15:52:17 +0530 Subject: [PATCH 175/195] Support strings for attribute list in doc.to_array --- .github/contributors/ramananbalakrishnan.md | 106 ++++++++++++++++++++ spacy/tests/doc/test_array.py | 20 ++++ spacy/tokens/doc.pyx | 29 ++++-- 3 files changed, 146 insertions(+), 9 deletions(-) create mode 100644 .github/contributors/ramananbalakrishnan.md diff --git a/.github/contributors/ramananbalakrishnan.md b/.github/contributors/ramananbalakrishnan.md new file mode 100644 index 000000000..37492fb3d --- /dev/null +++ b/.github/contributors/ramananbalakrishnan.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ramanan Balakrishnan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-18 | +| GitHub username | ramananbalakrishnan | +| Website (optional) | | diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index dd87aa763..ff10394d1 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -17,6 +17,26 @@ def test_doc_array_attr_of_token(en_tokenizer, en_vocab): assert feats_array[0][0] != feats_array[0][1] +def test_doc_stringy_array_attr_of_token(en_tokenizer, en_vocab): + text = "An example sentence" + tokens = en_tokenizer(text) + example = tokens.vocab["example"] + assert example.orth != example.shape + feats_array = tokens.to_array((ORTH, SHAPE)) + feats_array_stringy = tokens.to_array(("ORTH", "SHAPE")) + assert feats_array_stringy[0][0] == feats_array[0][0] + assert feats_array_stringy[0][1] == feats_array[0][1] + + +def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab): + text = "An example sentence" + tokens = en_tokenizer(text) + example = tokens.vocab["example"] + assert example.orth != example.shape + feats_array = tokens.to_array(ORTH) + assert feats_array.shape == (3,) + + def test_doc_array_tag(en_tokenizer): text = "A nice sentence." pos = ['DET', 'ADJ', 'NOUN', 'PUNCT'] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 9a644b86d..4f3b06946 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -16,6 +16,7 @@ from .token cimport Token from ..lexeme cimport Lexeme from ..lexeme cimport EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t +from ..attrs import IDS from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE @@ -474,10 +475,13 @@ cdef class Doc: @cython.boundscheck(False) cpdef np.ndarray to_array(self, object py_attr_ids): - """ - Given a list of M attribute IDs, export the tokens to a numpy - `ndarray` of shape (N, M), where `N` is the length - of the document. The values will be 32-bit integers. + """Export given token attributes to a numpy `ndarray`. + + If `attr_ids` is a sequence of M attributes, the output array will + be of shape `(N, M)`, where N is the length of the `Doc` + (in tokens). If `attr_ids` is a single attribute, the output shape will + be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) + or string name (e.g. 'LEMMA' or 'lemma'). Example: from spacy import attrs @@ -486,22 +490,29 @@ cdef class Doc: np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA]) Arguments: - attr_ids (list[int]): A list of attribute ID ints. + attr_ids (list[]): A list of attributes (int IDs or string names). Returns: feat_array (numpy.ndarray[long, ndim=2]): A feature matrix, with one row per word, and one column per attribute - indicated in the input attr_ids. + indicated in the input `attr_ids`. 
""" cdef int i, j cdef attr_id_t feature cdef np.ndarray[attr_t, ndim=2] output cdef np.ndarray[attr_t, ndim=1] output_1D + # Handle scalar/list inputs of strings/ints for py_attr_ids + if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ): + py_attr_ids = [ py_attr_ids ] + py_attr_ids_input = [] + for py_attr_id in py_attr_ids: + if( type(py_attr_id) is int ): + py_attr_ids_input.append(py_attr_id) + else: + py_attr_ids_input.append(IDS[py_attr_id.upper()]) # Make an array from the attributes --- otherwise our inner loop is Python # dict iteration. - if( type(py_attr_ids) is not list ): - py_attr_ids = [ py_attr_ids ] - cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32) + cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids_input, dtype=numpy.int32) output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) for i in range(self.length): for j, feature in enumerate(attr_ids): From fbccc8c87d5456bb6b84730cbdd69abcccc64142 Mon Sep 17 00:00:00 2001 From: Ramanan Balakrishnan Date: Fri, 20 Oct 2017 14:23:48 +0530 Subject: [PATCH 176/195] Update documentation on doc.to_array --- website/docs/api/doc.jade | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 1c2911f52..59752b2a0 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -176,9 +176,14 @@ p +tag method p - | Export the document annotations to a numpy array of shape #[code N*M] - | where #[code N] is the length of the document and #[code M] is the number - | of attribute IDs to export. The values will be 32-bit integers. + | Export given token attributes to a numpy #[code ndarray]. + | If #[code attr_ids] is a sequence of #[code M] attributes, + | the output array will be of shape #[code (N, M)], where #[code N] + | is the length of the #[code Doc] (in tokens). If #[code attr_ids] is + | a single attribute, the output shape will be #[code (N,)]. You can + | specify attributes by integer ID (e.g. #[code spacy.attrs.LEMMA]) + | or string name (e.g. 'LEMMA' or 'lemma'). The values will be 32-bit + | integers. +aside-code("Example"). from spacy import attrs @@ -186,19 +191,26 @@ p # All strings mapped to integers, for easy export to numpy np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA]) + np_array = doc.to_array("POS") +table(["Name", "Type", "Description"]) +row +cell #[code attr_ids] - +cell ints - +cell A list of attribute ID ints. + +cell int or string + +cell + | A list of attributes (int IDs or string names) or + | a single attribute (int ID or string name) +footrow +cell return - +cell #[code numpy.ndarray[ndim=2, dtype='int32']] + +cell + | #[code numpy.ndarray[ndim=2, dtype='int32']] or + | #[code numpy.ndarray[ndim=1, dtype='int32']] +cell | The exported attributes as a 2D numpy array, with one row per - | token and one column per attribute. + | token and one column per attribute (when #[code attr_ids] is a + | list), or as a 1D numpy array, with one item per attribute (when + | #[code attr_ids] is a single value). +h(2, "count_by") Doc.count_by +tag method From c0799430a7d126dcc0105898fca29d3e5ceff50a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 11:17:00 +0200 Subject: [PATCH 177/195] Make small changes to Doc.to_array * Change type-check logic to 'hasattr' (Python type-checking is brittle) * Small 'house style' edits, mostly making code more terse. 
--- spacy/tokens/doc.pyx | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4f3b06946..66936c4a5 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -477,11 +477,11 @@ cdef class Doc: cpdef np.ndarray to_array(self, object py_attr_ids): """Export given token attributes to a numpy `ndarray`. - If `attr_ids` is a sequence of M attributes, the output array will - be of shape `(N, M)`, where N is the length of the `Doc` - (in tokens). If `attr_ids` is a single attribute, the output shape will - be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) - or string name (e.g. 'LEMMA' or 'lemma'). + If `attr_ids` is a sequence of M attributes, the output array will + be of shape `(N, M)`, where N is the length of the `Doc` + (in tokens). If `attr_ids` is a single attribute, the output shape will + be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) + or string name (e.g. 'LEMMA' or 'lemma'). Example: from spacy import attrs @@ -499,28 +499,25 @@ cdef class Doc: """ cdef int i, j cdef attr_id_t feature + cdef np.ndarray[attr_t, ndim=1] attr_ids, output_1D cdef np.ndarray[attr_t, ndim=2] output - cdef np.ndarray[attr_t, ndim=1] output_1D # Handle scalar/list inputs of strings/ints for py_attr_ids - if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ): - py_attr_ids = [ py_attr_ids ] - py_attr_ids_input = [] - for py_attr_id in py_attr_ids: - if( type(py_attr_id) is int ): - py_attr_ids_input.append(py_attr_id) - else: - py_attr_ids_input.append(IDS[py_attr_id.upper()]) - # Make an array from the attributes --- otherwise our inner loop is Python + if not hasattr(py_attr_ids, '__iter__'): + py_attr_ids = [py_attr_ids] + + # Allow strings, e.g. 'lemma' or 'LEMMA' + convert_id = lambda id_: IDS[id_.upper()] if hasattr(id_, 'upper') else id_ + # Make an array from the attributes --- otherwise inner loop would be Python # dict iteration. - cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids_input, dtype=numpy.int32) + attr_ids = numpy.asarray((convert_id(id_) for id_ in py_attr_ids), + dtype=numpy.int32) + output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) for i in range(self.length): for j, feature in enumerate(attr_ids): output[i, j] = get_token_attr(&self.c[i], feature) - if( len(attr_ids) == 1 ): - output_1D = output.reshape((self.length)) - return output_1D - return output + # Handle 1d case + return output if len(attr_ids) >= 2 else output.reshape((self.length,)) def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): """ From 658536b5ce0c90a4baa7d1c25e7d3fad363f4d4d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 11:35:10 +0200 Subject: [PATCH 178/195] Fix to_array compile error --- spacy/tokens/doc.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 66936c4a5..ce2a82cd0 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -499,10 +499,10 @@ cdef class Doc: """ cdef int i, j cdef attr_id_t feature - cdef np.ndarray[attr_t, ndim=1] attr_ids, output_1D + cdef np.ndarray[attr_t, ndim=1] attr_ids cdef np.ndarray[attr_t, ndim=2] output # Handle scalar/list inputs of strings/ints for py_attr_ids - if not hasattr(py_attr_ids, '__iter__'): + if not hasattr(py_attr_ids, '__iter__'): py_attr_ids = [py_attr_ids] # Allow strings, e.g. 
'lemma' or 'LEMMA' From 7a46792376773f0f7ed55f9a1e71d0512c5eed2b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 11:53:47 +0200 Subject: [PATCH 179/195] Fix compile error Closures not allowed in cpdef --- spacy/tokens/doc.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ce2a82cd0..3b2ef80fa 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -506,12 +506,11 @@ cdef class Doc: py_attr_ids = [py_attr_ids] # Allow strings, e.g. 'lemma' or 'LEMMA' - convert_id = lambda id_: IDS[id_.upper()] if hasattr(id_, 'upper') else id_ + py_attr_ids = [(IDS[id_.toupper()] if hasattr(id_, 'upper') else id_) + for id_ in py_attr_ids] # Make an array from the attributes --- otherwise inner loop would be Python # dict iteration. - attr_ids = numpy.asarray((convert_id(id_) for id_ in py_attr_ids), - dtype=numpy.int32) - + attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32) output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) for i in range(self.length): for j, feature in enumerate(attr_ids): From dbc276e3b2ca64a6f72d612629d773a9f44e13da Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 13:02:13 +0200 Subject: [PATCH 180/195] Fix 'toupper()' -> 'upper()' --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3b2ef80fa..1bc8745c4 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -506,7 +506,7 @@ cdef class Doc: py_attr_ids = [py_attr_ids] # Allow strings, e.g. 'lemma' or 'LEMMA' - py_attr_ids = [(IDS[id_.toupper()] if hasattr(id_, 'upper') else id_) + py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_) for id_ in py_attr_ids] # Make an array from the attributes --- otherwise inner loop would be Python # dict iteration. 
From 80edc905f7490fbaabce47743879d6478e0dfcf1 Mon Sep 17 00:00:00 2001 From: mayukh18 Date: Sun, 22 Oct 2017 13:16:39 +0530 Subject: [PATCH 181/195] added a few bengali pronouns --- spacy/bn/morph_rules.py | 15 ++++++++++++++- spacy/bn/stop_words.py | 4 ++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/spacy/bn/morph_rules.py b/spacy/bn/morph_rules.py index efa5a6185..b63379325 100644 --- a/spacy/bn/morph_rules.py +++ b/spacy/bn/morph_rules.py @@ -11,11 +11,11 @@ MORPH_RULES = { 'āĻ•āĻŋ': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'}, 'āĻ¸ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'}, 'āĻ•āĻŋāĻ¸ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'}, - 'āĻ•āĻžāĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, 'āĻ¤āĻžāĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'}, 'āĻ¸ā§āĻŦā§ŸāĻ‚': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, 'āĻ•ā§‹āĻ¨āĻ—ā§āĻ˛ā§‹': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'}, 'āĻ¤ā§āĻŽāĻŋ': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, + 'āĻ¤ā§āĻ‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, 'āĻ¤āĻžāĻĻā§‡āĻ°āĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'}, 'āĻ†āĻŽāĻ°āĻž': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One ', 'PronType': 'Prs', 'Case': 'Nom'}, 'āĻ¯āĻŋāĻ¨āĻŋ': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'}, @@ -23,12 +23,15 @@ MORPH_RULES = { 'āĻ•ā§‹āĻ¨': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'}, 'āĻ•āĻžāĻ°āĻž': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, 'āĻ¤ā§‹āĻŽāĻžāĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, + 'āĻ¤ā§‹āĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, 'āĻ–ā§‹āĻĻ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, 'āĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'}, 'āĻ¯āĻžāĻ°āĻž': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Rel', 'Case': 'Nom'}, 'āĻ¯ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'}, 'āĻ¤ā§‹āĻŽāĻ°āĻž': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, + 'āĻ¤ā§‹āĻ°āĻž': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, 'āĻ¤ā§‹āĻŽāĻžāĻĻā§‡āĻ°āĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, + 'āĻ¤ā§‹āĻĻā§‡āĻ°āĻ•ā§‡': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, 'āĻ†āĻĒāĻ¨': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, 'āĻ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'}, 'āĻ¨āĻŋāĻœ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, @@ -41,6 +44,10 @@ MORPH_RULES = { 'āĻ†āĻŽāĻžāĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, + 'āĻŽā§‹āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, + 'āĻŽā§‹āĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, 'āĻ¤āĻžāĻ°': {LEMMA: 
PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, 'āĻ¤ā§‹āĻŽāĻžāĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', @@ -49,7 +56,13 @@ MORPH_RULES = { 'Case': 'Nom'}, 'āĻ¤ā§‹āĻŽāĻžāĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, + 'āĻ¤ā§‹āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, 'āĻ¤āĻžāĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, + 'āĻ•āĻžāĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, + 'āĻ¤ā§‹āĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, + 'āĻ¯āĻžāĻĻā§‡āĻ°': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, }, } diff --git a/spacy/bn/stop_words.py b/spacy/bn/stop_words.py index 5b513da7b..ca0ae934a 100644 --- a/spacy/bn/stop_words.py +++ b/spacy/bn/stop_words.py @@ -22,7 +22,7 @@ STOP_WORDS = set(""" āĻŸāĻŋ āĻ āĻŋāĻ• āĻ¤āĻ–āĻ¨ āĻ¤āĻ¤ āĻ¤āĻĨāĻž āĻ¤āĻŦā§ āĻ¤āĻŦā§‡ āĻ¤āĻž āĻ¤āĻžāĻāĻ•ā§‡ āĻ¤āĻžāĻāĻĻā§‡āĻ° āĻ¤āĻžāĻāĻ° āĻ¤āĻžāĻāĻ°āĻž āĻ¤āĻžāĻāĻšāĻžāĻ°āĻž āĻ¤āĻžāĻ‡ āĻ¤āĻžāĻ“ āĻ¤āĻžāĻ•ā§‡ āĻ¤āĻžāĻ¤ā§‡ āĻ¤āĻžāĻĻā§‡āĻ° āĻ¤āĻžāĻ° āĻ¤āĻžāĻ°āĻĒāĻ° āĻ¤āĻžāĻ°āĻž āĻ¤āĻžāĻ°āĻ‡ āĻ¤āĻžāĻšāĻ˛ā§‡ āĻ¤āĻžāĻšāĻž āĻ¤āĻžāĻšāĻžāĻ¤ā§‡ āĻ¤āĻžāĻšāĻžāĻ° āĻ¤āĻŋāĻ¨āĻ‡ -āĻ¤āĻŋāĻ¨āĻŋ āĻ¤āĻŋāĻ¨āĻŋāĻ“ āĻ¤ā§āĻŽāĻŋ āĻ¤ā§āĻ˛ā§‡ āĻ¤ā§‡āĻŽāĻ¨ āĻ¤ā§‹ āĻ¤ā§‹āĻŽāĻžāĻ° +āĻ¤āĻŋāĻ¨āĻŋ āĻ¤āĻŋāĻ¨āĻŋāĻ“ āĻ¤ā§āĻŽāĻŋ āĻ¤ā§āĻ˛ā§‡ āĻ¤ā§‡āĻŽāĻ¨ āĻ¤ā§‹ āĻ¤ā§‹āĻŽāĻžāĻ° āĻ¤ā§āĻ‡ āĻ¤ā§‹āĻ°āĻž āĻ¤ā§‹āĻ° āĻ¤ā§‹āĻŽāĻžāĻĻā§‡āĻ° āĻ¤ā§‹āĻĻā§‡āĻ° āĻĨāĻžāĻ•āĻŦā§‡ āĻĨāĻžāĻ•āĻŦā§‡āĻ¨ āĻĨāĻžāĻ•āĻž āĻĨāĻžāĻ•āĻžāĻ¯āĻŧ āĻĨāĻžāĻ•ā§‡ āĻĨāĻžāĻ•ā§‡āĻ¨ āĻĨā§‡āĻ•ā§‡ āĻĨā§‡āĻ•ā§‡āĻ‡ āĻĨā§‡āĻ•ā§‡āĻ“ āĻĨāĻžāĻ•āĻžā§Ÿ āĻĻāĻŋāĻ•ā§‡ āĻĻāĻŋāĻ¤ā§‡ āĻĻāĻŋāĻ¯āĻŧā§‡ āĻĻāĻŋāĻ¯āĻŧā§‡āĻ›ā§‡ āĻĻāĻŋāĻ¯āĻŧā§‡āĻ›ā§‡āĻ¨ āĻĻāĻŋāĻ˛ā§‡āĻ¨ āĻĻāĻŋā§Ÿā§‡ āĻĻā§ āĻĻā§āĻŸāĻŋ āĻĻā§āĻŸā§‹ āĻĻā§‡āĻ“āĻ¯āĻŧāĻž āĻĻā§‡āĻ“āĻ¯āĻŧāĻžāĻ° āĻĻā§‡āĻ–āĻ¤ā§‡ āĻĻā§‡āĻ–āĻž āĻĻā§‡āĻ–ā§‡ āĻĻā§‡āĻ¨ āĻĻā§‡āĻ¯āĻŧ āĻĻā§‡āĻļā§‡āĻ° āĻĻā§āĻŦāĻžāĻ°āĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡ āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻĻā§‡ā§Ÿ āĻĻā§‡āĻ“ā§ŸāĻž āĻĻā§‡āĻ“ā§ŸāĻžāĻ° āĻĻāĻŋāĻ¨ āĻĻā§āĻ‡ @@ -32,7 +32,7 @@ STOP_WORDS = set(""" āĻĢāĻ˛ā§‡ āĻĢāĻŋāĻ°ā§‡ āĻĢā§‡āĻ° āĻŦāĻ›āĻ° āĻŦāĻĻāĻ˛ā§‡ āĻŦāĻ°āĻ‚ āĻŦāĻ˛āĻ¤ā§‡ āĻŦāĻ˛āĻ˛ āĻŦāĻ˛āĻ˛ā§‡āĻ¨ āĻŦāĻ˛āĻž āĻŦāĻ˛ā§‡ āĻŦāĻ˛ā§‡āĻ›ā§‡āĻ¨ āĻŦāĻ˛ā§‡āĻ¨ āĻŦāĻ¸ā§‡ āĻŦāĻšā§ āĻŦāĻž āĻŦāĻžāĻĻā§‡ āĻŦāĻžāĻ° āĻŦāĻŋāĻ¨āĻž āĻŦāĻŋāĻ­āĻŋāĻ¨ā§āĻ¨ āĻŦāĻŋāĻļā§‡āĻˇ āĻŦāĻŋāĻˇāĻ¯āĻŧāĻŸāĻŋ āĻŦā§‡āĻļ āĻŦā§āĻ¯āĻŦāĻšāĻžāĻ° āĻŦā§āĻ¯āĻžāĻĒāĻžāĻ°ā§‡ āĻŦāĻ•ā§āĻ¤āĻŦā§āĻ¯ āĻŦāĻ¨ āĻŦā§‡āĻļāĻŋ āĻ­āĻžāĻŦā§‡ āĻ­āĻžāĻŦā§‡āĻ‡ -āĻŽāĻ¤ āĻŽāĻ¤ā§‹ āĻŽāĻ¤ā§‹āĻ‡ āĻŽāĻ§ā§āĻ¯āĻ­āĻžāĻ—ā§‡ āĻŽāĻ§ā§āĻ¯ā§‡ āĻŽāĻ§ā§āĻ¯ā§‡āĻ‡ āĻŽāĻ§ā§āĻ¯ā§‡āĻ“ āĻŽāĻ¨ā§‡ āĻŽāĻžāĻ¤ā§āĻ° āĻŽāĻžāĻ§ā§āĻ¯āĻŽā§‡ āĻŽāĻžāĻ¨ā§āĻˇ āĻŽāĻžāĻ¨ā§āĻˇā§‡āĻ° āĻŽā§‹āĻŸ āĻŽā§‹āĻŸā§‡āĻ‡ +āĻŽāĻ¤ āĻŽāĻ¤ā§‹ āĻŽāĻ¤ā§‹āĻ‡ āĻŽāĻ§ā§āĻ¯āĻ­āĻžāĻ—ā§‡ āĻŽāĻ§ā§āĻ¯ā§‡ āĻŽāĻ§ā§āĻ¯ā§‡āĻ‡ āĻŽāĻ§ā§āĻ¯ā§‡āĻ“ āĻŽāĻ¨ā§‡ āĻŽāĻžāĻ¤ā§āĻ° āĻŽāĻžāĻ§ā§āĻ¯āĻŽā§‡ āĻŽāĻžāĻ¨ā§āĻˇ āĻŽāĻžāĻ¨ā§āĻˇā§‡āĻ° āĻŽā§‹āĻŸ āĻŽā§‹āĻŸā§‡āĻ‡ āĻŽā§‹āĻĻā§‡āĻ° āĻŽā§‹āĻ° āĻ¯āĻ–āĻ¨ āĻ¯āĻ¤ āĻ¯āĻ¤āĻŸāĻž āĻ¯āĻĨā§‡āĻˇā§āĻŸ āĻ¯āĻĻāĻŋ āĻ¯āĻĻāĻŋāĻ“ āĻ¯āĻž āĻ¯āĻžāĻāĻ° āĻ¯āĻžāĻāĻ°āĻž āĻ¯āĻžāĻ“āĻ¯āĻŧāĻž āĻ¯āĻžāĻ“āĻ¯āĻŧāĻžāĻ° āĻ¯āĻžāĻ•ā§‡ āĻ¯āĻžāĻšā§āĻ›ā§‡ āĻ¯āĻžāĻ¤ā§‡ āĻ¯āĻžāĻĻā§‡āĻ° āĻ¯āĻžāĻ¨ āĻ¯āĻžāĻŦā§‡ āĻ¯āĻžāĻ¯āĻŧ āĻ¯āĻžāĻ° āĻ¯āĻžāĻ°āĻž āĻ¯āĻžā§Ÿ āĻ¯āĻŋāĻ¨āĻŋ āĻ¯ā§‡ āĻ¯ā§‡āĻ–āĻžāĻ¨ā§‡ āĻ¯ā§‡āĻ¤ā§‡ 
āĻ¯ā§‡āĻ¨ āĻ¯ā§‡āĻŽāĻ¨ āĻ°āĻ•āĻŽ āĻ°āĻ¯āĻŧā§‡āĻ›ā§‡ āĻ°āĻžāĻ–āĻž āĻ°ā§‡āĻ–ā§‡ āĻ°ā§Ÿā§‡āĻ›ā§‡ From 5c7c08c2e3acf49673d6e3d914b4259328792b12 Mon Sep 17 00:00:00 2001 From: Jeroen Bobbeldijk Date: Sun, 22 Oct 2017 15:35:46 +0200 Subject: [PATCH 182/195] Add myself to contributors --- .github/contributors/jerbob92.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/jerbob92.md diff --git a/.github/contributors/jerbob92.md b/.github/contributors/jerbob92.md new file mode 100644 index 000000000..bb0430d14 --- /dev/null +++ b/.github/contributors/jerbob92.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jeroen Bobbeldijk | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 22-10-2017 | +| GitHub username | jerbob92 | +| Website (optional) | | From 80a9652617a2cf7b76bd9bfb57f6d60ec393a8d8 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 24 Oct 2017 15:48:22 +1100 Subject: [PATCH 183/195] DOC "OP" key in token spec --- website/docs/usage/rule-based-matching.jade | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index db7c70608..cf27a6a94 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -59,6 +59,10 @@ p +h(2, "quantifiers") Using quantifiers +p + | Token specifiers may have quantifiers attached to them, by setting the "OP" key + | to one of the following values: + +table([ "Name", "Description", "Example"]) +row +cell #[code !] @@ -80,10 +84,18 @@ p +cell match 0 or 1 times +cell optional, max one +p + | Thus the following matcher will match "Hello, World" or "Hello world" or "Hello - - world": + ++code. 
+ matcher = Matcher(nlp.vocab) + matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True, 'OP': '*'}, {LOWER: "world"}]) + p | There are no nested or scoped quantifiers. You can build those | behaviours with acceptors and | #[+api("matcher#add_entity") #[code on_match]] callbacks. + | All the operators are greedy: they will match as many tokens as possible. +h(2, "acceptor-functions") Acceptor functions From 1b64a44d85e542c62ddeaf1f89622dcf3fa72229 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 24 Oct 2017 11:48:20 +0200 Subject: [PATCH 184/195] Add dependency patterns example --- examples/dependency_patterns.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 examples/dependency_patterns.py diff --git a/examples/dependency_patterns.py b/examples/dependency_patterns.py new file mode 100644 index 000000000..776e045b7 --- /dev/null +++ b/examples/dependency_patterns.py @@ -0,0 +1,33 @@ +''' +Match a dependency pattern. See https://github.com/explosion/spaCy/pull/1120 + +We start by creating a DependencyTree for the Doc. This class models the document +dependency tree. Then we compile the query into a Pattern using the PatternParser. +The syntax is quite simple: + +we define a node named 'fox', that must match in the dep tree a token +whose orth_ is 'fox'. an anonymous token whose lemma is 'quick' must have fox +as parent, with a dep_ matching the regex am.* another anonymous token whose +orth_ matches the regex brown|yellow has fox as parent, with whathever dep_ +DependencyTree.match returns a list of PatternMatch. Notice that we can assign +names to anonymous or defined nodes ([word:fox]=f). We can get the Token mapped +to the fox node using match['f']. +''' +import spacy +from spacy.pattern import PatternParser, DependencyTree + +nlp = spacy.load('en') +doc = nlp("The quick brown fox jumped over the lazy dog.") +tree = DependencyTree(doc) + +query = """fox [word:fox]=f + [lemma:quick]=q >/am.*/ fox + [word:/brown|yellow/] > fox""" + +pattern = PatternParser.parse(query) +matches = tree.match(pattern) + +assert len(matches) == 1 +match = matches[0] + +assert match['f'] == doc[3] From a2e7e9be9883e34774cf425d76bb2974eb3ed215 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:12:47 +0200 Subject: [PATCH 185/195] Update landing --- website/index.jade | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/website/index.jade b/website/index.jade index df5428316..5ce00e2c1 100644 --- a/website/index.jade +++ b/website/index.jade @@ -33,7 +33,7 @@ include _includes/_mixins | spaCy is designed to help you do real work — to build real | products, or gather real insights. The library respects | your time, and tries to avoid wasting it. It's easy to - | install, and its API is simple and productive. I like to + | install, and its API is simple and productive. We like to | think of spaCy as the Ruby on Rails of Natural Language | Processing. 
@@ -102,7 +102,9 @@ include _includes/_mixins +item GIL-free #[strong multi-threading] +item Efficient binary serialization +item Easy #[strong deep learning] integration - +item Statistical models for #[strong English] and #[strong German] + +item + | Statistical models for #[strong English], + | #[strong German], #[strong French] and #[strong Spanish] +item State-of-the-art speed +item Robust, rigorously evaluated accuracy From fdd8dacb752b718a918908156e5ce316d3f36559 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:13:52 +0200 Subject: [PATCH 186/195] Fix compilation of color utility class names --- website/_harp.json | 2 +- website/assets/css/_base/_utilities.sass | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/website/_harp.json b/website/_harp.json index 37a0b54dd..5a44f1a81 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -81,7 +81,7 @@ } ], - "V_CSS": "1.7", + "V_CSS": "1.8", "V_JS": "1.2", "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass index 2c40858a8..49e98064b 100644 --- a/website/assets/css/_base/_utilities.sass +++ b/website/assets/css/_base/_utilities.sass @@ -125,7 +125,7 @@ .u-border-dotted border-top: 1px dotted $color-subtle -@each $name, $color in (theme: $color-theme, subtle: $color-subtle-dark, light: $color-back, red: $color-red, green: $color-green, yellow: $color-yellow) +@each $name, $color in (theme: $color-theme, subtle: $color-subtle-dark, light: $color-back, 'red': $color-red, 'green': $color-green, 'yellow': $color-yellow) .u-color-#{$name} color: $color From 91dbee1b8fc86e1c9678ede0404711142d8ac628 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:17:03 +0200 Subject: [PATCH 187/195] Add BILUO docs to NER annotation scheme --- website/docs/api/annotation.jade | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index d4b01a819..155c4d13b 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -86,6 +86,31 @@ include _annotation/_dep-labels include _annotation/_named-entities + | showed that the minimal #[strong Begin], #[strong In], #[strong Out] + | scheme was more difficult to learn than the #[strong BILUO] scheme that + | we use, which explicitly marks boundary tokens. + ++table(["Tag", "Description"]) + +row + +cell #[code #[span.u-color-theme B] EGIN] + +cell The first token of a multi-token entity. + + +row + +cell #[code #[span.u-color-theme I] N] + +cell An inner token of a multi-token entity. + + +row + +cell #[code #[span.u-color-theme L] AST] + +cell The final token of a multi-token entity. + + +row + +cell #[code #[span.u-color-theme U] NIT] + +cell A single-token entity. + + +row + +cell #[code #[span.u-color-theme O] UT] + +cell A non-entity token. 
+ +h(2, "json-input") JSON input format for training p From 0e081d0167087f9f7f1d768956ccd2feb4a7ce11 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:17:54 +0200 Subject: [PATCH 188/195] Update JSON training format docs (resolves #1291) --- website/docs/api/annotation.jade | 73 ++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 155c4d13b..b5a17de89 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -86,6 +86,25 @@ include _annotation/_dep-labels include _annotation/_named-entities ++h(3, "biluo") BILUO Scheme + +p + | spaCy translates the character offsets into this scheme, in order to + | decide the cost of each action given the current state of the entity + | recogniser. The costs are then used to calculate the gradient of the + | loss, to train the model. The exact algorithm is a pastiche of + | well-known methods, and is not currently described in any single + | publication. The model is a greedy transition-based parser guided by a + | linear model whose weights are learned using the averaged perceptron + | loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle] + | imitation learning strategy. The transition system is equivalent to the + | BILOU tagging scheme. + ++aside("Why BILUO, not IOB?") + | There are several coding schemes for encoding entity annotations as + | token tags. These coding schemes are equally expressive, but not + | necessarily equally learnable. + | #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth] | showed that the minimal #[strong Begin], #[strong In], #[strong Out] | scheme was more difficult to learn than the #[strong BILUO] scheme that | we use, which explicitly marks boundary tokens. @@ -114,29 +133,39 @@ include _annotation/_named-entities +h(2, "json-input") JSON input format for training p - | spaCy takes training data in the following format: + | spaCy takes training data in JSON format. The built-in + | #[+a("/docs/usage/cli#convert") #[code convert] command] helps you + | convert the #[code .conllu] format used by the + | #[+a("https://github.com/UniversalDependencies") Universal Dependencies corpora] + | to spaCy's training format. + ++aside("Annotating entities") + | Named entities are provided in the #[+a("#biluo") BILUO] + | notation. Tokens outside an entity are set to #[code "O"] and tokens + | that are part of an entity are set to the entity label, prefixed by the + | BILUO marker. For example #[code "B-ORG"] describes the first token of + | a multi-token #[code ORG] entity and #[code "U-PERSON"] a single + | token representing a #[code PERSON] entity +code("Example structure"). 
- doc: { - id: string, - paragraphs: [{ - raw: string, - sents: [int], - tokens: [{ - start: int, - tag: string, - head: int, - dep: string - }], - ner: [{ - start: int, - end: int, - label: string - }], - brackets: [{ - start: int, - end: int, - label: string + [{ + "id": int, # ID of the document within the corpus + "paragraphs": [{ # list of paragraphs in the corpus + "raw": string, # raw text of the paragraph + "sentences": [{ # list of sentences in the paragraph + "tokens": [{ # list of tokens in the sentence + "id": int, # index of the token in the document + "dep": string, # dependency label + "head": int, # offset of token head relative to token index + "tag": string, # part-of-speech tag + "orth": string, # verbatim text of the token + "ner": string # BILUO label, e.g. "O" or "B-ORG" + }], + "brackets": [{ # phrase structure (NOT USED by current models) + "first": int, # index of first token + "last": int, # index of last token + "label": string # phrase label + }] }] }] - } + }] From 90601cf1b38bcaab121d8c413979c1813e6f4314 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:22:37 +0200 Subject: [PATCH 189/195] Fix formatting --- website/docs/usage/rule-based-matching.jade | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index cf27a6a94..4f8172797 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -23,7 +23,7 @@ p +code. from spacy.matcher import Matcher from spacy.attrs import IS_PUNCT, LOWER - + matcher = Matcher(nlp.vocab) matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}]) @@ -60,8 +60,8 @@ p +h(2, "quantifiers") Using quantifiers p - | Token specifiers may have quantifiers attached to them, by setting the "OP" key - | to one of the following values: + | Token specifiers may have quantifiers attached to them, by setting the + | #[code "OP"] key to one of the following values: +table([ "Name", "Description", "Example"]) +row @@ -85,8 +85,9 @@ p +cell optional, max one p - | Thus the following matcher will match "Hello, World" or "Hello world" or "Hello - - world": - + | Thus the following matcher will match "Hello, World" or "Hello world" or + | "Hello - - world": + +code. matcher = Matcher(nlp.vocab) matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True, 'OP': '*'}, {LOWER: "world"}]) From ebd2e5ff54eb28e276f5ec135e3a81e30cae4cf4 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:22:46 +0200 Subject: [PATCH 190/195] Fix matcher docs (resolves #1453) --- website/docs/usage/rule-based-matching.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index 4f8172797..1be219f0e 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -18,7 +18,7 @@ p Here's a minimal example. We first add a pattern that specifies three tokens: p | Once we've added the pattern, we can use the #[code matcher] as a - | callable, to receive a list of #[code (ent_id, start, end)] tuples. + | callable, to receive a list of #[code (ent_id, label, start, end)] tuples. +code. 
from spacy.matcher import Matcher From b51dcee3ce047410697d4fd0fc2406bcf5021cdd Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:25:49 +0200 Subject: [PATCH 191/195] Fix unicode in lightning tour example (resolves #1356) --- website/docs/usage/lightning-tour.jade | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 2fd390d26..6c98cf3f3 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -58,13 +58,13 @@ p assert token.shape_ == 'Xxxxx' for lexeme in nlp.vocab: if lexeme.is_alpha: - lexeme.shape_ = 'W' + lexeme.shape_ = u'W' elif lexeme.is_digit: - lexeme.shape_ = 'D' + lexeme.shape_ = u'D' elif lexeme.is_punct: - lexeme.shape_ = 'P' + lexeme.shape_ = u'P' else: - lexeme.shape_ = 'M' + lexeme.shape_ = u'M' assert token.shape_ == 'W' +h(2, "examples-numpy-arrays") Export to numpy arrays From dee289613399ea975d57d87c61269506d9db3687 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 24 Oct 2017 21:52:12 +0200 Subject: [PATCH 192/195] Update PULL_REQUEST_TEMPLATE.md --- .github/PULL_REQUEST_TEMPLATE.md | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e97a7ea16..feda380e6 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,20 +1,19 @@ - + ## Description - - + +### Types of changes + -## Types of changes - -- [ ] **Bug fix** (non-breaking change fixing an issue) -- [ ] **New feature** (non-breaking change adding functionality to spaCy) -- [ ] **Breaking change** (fix or feature causing change to spaCy's existing functionality) -- [ ] **Documentation** (addition to documentation of spaCy) - -## Checklist: - -- [ ] My change requires a change to spaCy's documentation. -- [ ] I have updated the documentation accordingly. -- [ ] I have added tests to cover my changes. -- [ ] All new and existing tests passed. +## Checklist + +- [ ] I have submitted the spaCy Contributor Agreement. +- [ ] I ran the tests, and all new and existing tests passed. +- [ ] My changes don't require a change to the documentation, or if they do, I've added all required details. From 3dc3f10a40e494e68b418fbf5d925cc4654cf547 Mon Sep 17 00:00:00 2001 From: Jeffrey Gerard Date: Thu, 2 Nov 2017 09:28:26 -0700 Subject: [PATCH 193/195] Contributing agreement - IamJeffG --- .github/contributors/IamJeffG.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/IamJeffG.md diff --git a/.github/contributors/IamJeffG.md b/.github/contributors/IamJeffG.md new file mode 100644 index 000000000..030e711a2 --- /dev/null +++ b/.github/contributors/IamJeffG.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jeffrey Gerard | +| Company name (if applicable) | cephalo-ai / wellio | +| Title or role (if applicable) | Senior Data Scientist| +| Date | 11/02/2017 | +| GitHub username | IamJeffG | +| Website (optional) | | From fcc3b84be576007ecc653445463b11024583f319 Mon Sep 17 00:00:00 2001 From: uwol Date: Sun, 5 Nov 2017 12:47:44 +0100 Subject: [PATCH 194/195] added contributor agreement --- .github/uwol.md | 106 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/uwol.md diff --git a/.github/uwol.md b/.github/uwol.md new file mode 100644 index 000000000..ddc82d220 --- /dev/null +++ b/.github/uwol.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ulrich Wolffgang | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-11-05 | +| GitHub username | uwol | +| Website (optional) | https://uwol.github.io/ | From 6c477d864b0f46b6101fc9d61abecd50eb922916 Mon Sep 17 00:00:00 2001 From: uwol Date: Sun, 5 Nov 2017 12:49:35 +0100 Subject: [PATCH 195/195] added contributor agreement --- .github/{ => contributors}/uwol.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/{ => contributors}/uwol.md (100%) diff --git a/.github/uwol.md b/.github/contributors/uwol.md similarity index 100% rename from .github/uwol.md rename to .github/contributors/uwol.md
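
As a quick usage sketch of the `Doc.to_array` behaviour introduced in patches 175–180 of this series: the call now accepts string attribute names (upper- or lower-case) as well as integer IDs, and a single attribute returns a 1D array instead of an `(N, 1)` matrix. The snippet below mirrors the tests added in `spacy/tests/doc/test_array.py`; it assumes a spaCy build that already includes these patches and an installed English model loadable as `'en'` (the model name is an assumption — substitute whichever model you have).

```python
# Minimal sketch of Doc.to_array with integer IDs, string names and a scalar attribute.
import spacy
from spacy.attrs import ORTH, SHAPE

nlp = spacy.load('en')                      # assumption: an English model is installed
doc = nlp(u'An example sentence')           # 3 tokens

# Integer attribute IDs, as before: one row per token, one column per attribute.
by_int = doc.to_array([ORTH, SHAPE])        # shape (3, 2), dtype int32

# String names are accepted too; case does not matter.
by_str = doc.to_array(('ORTH', 'shape'))    # same values as by_int

# A single attribute returns a 1D array of length len(doc).
by_scalar = doc.to_array('ORTH')            # shape (3,)

assert by_int.shape == (3, 2)
assert by_scalar.shape == (3,)
assert (by_int[:, 0] == by_str[:, 0]).all()
```

Accepting string names keeps call sites readable without forcing an import of `spacy.attrs`, while the scalar form avoids an awkward `(N, 1)` reshape when only one attribute is needed.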