From bdafb514c51d3c6aee0ad3ab5ac757ee5da8418e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Jan 2017 13:47:32 +0100 Subject: [PATCH 01/12] Update version --- website/_harp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index f69191c88..a273cac0a 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -12,7 +12,7 @@ "COMPANY_URL": "https://explosion.ai", "DEMOS_URL": "https://demos.explosion.ai", - "SPACY_VERSION": "1.5", + "SPACY_VERSION": "1.6", "LATEST_NEWS": { "url": "https://explosion.ai/blog/spacy-user-survey", "title": "The results of the spaCy user survey" From baa6be8180eb1897ae1ddbbe0d93ad1614e646b7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Jan 2017 13:47:45 +0100 Subject: [PATCH 02/12] Update latest news to last blog post --- website/_harp.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/_harp.json b/website/_harp.json index a273cac0a..e315d658c 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -14,8 +14,8 @@ "SPACY_VERSION": "1.6", "LATEST_NEWS": { - "url": "https://explosion.ai/blog/spacy-user-survey", - "title": "The results of the spaCy user survey" + "url": "https://explosion.ai/blog/deep-learning-formula-nlp", + "title": "The new deep learning formula for state-of-the-art NLP models" }, "SOCIAL": { From da3aca4020826d54befa6fc20c296631089c6368 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Jan 2017 13:48:29 +0100 Subject: [PATCH 03/12] Fix formatting --- website/docs/usage/_data.json | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 932abc99e..8bf5bfc98 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -232,7 +232,7 @@ "NLP with spaCy in 10 lines of code": { "url": "https://github.com/cytora/pycon-nlp-in-10-lines", "author": "Andraz Hribernik et al. (Cytora)", - "tags": [ "jupyter" ] + "tags": ["jupyter"] }, "Intro to NLP with spaCy": { "url": "https://nicschrading.com/project/Intro-to-NLP-with-spaCy/", @@ -241,7 +241,7 @@ "NLP with spaCy and IPython Notebook": { "url": "http://blog.sharepointexperience.com/2016/01/nlp-and-sharepoint-part-1/", "author": "Dustin Miller (SharePoint)", - "tags": [ "jupyter" ] + "tags": ["jupyter"] }, "Getting Started with spaCy": { "url": "http://textminingonline.com/getting-started-with-spacy", @@ -254,7 +254,7 @@ "NLP (almost) From Scratch - POS Network with spaCy": { "url": "http://sujitpal.blogspot.de/2016/07/nlp-almost-from-scratch-implementing.html", "author": "Sujit Pal", - "tags": [ "gensim", "keras" ] + "tags": ["gensim", "keras"] }, "NLP tasks with various libraries": { "url": "http://clarkgrubb.com/nlp", @@ -270,44 +270,43 @@ "Modern NLP in Python – What you can learn about food by analyzing a million Yelp reviews": { "url": "http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb", "author": "Patrick Harrison (S&P Global)", - "tags": [ "jupyter", "gensim" ] + "tags": ["jupyter", "gensim"] }, - "Deep Learning with custom pipelines and Keras": { "url": "https://explosion.ai/blog/spacy-deep-learning-keras", "author": "Matthew Honnibal", - "tags": [ "keras", "sentiment" ] + "tags": ["keras", "sentiment"] }, "A decomposable attention model for Natural Language Inference": { "url": "https://github.com/explosion/spaCy/tree/master/examples/keras_parikh_entailment", "author": "Matthew Honnibal", - "tags": [ "keras", "similarity" ] + "tags": ["keras", "similarity"] }, "Using the German model": { "url": "https://explosion.ai/blog/german-model", "author": "Wolfgang Seeker", - "tags": [ "multi-lingual" ] + "tags": ["multi-lingual"] }, "Sense2vec with spaCy and Gensim": { "url": "https://explosion.ai/blog/sense2vec-with-spacy", "author": "Matthew Honnibal", - "tags": [ "big data", "gensim" ] + "tags": ["big data", "gensim"] }, "Building your bot's brain with Node.js and spaCy": { "url": "https://explosion.ai/blog/chatbot-node-js-spacy", "author": "Wah Loon Keng", - "tags": [ "bots", "node.js" ] + "tags": ["bots", "node.js"] }, "An intent classifier with spaCy": { "url": "http://blog.themusio.com/2016/07/18/musios-intent-classifier-2/", "author": "Musio", - "tags": [ "bots", "keras" ] + "tags": ["bots", "keras"] }, "Visual Question Answering with spaCy": { "url": "http://iamaaditya.github.io/2016/04/visual_question_answering_demo_notebook", "author": "Aaditya Prakash", - "tags": [ "vqa", "keras" ] + "tags": ["vqa", "keras"] } }, @@ -315,22 +314,22 @@ "Information extraction": { "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py", "author": "Matthew Honnibal", - "tags": [ "snippet" ] + "tags": ["snippet"] }, "Neural bag of words": { "url": "https://github.com/explosion/spaCy/blob/master/examples/nn_text_class.py", "author": "Matthew Honnibal", - "tags": [ "sentiment" ] + "tags": ["sentiment"] }, "Part-of-speech tagging": { "url": "https://github.com/explosion/spaCy/blob/master/examples/pos_tag.py", "author": "Matthew Honnibal", - "tags": [ "pos" ] + "tags": ["pos"] }, "Parallel parse": { "url": "https://github.com/explosion/spaCy/blob/master/examples/parallel_parse.py", "author": "Matthew Honnibal", - "tags": [ "big data" ] + "tags": ["big data"] }, "Inventory count": { "url": "https://github.com/explosion/spaCy/tree/master/examples/inventory_count", @@ -339,8 +338,8 @@ "Multi-word matches": { "url": "https://github.com/explosion/spaCy/blob/master/examples/multi_word_matches.py", "author": "Matthew Honnibal", - "tags": [ "matcher", "out of date" ] + "tags": ["matcher", "out of date"] } } } -} +} \ No newline at end of file From 651bf411e0db70c6a25c009f255922db2303a2f0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Jan 2017 13:48:38 +0100 Subject: [PATCH 04/12] Add tutorial --- website/docs/usage/_data.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 8bf5bfc98..9681cb6ea 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -307,6 +307,11 @@ "url": "http://iamaaditya.github.io/2016/04/visual_question_answering_demo_notebook", "author": "Aaditya Prakash", "tags": ["vqa", "keras"] + }, + "Extracting time suggestions from emails with spaCy": { + "url": "https://medium.com/redsift-outbox/what-time-cc9ce0c2aed2", + "author": "Chris Savvopoulos", + "tags": ["ner"] } }, From ab70f6e18d6c84b78815893f20fae29d3d0fd661 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Jan 2017 12:27:10 +0100 Subject: [PATCH 05/12] Update NER training example --- examples/training/train_ner.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index 220244b93..bcc087d07 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -8,6 +8,12 @@ from spacy.pipeline import EntityRecognizer from spacy.gold import GoldParse from spacy.tagger import Tagger + +try: + unicode +except: + unicode = str + def train_ner(nlp, train_data, entity_types): # Add new words to vocab. @@ -24,7 +30,6 @@ def train_ner(nlp, train_data, entity_types): doc = nlp.make_doc(raw_text) gold = GoldParse(doc, entities=entity_offsets) ner.update(doc, gold) - ner.model.end_training() return ner def save_model(ner, model_dir): @@ -33,8 +38,11 @@ def save_model(ner, model_dir): model_dir.mkdir() assert model_dir.is_dir() - with (model_dir / 'config.json').open('w') as file_: - json.dump(ner.cfg, file_) + with (model_dir / 'config.json').open('wb') as file_: + data = json.dumps(ner.cfg) + if isinstance(data, unicode): + data = data.encode('utf8') + file_.write(data) ner.model.dump(str(model_dir / 'model')) if not (model_dir / 'vocab').exists(): (model_dir / 'vocab').mkdir() From 0c2e5539cef7e80fb93f2018b370af78dc1d4ac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sat, 28 Jan 2017 18:38:14 +0100 Subject: [PATCH 06/12] Specify version number for ujson and plac The required version was specified for plac in requirements.txt but not in setup.py, which could cause a conflicting version error. Similarly, set the version of ujson in requirements.txt to be the same as in setup.py --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8e318bc05..538862aed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,6 @@ thinc>=6.2.0,<6.3.0 murmurhash>=0.26,<0.27 plac<0.9.3 six -ujson +ujson>=1.35 cloudpickle sputnik>=0.9.2,<0.10.0 diff --git a/setup.py b/setup.py index ef4e14020..70099da0a 100644 --- a/setup.py +++ b/setup.py @@ -234,7 +234,7 @@ def setup_package(): 'cymem>=1.30,<1.32', 'preshed>=0.46.0,<0.47.0', 'thinc>=6.2.0,<6.3.0', - 'plac', + 'plac<0.9.3', 'six', 'cloudpickle', 'pathlib', From 6c665b81dfede4da0af4432b8d0af885c7cd6d0e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 31 Jan 2017 00:46:21 +1100 Subject: [PATCH 07/12] Fix redundant == TAG in from_array conditional --- spacy/tokens/doc.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 30be63608..8ce2c7fe4 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -585,9 +585,6 @@ cdef class Doc: elif attr_id == POS: for i in range(length): tokens[i].pos = values[i] - elif attr_id == TAG: - for i in range(length): - tokens[i].tag = values[i] elif attr_id == DEP: for i in range(length): tokens[i].dep = values[i] From e4c84321a590c5e3b1004fe542db7d7f3cca031b Mon Sep 17 00:00:00 2001 From: latkins Date: Tue, 31 Jan 2017 13:47:42 +0000 Subject: [PATCH 08/12] Added regression test for Issue #792. --- spacy/tests/regression/test_issue792.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 spacy/tests/regression/test_issue792.py diff --git a/spacy/tests/regression/test_issue792.py b/spacy/tests/regression/test_issue792.py new file mode 100644 index 000000000..d5aef533f --- /dev/null +++ b/spacy/tests/regression/test_issue792.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +def test_issue792(en_tokenizer): + """Test for Issue #792: Trailing whitespace is removed after parsing.""" + text = "This is a string " + doc = en_tokenizer(text) + assert(doc.text_with_ws == text) + + text_unicode = "This is a string\u0020" + doc_unicode = en_tokenizer(text_unicode) + assert(doc_unicode.text_with_ws == text_unicode) From e6465b9ca35c0497b21a684ea4cc152df3b61334 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 31 Jan 2017 15:14:42 +0100 Subject: [PATCH 09/12] Parametrize test cases and mark as xfail --- spacy/tests/regression/test_issue792.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/spacy/tests/regression/test_issue792.py b/spacy/tests/regression/test_issue792.py index d5aef533f..231261523 100644 --- a/spacy/tests/regression/test_issue792.py +++ b/spacy/tests/regression/test_issue792.py @@ -2,12 +2,9 @@ from __future__ import unicode_literals -def test_issue792(en_tokenizer): - """Test for Issue #792: Trailing whitespace is removed after parsing.""" - text = "This is a string " - doc = en_tokenizer(text) - assert(doc.text_with_ws == text) - - text_unicode = "This is a string\u0020" - doc_unicode = en_tokenizer(text_unicode) - assert(doc_unicode.text_with_ws == text_unicode) +@pytest.mark.xfail +@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"]) +def test_issue792(en_tokenizer, text): + """Test for Issue #792: Trailing whitespace is removed after parsing.""" + doc = en_tokenizer(text) + assert(doc.text_with_ws == text) From c304834e459f2536e788e845178de46551e1d7b0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 31 Jan 2017 15:18:30 +0100 Subject: [PATCH 10/12] Add missing import --- spacy/tests/regression/test_issue792.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue792.py b/spacy/tests/regression/test_issue792.py index 231261523..a07059b6d 100644 --- a/spacy/tests/regression/test_issue792.py +++ b/spacy/tests/regression/test_issue792.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import pytest @pytest.mark.xfail @pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"]) From e4875834fe9e2c85c8e31efd0c4c7c868f7259ab Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 31 Jan 2017 15:19:33 +0100 Subject: [PATCH 11/12] Fix formatting --- spacy/tests/regression/test_issue792.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue792.py b/spacy/tests/regression/test_issue792.py index a07059b6d..563e061a6 100644 --- a/spacy/tests/regression/test_issue792.py +++ b/spacy/tests/regression/test_issue792.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals import pytest + @pytest.mark.xfail @pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"]) def test_issue792(en_tokenizer, text): """Test for Issue #792: Trailing whitespace is removed after parsing.""" doc = en_tokenizer(text) - assert(doc.text_with_ws == text) + assert doc.text_with_ws == text From 32a22291bcc0205cf89d2a159664a603529ab5a3 Mon Sep 17 00:00:00 2001 From: Matvey Ezhov Date: Tue, 31 Jan 2017 19:18:45 +0300 Subject: [PATCH 12/12] Small `Doc.count_by` documentation update Current example doesn't work --- spacy/tokens/doc.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 8ce2c7fe4..805a5b30c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -500,7 +500,8 @@ cdef class Doc: by the values of the given attribute ID. Example: - from spacy.en import English, attrs + from spacy.en import English + from spacy import attrs nlp = English() tokens = nlp(u'apple apple orange banana') tokens.count_by(attrs.ORTH)