Merge branch 'master' of https://github.com/explosion/spaCy

2025-07-16 03:02:41 +03:00 · 2017-01-31 13:27:34 -06:00 · 2017-01-31 13:27:34 -06:00 · 16ce7409e4
commit 16ce7409e4
parent 80aa4e114b ad0e4e4532
7 changed files with 52 additions and 30 deletions
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@ -9,6 +9,12 @@ from spacy.gold import GoldParse
 from spacy.tagger import Tagger

 
+try:
+    unicode  # Python 2 builtin; the bare lookup raises NameError on Python 3
+except NameError:  # catch only the missing-name case, not every exception
+    unicode = str
+
+
 def train_ner(nlp, train_data, entity_types):
    # Add new words to vocab.
    for raw_text, _ in train_data:
@ -24,7 +30,6 @@ def train_ner(nlp, train_data, entity_types):
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
-    ner.model.end_training()
    return ner

 def save_model(ner, model_dir):
@ -33,8 +38,11 @@ def save_model(ner, model_dir):
        model_dir.mkdir()
    assert model_dir.is_dir()

-    with (model_dir / 'config.json').open('w') as file_:
-        json.dump(ner.cfg, file_)
+    with (model_dir / 'config.json').open('wb') as file_:
+        data = json.dumps(ner.cfg)
+        if isinstance(data, unicode):
+            data = data.encode('utf8')
+        file_.write(data)
    ner.model.dump(str(model_dir / 'model'))
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
--- a/requirements.txt
+++ b/requirements.txt
@ -7,6 +7,6 @@ thinc>=6.2.0,<6.3.0
 murmurhash>=0.26,<0.27
 plac<0.9.3
 six
-ujson
+ujson>=1.35
 cloudpickle
 sputnik>=0.9.2,<0.10.0
--- a/setup.py
+++ b/setup.py
@ -234,7 +234,7 @@ def setup_package():
                'cymem>=1.30,<1.32',
                'preshed>=0.46.0,<0.47.0',
                'thinc>=6.2.0,<6.3.0',
-                'plac',
+                'plac<0.9.3',
                'six',
                'cloudpickle',
                'pathlib',
--- a/spacy/tests/regression/test_issue792.py
+++ b/spacy/tests/regression/test_issue792.py
@ -0,0 +1,12 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.xfail  # marked as an expected failure (see issue #792)
+@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])  # "\u0020" is an escaped space
+def test_issue792(en_tokenizer, text):
+    """Test for Issue #792: Trailing whitespace is removed after parsing."""
+    doc = en_tokenizer(text)
+    assert doc.text_with_ws == text  # the trailing space must be preserved in the round-trip
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -500,7 +500,8 @@ cdef class Doc:
        by the values of the given attribute ID.

        Example:
-            from spacy.en import English, attrs
+            from spacy.en import English
+            from spacy import attrs
            nlp = English()
            tokens = nlp(u'apple apple orange banana')
            tokens.count_by(attrs.ORTH)
@ -585,9 +586,6 @@ cdef class Doc:
            elif attr_id == POS:
                for i in range(length):
                    tokens[i].pos = <univ_pos_t>values[i]
-            elif attr_id == TAG:
-                for i in range(length):
-                    tokens[i].tag = <univ_pos_t>values[i]
            elif attr_id == DEP:
                for i in range(length):
                    tokens[i].dep = values[i]
--- a/website/_harp.json
+++ b/website/_harp.json
@ -12,10 +12,10 @@
        "COMPANY_URL": "https://explosion.ai",
        "DEMOS_URL": "https://demos.explosion.ai",

-        "SPACY_VERSION": "1.5",
+        "SPACY_VERSION": "1.6",
        "LATEST_NEWS": {
-            "url": "https://explosion.ai/blog/spacy-user-survey",
-            "title": "The results of the spaCy user survey"
+            "url": "https://explosion.ai/blog/deep-learning-formula-nlp",
+            "title": "The new deep learning formula for state-of-the-art NLP models"
        },

        "SOCIAL": {
--- a/website/docs/usage/_data.json
+++ b/website/docs/usage/_data.json
@ -232,7 +232,7 @@
            "NLP with spaCy in 10 lines of code": {
                "url": "https://github.com/cytora/pycon-nlp-in-10-lines",
                "author": "Andraz Hribernik et al. (Cytora)",
-                "tags": [ "jupyter" ]
+                "tags": ["jupyter"]
            },
            "Intro to NLP with spaCy": {
                "url": "https://nicschrading.com/project/Intro-to-NLP-with-spaCy/",
@ -241,7 +241,7 @@
            "NLP with spaCy and IPython Notebook": {
                "url": "http://blog.sharepointexperience.com/2016/01/nlp-and-sharepoint-part-1/",
                "author": "Dustin Miller (SharePoint)",
-                "tags": [ "jupyter" ]
+                "tags": ["jupyter"]
            },
            "Getting Started with spaCy": {
                "url": "http://textminingonline.com/getting-started-with-spacy",
@ -254,7 +254,7 @@
            "NLP (almost) From Scratch - POS Network with spaCy": {
                "url": "http://sujitpal.blogspot.de/2016/07/nlp-almost-from-scratch-implementing.html",
                "author": "Sujit Pal",
-                "tags": [ "gensim", "keras" ]
+                "tags": ["gensim", "keras"]
            },
            "NLP tasks with various libraries": {
                "url": "http://clarkgrubb.com/nlp",
@ -270,44 +270,48 @@
            "Modern NLP in Python – What you can learn about food by analyzing a million Yelp reviews": {
                "url": "http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb",
                "author": "Patrick Harrison (S&P Global)",
-                "tags": [ "jupyter", "gensim" ]
+                "tags": ["jupyter", "gensim"]
            },
-
            "Deep Learning with custom pipelines and Keras": {
                "url": "https://explosion.ai/blog/spacy-deep-learning-keras",
                "author": "Matthew Honnibal",
-                "tags": [ "keras", "sentiment" ]
+                "tags": ["keras", "sentiment"]
            },
            "A decomposable attention model for Natural Language Inference": {
                "url": "https://github.com/explosion/spaCy/tree/master/examples/keras_parikh_entailment",
                "author": "Matthew Honnibal",
-                "tags": [ "keras", "similarity" ]
+                "tags": ["keras", "similarity"]
            },

            "Using the German model": {
                "url": "https://explosion.ai/blog/german-model",
                "author": "Wolfgang Seeker",
-                "tags": [ "multi-lingual" ]
+                "tags": ["multi-lingual"]
            },
            "Sense2vec with spaCy and Gensim": {
                "url": "https://explosion.ai/blog/sense2vec-with-spacy",
                "author": "Matthew Honnibal",
-                "tags": [ "big data", "gensim" ]
+                "tags": ["big data", "gensim"]
            },
            "Building your bot's brain with Node.js and spaCy": {
                "url": "https://explosion.ai/blog/chatbot-node-js-spacy",
                "author": "Wah Loon Keng",
-                "tags": [ "bots", "node.js" ]
+                "tags": ["bots", "node.js"]
            },
            "An intent classifier with spaCy": {
                "url": "http://blog.themusio.com/2016/07/18/musios-intent-classifier-2/",
                "author": "Musio",
-                "tags": [ "bots", "keras" ]
+                "tags": ["bots", "keras"]
            },
            "Visual Question Answering with spaCy": {
                "url": "http://iamaaditya.github.io/2016/04/visual_question_answering_demo_notebook",
                "author": "Aaditya Prakash",
-                "tags": [ "vqa", "keras" ]
+                "tags": ["vqa", "keras"]
+            },
+            "Extracting time suggestions from emails with spaCy": {
+                "url": "https://medium.com/redsift-outbox/what-time-cc9ce0c2aed2",
+                "author": "Chris Savvopoulos",
+                "tags": ["ner"]
            }
        },

@ -315,22 +319,22 @@
            "Information extraction": {
                "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py",
                "author": "Matthew Honnibal",
-                "tags": [ "snippet" ]
+                "tags": ["snippet"]
            },
            "Neural bag of words": {
                "url": "https://github.com/explosion/spaCy/blob/master/examples/nn_text_class.py",
                "author": "Matthew Honnibal",
-                "tags": [ "sentiment" ]
+                "tags": ["sentiment"]
            },
            "Part-of-speech tagging": {
                "url": "https://github.com/explosion/spaCy/blob/master/examples/pos_tag.py",
                "author": "Matthew Honnibal",
-                "tags": [ "pos" ]
+                "tags": ["pos"]
            },
            "Parallel parse": {
                "url": "https://github.com/explosion/spaCy/blob/master/examples/parallel_parse.py",
                "author": "Matthew Honnibal",
-                "tags": [ "big data" ]
+                "tags": ["big data"]
            },
            "Inventory count": {
                "url": "https://github.com/explosion/spaCy/tree/master/examples/inventory_count",
@ -339,7 +343,7 @@
            "Multi-word matches": {
                "url": "https://github.com/explosion/spaCy/blob/master/examples/multi_word_matches.py",
                "author": "Matthew Honnibal",
-                "tags": [ "matcher", "out of date" ]
+                "tags": ["matcher", "out of date"]
            }
        }
    }