This commit is contained in:
Matthew Honnibal 2017-01-31 13:27:34 -06:00
commit 16ce7409e4
7 changed files with 52 additions and 30 deletions

View File

@ -9,6 +9,12 @@ from spacy.gold import GoldParse
from spacy.tagger import Tagger
# Python 2/3 compatibility: Python 3 has no built-in ``unicode``, so alias it
# to ``str``. Only NameError is caught (instead of a bare ``except``) so that
# unrelated errors such as KeyboardInterrupt are never silently swallowed.
try:
    unicode
except NameError:
    unicode = str
def train_ner(nlp, train_data, entity_types):
# Add new words to vocab.
for raw_text, _ in train_data:
@ -24,7 +30,6 @@ def train_ner(nlp, train_data, entity_types):
doc = nlp.make_doc(raw_text)
gold = GoldParse(doc, entities=entity_offsets)
ner.update(doc, gold)
ner.model.end_training()
return ner
def save_model(ner, model_dir):
@ -33,8 +38,11 @@ def save_model(ner, model_dir):
model_dir.mkdir()
assert model_dir.is_dir()
with (model_dir / 'config.json').open('w') as file_:
json.dump(ner.cfg, file_)
with (model_dir / 'config.json').open('wb') as file_:
data = json.dumps(ner.cfg)
if isinstance(data, unicode):
data = data.encode('utf8')
file_.write(data)
ner.model.dump(str(model_dir / 'model'))
if not (model_dir / 'vocab').exists():
(model_dir / 'vocab').mkdir()

View File

@ -7,6 +7,6 @@ thinc>=6.2.0,<6.3.0
murmurhash>=0.26,<0.27
plac<0.9.3
six
ujson
ujson>=1.35
cloudpickle
sputnik>=0.9.2,<0.10.0

View File

@ -234,7 +234,7 @@ def setup_package():
'cymem>=1.30,<1.32',
'preshed>=0.46.0,<0.47.0',
'thinc>=6.2.0,<6.3.0',
'plac',
'plac<0.9.3',
'six',
'cloudpickle',
'pathlib',

View File

@ -0,0 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
    """Regression test for issue #792: trailing whitespace lost after parsing.

    Tokenizes ``text`` and checks that the document's ``text_with_ws``
    reproduces the input exactly. Marked ``xfail``, so a failure is
    currently the expected outcome.
    """
    # NOTE(review): with ``unicode_literals`` in effect, "\u0020" is the plain
    # space character, so both parametrized inputs look like the same
    # string -- confirm whether a different whitespace character was intended.
    parsed = en_tokenizer(text)
    round_tripped = parsed.text_with_ws
    assert round_tripped == text

View File

@ -500,7 +500,8 @@ cdef class Doc:
by the values of the given attribute ID.
Example:
from spacy.en import English, attrs
from spacy.en import English
from spacy import attrs
nlp = English()
tokens = nlp(u'apple apple orange banana')
tokens.count_by(attrs.ORTH)
@ -585,9 +586,6 @@ cdef class Doc:
elif attr_id == POS:
for i in range(length):
tokens[i].pos = <univ_pos_t>values[i]
elif attr_id == TAG:
for i in range(length):
tokens[i].tag = <univ_pos_t>values[i]
elif attr_id == DEP:
for i in range(length):
tokens[i].dep = values[i]

View File

@ -12,10 +12,10 @@
"COMPANY_URL": "https://explosion.ai",
"DEMOS_URL": "https://demos.explosion.ai",
"SPACY_VERSION": "1.5",
"SPACY_VERSION": "1.6",
"LATEST_NEWS": {
"url": "https://explosion.ai/blog/spacy-user-survey",
"title": "The results of the spaCy user survey"
"url": "https://explosion.ai/blog/deep-learning-formula-nlp",
"title": "The new deep learning formula for state-of-the-art NLP models"
},
"SOCIAL": {

View File

@ -272,7 +272,6 @@
"author": "Patrick Harrison (S&P Global)",
"tags": ["jupyter", "gensim"]
},
"Deep Learning with custom pipelines and Keras": {
"url": "https://explosion.ai/blog/spacy-deep-learning-keras",
"author": "Matthew Honnibal",
@ -308,6 +307,11 @@
"url": "http://iamaaditya.github.io/2016/04/visual_question_answering_demo_notebook",
"author": "Aaditya Prakash",
"tags": ["vqa", "keras"]
},
"Extracting time suggestions from emails with spaCy": {
"url": "https://medium.com/redsift-outbox/what-time-cc9ce0c2aed2",
"author": "Chris Savvopoulos",
"tags": ["ner"]
}
},