mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-06 04:43:17 +03:00
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit
16ce7409e4
|
@ -9,6 +9,12 @@ from spacy.gold import GoldParse
|
||||||
from spacy.tagger import Tagger
|
from spacy.tagger import Tagger
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
unicode
|
||||||
|
except:
|
||||||
|
unicode = str
|
||||||
|
|
||||||
|
|
||||||
def train_ner(nlp, train_data, entity_types):
|
def train_ner(nlp, train_data, entity_types):
|
||||||
# Add new words to vocab.
|
# Add new words to vocab.
|
||||||
for raw_text, _ in train_data:
|
for raw_text, _ in train_data:
|
||||||
|
@ -24,7 +30,6 @@ def train_ner(nlp, train_data, entity_types):
|
||||||
doc = nlp.make_doc(raw_text)
|
doc = nlp.make_doc(raw_text)
|
||||||
gold = GoldParse(doc, entities=entity_offsets)
|
gold = GoldParse(doc, entities=entity_offsets)
|
||||||
ner.update(doc, gold)
|
ner.update(doc, gold)
|
||||||
ner.model.end_training()
|
|
||||||
return ner
|
return ner
|
||||||
|
|
||||||
def save_model(ner, model_dir):
|
def save_model(ner, model_dir):
|
||||||
|
@ -33,8 +38,11 @@ def save_model(ner, model_dir):
|
||||||
model_dir.mkdir()
|
model_dir.mkdir()
|
||||||
assert model_dir.is_dir()
|
assert model_dir.is_dir()
|
||||||
|
|
||||||
with (model_dir / 'config.json').open('w') as file_:
|
with (model_dir / 'config.json').open('wb') as file_:
|
||||||
json.dump(ner.cfg, file_)
|
data = json.dumps(ner.cfg)
|
||||||
|
if isinstance(data, unicode):
|
||||||
|
data = data.encode('utf8')
|
||||||
|
file_.write(data)
|
||||||
ner.model.dump(str(model_dir / 'model'))
|
ner.model.dump(str(model_dir / 'model'))
|
||||||
if not (model_dir / 'vocab').exists():
|
if not (model_dir / 'vocab').exists():
|
||||||
(model_dir / 'vocab').mkdir()
|
(model_dir / 'vocab').mkdir()
|
||||||
|
|
|
@ -7,6 +7,6 @@ thinc>=6.2.0,<6.3.0
|
||||||
murmurhash>=0.26,<0.27
|
murmurhash>=0.26,<0.27
|
||||||
plac<0.9.3
|
plac<0.9.3
|
||||||
six
|
six
|
||||||
ujson
|
ujson>=1.35
|
||||||
cloudpickle
|
cloudpickle
|
||||||
sputnik>=0.9.2,<0.10.0
|
sputnik>=0.9.2,<0.10.0
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -234,7 +234,7 @@ def setup_package():
|
||||||
'cymem>=1.30,<1.32',
|
'cymem>=1.30,<1.32',
|
||||||
'preshed>=0.46.0,<0.47.0',
|
'preshed>=0.46.0,<0.47.0',
|
||||||
'thinc>=6.2.0,<6.3.0',
|
'thinc>=6.2.0,<6.3.0',
|
||||||
'plac',
|
'plac<0.9.3',
|
||||||
'six',
|
'six',
|
||||||
'cloudpickle',
|
'cloudpickle',
|
||||||
'pathlib',
|
'pathlib',
|
||||||
|
|
12
spacy/tests/regression/test_issue792.py
Normal file
12
spacy/tests/regression/test_issue792.py
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
|
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
|
||||||
|
def test_issue792(en_tokenizer, text):
|
||||||
|
"""Test for Issue #792: Trailing whitespace is removed after parsing."""
|
||||||
|
doc = en_tokenizer(text)
|
||||||
|
assert doc.text_with_ws == text
|
|
@ -500,7 +500,8 @@ cdef class Doc:
|
||||||
by the values of the given attribute ID.
|
by the values of the given attribute ID.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
from spacy.en import English, attrs
|
from spacy.en import English
|
||||||
|
from spacy import attrs
|
||||||
nlp = English()
|
nlp = English()
|
||||||
tokens = nlp(u'apple apple orange banana')
|
tokens = nlp(u'apple apple orange banana')
|
||||||
tokens.count_by(attrs.ORTH)
|
tokens.count_by(attrs.ORTH)
|
||||||
|
@ -585,9 +586,6 @@ cdef class Doc:
|
||||||
elif attr_id == POS:
|
elif attr_id == POS:
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
tokens[i].pos = <univ_pos_t>values[i]
|
tokens[i].pos = <univ_pos_t>values[i]
|
||||||
elif attr_id == TAG:
|
|
||||||
for i in range(length):
|
|
||||||
tokens[i].tag = <univ_pos_t>values[i]
|
|
||||||
elif attr_id == DEP:
|
elif attr_id == DEP:
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
tokens[i].dep = values[i]
|
tokens[i].dep = values[i]
|
||||||
|
|
|
@ -12,10 +12,10 @@
|
||||||
"COMPANY_URL": "https://explosion.ai",
|
"COMPANY_URL": "https://explosion.ai",
|
||||||
"DEMOS_URL": "https://demos.explosion.ai",
|
"DEMOS_URL": "https://demos.explosion.ai",
|
||||||
|
|
||||||
"SPACY_VERSION": "1.5",
|
"SPACY_VERSION": "1.6",
|
||||||
"LATEST_NEWS": {
|
"LATEST_NEWS": {
|
||||||
"url": "https://explosion.ai/blog/spacy-user-survey",
|
"url": "https://explosion.ai/blog/deep-learning-formula-nlp",
|
||||||
"title": "The results of the spaCy user survey"
|
"title": "The new deep learning formula for state-of-the-art NLP models"
|
||||||
},
|
},
|
||||||
|
|
||||||
"SOCIAL": {
|
"SOCIAL": {
|
||||||
|
|
|
@ -232,7 +232,7 @@
|
||||||
"NLP with spaCy in 10 lines of code": {
|
"NLP with spaCy in 10 lines of code": {
|
||||||
"url": "https://github.com/cytora/pycon-nlp-in-10-lines",
|
"url": "https://github.com/cytora/pycon-nlp-in-10-lines",
|
||||||
"author": "Andraz Hribernik et al. (Cytora)",
|
"author": "Andraz Hribernik et al. (Cytora)",
|
||||||
"tags": [ "jupyter" ]
|
"tags": ["jupyter"]
|
||||||
},
|
},
|
||||||
"Intro to NLP with spaCy": {
|
"Intro to NLP with spaCy": {
|
||||||
"url": "https://nicschrading.com/project/Intro-to-NLP-with-spaCy/",
|
"url": "https://nicschrading.com/project/Intro-to-NLP-with-spaCy/",
|
||||||
|
@ -241,7 +241,7 @@
|
||||||
"NLP with spaCy and IPython Notebook": {
|
"NLP with spaCy and IPython Notebook": {
|
||||||
"url": "http://blog.sharepointexperience.com/2016/01/nlp-and-sharepoint-part-1/",
|
"url": "http://blog.sharepointexperience.com/2016/01/nlp-and-sharepoint-part-1/",
|
||||||
"author": "Dustin Miller (SharePoint)",
|
"author": "Dustin Miller (SharePoint)",
|
||||||
"tags": [ "jupyter" ]
|
"tags": ["jupyter"]
|
||||||
},
|
},
|
||||||
"Getting Started with spaCy": {
|
"Getting Started with spaCy": {
|
||||||
"url": "http://textminingonline.com/getting-started-with-spacy",
|
"url": "http://textminingonline.com/getting-started-with-spacy",
|
||||||
|
@ -254,7 +254,7 @@
|
||||||
"NLP (almost) From Scratch - POS Network with spaCy": {
|
"NLP (almost) From Scratch - POS Network with spaCy": {
|
||||||
"url": "http://sujitpal.blogspot.de/2016/07/nlp-almost-from-scratch-implementing.html",
|
"url": "http://sujitpal.blogspot.de/2016/07/nlp-almost-from-scratch-implementing.html",
|
||||||
"author": "Sujit Pal",
|
"author": "Sujit Pal",
|
||||||
"tags": [ "gensim", "keras" ]
|
"tags": ["gensim", "keras"]
|
||||||
},
|
},
|
||||||
"NLP tasks with various libraries": {
|
"NLP tasks with various libraries": {
|
||||||
"url": "http://clarkgrubb.com/nlp",
|
"url": "http://clarkgrubb.com/nlp",
|
||||||
|
@ -270,44 +270,48 @@
|
||||||
"Modern NLP in Python – What you can learn about food by analyzing a million Yelp reviews": {
|
"Modern NLP in Python – What you can learn about food by analyzing a million Yelp reviews": {
|
||||||
"url": "http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb",
|
"url": "http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb",
|
||||||
"author": "Patrick Harrison (S&P Global)",
|
"author": "Patrick Harrison (S&P Global)",
|
||||||
"tags": [ "jupyter", "gensim" ]
|
"tags": ["jupyter", "gensim"]
|
||||||
},
|
},
|
||||||
|
|
||||||
"Deep Learning with custom pipelines and Keras": {
|
"Deep Learning with custom pipelines and Keras": {
|
||||||
"url": "https://explosion.ai/blog/spacy-deep-learning-keras",
|
"url": "https://explosion.ai/blog/spacy-deep-learning-keras",
|
||||||
"author": "Matthew Honnibal",
|
"author": "Matthew Honnibal",
|
||||||
"tags": [ "keras", "sentiment" ]
|
"tags": ["keras", "sentiment"]
|
||||||
},
|
},
|
||||||
"A decomposable attention model for Natural Language Inference": {
|
"A decomposable attention model for Natural Language Inference": {
|
||||||
"url": "https://github.com/explosion/spaCy/tree/master/examples/keras_parikh_entailment",
|
"url": "https://github.com/explosion/spaCy/tree/master/examples/keras_parikh_entailment",
|
||||||
"author": "Matthew Honnibal",
|
"author": "Matthew Honnibal",
|
||||||
"tags": [ "keras", "similarity" ]
|
"tags": ["keras", "similarity"]
|
||||||
},
|
},
|
||||||
|
|
||||||
"Using the German model": {
|
"Using the German model": {
|
||||||
"url": "https://explosion.ai/blog/german-model",
|
"url": "https://explosion.ai/blog/german-model",
|
||||||
"author": "Wolfgang Seeker",
|
"author": "Wolfgang Seeker",
|
||||||
"tags": [ "multi-lingual" ]
|
"tags": ["multi-lingual"]
|
||||||
},
|
},
|
||||||
"Sense2vec with spaCy and Gensim": {
|
"Sense2vec with spaCy and Gensim": {
|
||||||
"url": "https://explosion.ai/blog/sense2vec-with-spacy",
|
"url": "https://explosion.ai/blog/sense2vec-with-spacy",
|
||||||
"author": "Matthew Honnibal",
|
"author": "Matthew Honnibal",
|
||||||
"tags": [ "big data", "gensim" ]
|
"tags": ["big data", "gensim"]
|
||||||
},
|
},
|
||||||
"Building your bot's brain with Node.js and spaCy": {
|
"Building your bot's brain with Node.js and spaCy": {
|
||||||
"url": "https://explosion.ai/blog/chatbot-node-js-spacy",
|
"url": "https://explosion.ai/blog/chatbot-node-js-spacy",
|
||||||
"author": "Wah Loon Keng",
|
"author": "Wah Loon Keng",
|
||||||
"tags": [ "bots", "node.js" ]
|
"tags": ["bots", "node.js"]
|
||||||
},
|
},
|
||||||
"An intent classifier with spaCy": {
|
"An intent classifier with spaCy": {
|
||||||
"url": "http://blog.themusio.com/2016/07/18/musios-intent-classifier-2/",
|
"url": "http://blog.themusio.com/2016/07/18/musios-intent-classifier-2/",
|
||||||
"author": "Musio",
|
"author": "Musio",
|
||||||
"tags": [ "bots", "keras" ]
|
"tags": ["bots", "keras"]
|
||||||
},
|
},
|
||||||
"Visual Question Answering with spaCy": {
|
"Visual Question Answering with spaCy": {
|
||||||
"url": "http://iamaaditya.github.io/2016/04/visual_question_answering_demo_notebook",
|
"url": "http://iamaaditya.github.io/2016/04/visual_question_answering_demo_notebook",
|
||||||
"author": "Aaditya Prakash",
|
"author": "Aaditya Prakash",
|
||||||
"tags": [ "vqa", "keras" ]
|
"tags": ["vqa", "keras"]
|
||||||
|
},
|
||||||
|
"Extracting time suggestions from emails with spaCy": {
|
||||||
|
"url": "https://medium.com/redsift-outbox/what-time-cc9ce0c2aed2",
|
||||||
|
"author": "Chris Savvopoulos",
|
||||||
|
"tags": ["ner"]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@ -315,22 +319,22 @@
|
||||||
"Information extraction": {
|
"Information extraction": {
|
||||||
"url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py",
|
"url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py",
|
||||||
"author": "Matthew Honnibal",
|
"author": "Matthew Honnibal",
|
||||||
"tags": [ "snippet" ]
|
"tags": ["snippet"]
|
||||||
},
|
},
|
||||||
"Neural bag of words": {
|
"Neural bag of words": {
|
||||||
"url": "https://github.com/explosion/spaCy/blob/master/examples/nn_text_class.py",
|
"url": "https://github.com/explosion/spaCy/blob/master/examples/nn_text_class.py",
|
||||||
"author": "Matthew Honnibal",
|
"author": "Matthew Honnibal",
|
||||||
"tags": [ "sentiment" ]
|
"tags": ["sentiment"]
|
||||||
},
|
},
|
||||||
"Part-of-speech tagging": {
|
"Part-of-speech tagging": {
|
||||||
"url": "https://github.com/explosion/spaCy/blob/master/examples/pos_tag.py",
|
"url": "https://github.com/explosion/spaCy/blob/master/examples/pos_tag.py",
|
||||||
"author": "Matthew Honnibal",
|
"author": "Matthew Honnibal",
|
||||||
"tags": [ "pos" ]
|
"tags": ["pos"]
|
||||||
},
|
},
|
||||||
"Parallel parse": {
|
"Parallel parse": {
|
||||||
"url": "https://github.com/explosion/spaCy/blob/master/examples/parallel_parse.py",
|
"url": "https://github.com/explosion/spaCy/blob/master/examples/parallel_parse.py",
|
||||||
"author": "Matthew Honnibal",
|
"author": "Matthew Honnibal",
|
||||||
"tags": [ "big data" ]
|
"tags": ["big data"]
|
||||||
},
|
},
|
||||||
"Inventory count": {
|
"Inventory count": {
|
||||||
"url": "https://github.com/explosion/spaCy/tree/master/examples/inventory_count",
|
"url": "https://github.com/explosion/spaCy/tree/master/examples/inventory_count",
|
||||||
|
@ -339,7 +343,7 @@
|
||||||
"Multi-word matches": {
|
"Multi-word matches": {
|
||||||
"url": "https://github.com/explosion/spaCy/blob/master/examples/multi_word_matches.py",
|
"url": "https://github.com/explosion/spaCy/blob/master/examples/multi_word_matches.py",
|
||||||
"author": "Matthew Honnibal",
|
"author": "Matthew Honnibal",
|
||||||
"tags": [ "matcher", "out of date" ]
|
"tags": ["matcher", "out of date"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user