Merge github.com:explosion/spaCy into dutch

Janneke van der Zwaan 2016-12-13 09:25:23 +01:00
commit 4a3fdcce8a
10 changed files with 143 additions and 19 deletions

View File

@@ -6,7 +6,7 @@ Cython. spaCy is built on the very latest research, but it isn't researchware.
It was designed from day 1 to be used in real products. It's commercial
open-source software, released under the MIT license.
-💫 **Version 1.2 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
+💫 **Version 1.3 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
.. image:: http://i.imgur.com/wFvLZyJ.png
:target: https://travis-ci.org/explosion/spaCy
@@ -241,8 +241,38 @@ calling ``spacy.load()``, or by passing a ``path`` argument to the ``spacy.en.En
Changelog
=========
-2016-11-04 `v1.2.0 <https://github.com/explosion/spaCy/releases>`_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
--------------------------------------------------------------------------------------------------------------------------------------------
+2016-12-03 `v1.3.0 <https://github.com/explosion/spaCy/releases>`_: *Improve API consistency*
+---------------------------------------------------------------------------------------------
+**✨ API improvements**
+* Add ``Span.sentiment`` attribute.
+* `#658 <https://github.com/explosion/spaCy/pull/658>`_: Add ``Span.noun_chunks`` iterator (thanks `@pokey <https://github.com/pokey>`_).
+* `#642 <https://github.com/explosion/spaCy/pull/642>`_: Let ``--data-path`` be specified when running download.py scripts (thanks `@ExplodingCabbage <https://github.com/ExplodingCabbage>`_).
+* `#638 <https://github.com/explosion/spaCy/pull/638>`_: Add German stopwords (thanks `@souravsingh <https://github.com/souravsingh>`_).
+* `#614 <https://github.com/explosion/spaCy/pull/614>`_: Fix ``PhraseMatcher`` to work with new ``Matcher`` (thanks `@sadovnychyi <https://github.com/sadovnychyi>`_).
+**🔴 Bug fixes**
+* Fix issue `#605 <https://github.com/explosion/spaCy/issues/605>`_: ``accept`` argument to ``Matcher`` now rejects matches as expected.
+* Fix issue `#617 <https://github.com/explosion/spaCy/issues/617>`_: ``Vocab.load()`` now works with string paths, as well as ``Path`` objects.
+* Fix issue `#639 <https://github.com/explosion/spaCy/issues/639>`_: Stop words in ``Language`` class now used as expected.
+* Fix issues `#656 <https://github.com/explosion/spaCy/issues/656>`_, `#624 <https://github.com/explosion/spaCy/issues/624>`_: ``Tokenizer`` special-case rules now support arbitrary token attributes.
+**📖 Documentation and examples**
+* Add `"Customizing the tokenizer" <https://spacy.io/docs/usage/customizing-tokenizer>`_ workflow.
+* Add `"Training the tagger, parser and entity recognizer" <https://spacy.io/docs/usage/training>`_ workflow.
+* Add `"Entity recognition" <https://spacy.io/docs/usage/entity-recognition>`_ workflow.
+* Fix various typos and inconsistencies.
+**👥 Contributors**
+Thanks to `@pokey <https://github.com/pokey>`_, `@ExplodingCabbage <https://github.com/ExplodingCabbage>`_, `@souravsingh <https://github.com/souravsingh>`_, `@sadovnychyi <https://github.com/sadovnychyi>`_, `@manojsakhwar <https://github.com/manojsakhwar>`_, `@TiagoMRodrigues <https://github.com/TiagoMRodrigues>`_, `@savkov <https://github.com/savkov>`_, `@pspiegelhalter <https://github.com/pspiegelhalter>`_, `@chenb67 <https://github.com/chenb67>`_, `@kylepjohnson <https://github.com/kylepjohnson>`_, `@YanhaoYang <https://github.com/YanhaoYang>`_, `@tjrileywisc <https://github.com/tjrileywisc>`_, `@dechov <https://github.com/dechov>`_, `@wjt <https://github.com/wjt>`_, `@jsmootiv <https://github.com/jsmootiv>`_ and `@blarghmatey <https://github.com/blarghmatey>`_ for the pull requests!
+2016-11-04 `v1.2.0 <https://github.com/explosion/spaCy/releases/tag/v1.2.0>`_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
+------------------------------------------------------------------------------------------------------------------------------------------------------
**✨ Major features and improvements**
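A minimal sketch of the two headline API additions above, ``Span.sentiment`` and ``Span.noun_chunks``. This assumes an installed ``en`` model; token sentiment defaults to 0.0 unless the lexemes carry scores, so the printed number is illustrative only:

import spacy

nlp = spacy.load('en')
doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')
span = doc[0:5]
for chunk in span.noun_chunks:    # new in 1.3: noun chunks on a Span, not just a Doc
    print(chunk.text)
print(span.sentiment)             # default behaviour: mean of token.sentiment over the span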

View File

@@ -0,0 +1,22 @@
# Load NER
from __future__ import unicode_literals
import spacy
import pathlib
from spacy.pipeline import EntityRecognizer
from spacy.vocab import Vocab


def load_model(model_dir):
    model_dir = pathlib.Path(model_dir)
    # Load the pipeline without the parser, entity recognizer or word vectors.
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    # Restore the string store and lexemes saved by the training script.
    with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
    # require=True raises an error if the model files are missing.
    ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
    return (nlp, ner)


(nlp, ner) = load_model('ner')
doc = nlp.make_doc('Who is Shaka Khan?')
nlp.tagger(doc)
ner(doc)
for word in doc:
    print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
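The recognizer writes its predictions onto the tokens, and ``doc.ents`` regroups them into entity spans. A short follow-up to the script above, using only attributes that already exist on the processed ``doc``:

for ent in doc.ents:
    print(ent.label_, ent.text)    # e.g. PERSON Shaka Khan, if the trained model tags it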

View File

@@ -10,6 +10,13 @@ from spacy.tagger import Tagger
def train_ner(nlp, train_data, entity_types):
    # Add new words to vocab.
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]
    # Train NER.
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)
@@ -20,21 +27,30 @@ def train_ner(nlp, train_data, entity_types):
    ner.model.end_training()
    return ner


def save_model(ner, model_dir):
    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    assert model_dir.is_dir()

    with (model_dir / 'config.json').open('w') as file_:
        json.dump(ner.cfg, file_)
    ner.model.dump(str(model_dir / 'model'))
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
    with (model_dir / 'vocab' / 'strings.json').open('w', encoding='utf8') as file_:
        ner.vocab.strings.dump(file_)


def main(model_dir=None):
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        assert model_dir.is_dir()

    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
-        print('please run: `python -m spacy.en.download force all` for better performance')
+        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
@@ -56,16 +72,17 @@ def main(model_dir=None):
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
-        print(word.text, word.tag_, word.ent_type_, word.ent_iob)
+        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
    if model_dir is not None:
-        with (model_dir / 'config.json').open('w') as file_:
-            json.dump(ner.cfg, file_)
-        ner.model.dump(str(model_dir / 'model'))
+        save_model(ner, model_dir)


if __name__ == '__main__':
-    main()
+    main('ner')
# Expected output (ent_iob codes: 2 = outside an entity, 3 = beginning of an entity):
# Who "" 2
# is "" 2
# Shaka "" PERSON 3
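As a sanity check after training, the directory written by ``save_model()`` can be read back with the ``load_model()`` helper from the loading example above; a minimal sketch, where the ``'ner'`` path simply mirrors the ``main('ner')`` call:

nlp2, ner2 = load_model('ner')    # helper from the companion loading script
doc2 = nlp2.make_doc(u'Who is Shaka Khan?')
nlp2.tagger(doc2)
ner2(doc2)
print([(w.text, w.ent_type_, w.ent_iob) for w in doc2])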

View File

@@ -69,7 +69,7 @@ def main(output_dir=None):
        print(word.text, word.tag_, word.pos_)
    if output_dir is not None:
        tagger.model.dump(str(output_dir / 'pos' / 'model'))
-        with (output_dir / 'vocab' / 'strings.json').open('wb') as file_:
+        with (output_dir / 'vocab' / 'strings.json').open('w') as file_:
            tagger.vocab.strings.dump(file_)

View File

@@ -4,7 +4,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy'
-__version__ = '1.2.0'
+__version__ = '1.3.0'
__summary__ = 'Industrial-strength NLP'
__uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'

View File

@@ -426,3 +426,9 @@ cpdef enum symbol_t:
    #IS_QUOTE
    #IS_LEFT_PUNCT
    #IS_RIGHT_PUNCT

    # These symbols are currently missing. However, if we add them now,
    # we'll throw off the integer index and the model will have to be retrained.
    # We therefore wait until the next data version to add them.
    # acl
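The ordering constraint in miniature: enum members take consecutive integer IDs, so appending new symbols is safe, while inserting them mid-list renumbers everything after the insertion point. A hedged plain-Python analogy, not the actual Cython source:

# Symbol IDs are positional, like list indices.
symbols = ['IS_QUOTE', 'IS_LEFT_PUNCT', 'IS_RIGHT_PUNCT']
ids = {name: i for i, name in enumerate(symbols)}
# Appending 'acl' gives it a fresh ID; inserting it earlier would
# shift every later ID and invalidate models trained on the old mapping.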

View File

@@ -1,6 +1,7 @@
from __future__ import unicode_literals
from spacy.attrs import HEAD
from spacy.en import English
from spacy.tokens.doc import Doc
import numpy as np
import pytest
@@ -49,3 +50,44 @@ def test_sent(doc):
    assert span.sent.text == 'This is a sentence.'
    span = doc[6:7]
    assert span.sent.root.left_edge.text == 'This'
def test_default_sentiment(EN):
    '''Test new span.sentiment property's default averaging behaviour'''
    good = EN.vocab[u'good']
    good.sentiment = 3.0
    bad = EN.vocab[u'bad']
    bad.sentiment = -2.0
    doc = Doc(EN.vocab, [u'good', u'stuff', u'bad', u'stuff'])
    good_stuff = doc[:2]
    assert good_stuff.sentiment == 3.0 / 2
    bad_stuff = doc[-2:]
    assert bad_stuff.sentiment == -2.0 / 2
    good_stuff_bad = doc[:-1]
    assert good_stuff_bad.sentiment == (3.0 - 2.0) / 3.0


def test_override_sentiment(EN):
    '''Test that doc.user_span_hooks['sentiment'] overrides the default averaging'''
    good = EN.vocab[u'good']
    good.sentiment = 3.0
    bad = EN.vocab[u'bad']
    bad.sentiment = -2.0
    doc = Doc(EN.vocab, [u'good', u'stuff', u'bad', u'stuff'])
    doc.user_span_hooks['sentiment'] = lambda span: 10.0
    good_stuff = doc[:2]
    assert good_stuff.sentiment == 10.0
    bad_stuff = doc[-2:]
    assert bad_stuff.sentiment == 10.0
    good_stuff_bad = doc[:-1]
    assert good_stuff_bad.sentiment == 10.0

View File

@@ -179,6 +179,13 @@ cdef class Span:
        self._vector_norm = sqrt(norm) if norm != 0 else 0
        return self._vector_norm

    property sentiment:
        def __get__(self):
            # A user hook takes precedence over the default token average.
            if 'sentiment' in self.doc.user_span_hooks:
                return self.doc.user_span_hooks['sentiment'](self)
            else:
                return sum([token.sentiment for token in self]) / len(self)

    property text:
        def __get__(self):
            text = self.text_with_ws
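The hook checked above lets downstream code replace the averaging default wholesale. A minimal sketch mirroring the tests in the previous file; ``external_score`` is a hypothetical stand-in for any callable, and a bare ``Vocab()`` is assumed to be enough to construct a ``Doc`` here (the tests use the English vocab instead):

from spacy.tokens.doc import Doc
from spacy.vocab import Vocab

def external_score(text):
    # Hypothetical stand-in for a real sentiment model.
    return 0.5 if u'good' in text else -0.5

doc = Doc(Vocab(), [u'good', u'stuff'])
doc.user_span_hooks['sentiment'] = lambda span: external_score(span.text)
print(doc[:].sentiment)    # routed through the hook instead of token averaging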

View File

@@ -14,7 +14,7 @@ p After reading this page, you should be able to:
+h(3, "no-job-too-big") No job too big

p
-    | When writing spaCy, one of my motos was #[em no job too big]. I wanted
+    | When writing spaCy, one of my mottos was #[em no job too big]. I wanted
    | to make sure that if Google or Facebook were founded tomorrow, spaCy
    | would be the obvious choice for them. I wanted spaCy to be the obvious
    | choice for web-scale NLP. This meant sweating about performance, because

View File

@@ -217,7 +217,7 @@ p
    ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
]
-nlp = spacy.load(entity=False, parser=False)
+nlp = spacy.load('en', entity=False, parser=False)
ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
for itn in range(5):
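The snippet breaks off inside the training loop; a plausible continuation, assuming the ``GoldParse``-based update API that spaCy 1.x training examples use (``random`` and ``GoldParse`` would be imported at the top of the snippet, and the exact body is not shown on this page, so treat this as a sketch):

    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = nlp.make_doc(raw_text)
        gold = GoldParse(doc, entities=entity_offsets)    # from spacy.gold import GoldParse
        ner.update(doc, gold)
ner.model.end_training()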