Matthew Honnibal 2018-02-07 01:29:39 +01:00
commit c087a14380
11 changed files with 83 additions and 10 deletions

View File

@@ -87,11 +87,11 @@ U.S. Federal law. Any choice of law rules will not apply.
 7. Please place an “x” on one of the applicable statement below. Please do NOT
    mark both statements:
 
-    * [ ] I am signing on behalf of myself as an individual and no other person
+    * [x] I am signing on behalf of myself as an individual and no other person
     or entity, including my employer, has or will have rights with respect to my
     contributions.
 
-    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    * [x] I am signing on behalf of my employer or a legal entity and I have the
     actual authority to contractually bind that entity.
 
 ## Contributor Details

View File

@@ -218,7 +218,7 @@ then call its ``load()`` method:
     import spacy
     import en_core_web_sm
 
-    nlp = en_core_web_.load()
+    nlp = en_core_web_sm.load()
     doc = nlp(u'This is a sentence.')
 
 📖 **For more info and examples, check out the**

View File

@@ -192,6 +192,7 @@ def setup_package():
         'thinc>=6.10.1,<6.11.0',
         'plac<1.0.0,>=0.9.6',
         'six',
+        'html5lib==1.0b8',
         'pathlib',
         'ujson>=1.35',
         'dill>=0.2,<0.3',

View File

@@ -36,7 +36,7 @@ def init_model(lang, output_dir, freqs_loc, clusters_loc=None, vectors_loc=None,
     vectors_loc = ensure_path(vectors_loc)
 
     probs, oov_prob = read_freqs(freqs_loc)
-    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else None, None
+    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
     clusters = read_clusters(clusters_loc) if clusters_loc else {}
 
     nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors)

@@ -69,7 +69,7 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
         lex_added += 1
     nlp.vocab.cfg.update({'oov_prob': oov_prob})
-    if vectors_data:
+    if len(vectors_data):
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:
        nlp.vocab.prune_vectors(prune_vectors)
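Both changes in this file are behaviour fixes. The first repairs an operator-precedence bug: the conditional expression binds tighter than the tuple comma, so without parentheses the fallback pair was never built and vector_keys was always None. The second truth-tests with len(), presumably because vectors_data can be a numpy array, whose plain truth value is ambiguous. A minimal sketch of the precedence pitfall, where fake_read_vectors is a hypothetical stand-in for read_vectors:

# Sketch of the precedence bug fixed above.
def fake_read_vectors():
    return [[0.1, 0.2]], ['word']

loc = 'vectors.txt'  # truthy, as when a vectors file is given

# Unparenthesized: parses as ((fake_read_vectors() if loc else None), None),
# so keys is always None and data swallows the whole 2-tuple.
data, keys = fake_read_vectors() if loc else None, None
assert keys is None and data == ([[0.1, 0.2]], ['word'])

# Parenthesized: both branches of the conditional yield a proper pair.
data, keys = fake_read_vectors() if loc else (None, None)
assert data == [[0.1, 0.2]] and keys == ['word']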

View File

@@ -13,6 +13,12 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
+# Borrowing french syntax parser because both languages use
+# universal dependencies for tagging/parsing.
+# Read here for more:
+# https://github.com/explosion/spaCy/pull/1882#issuecomment-361409573
+from .syntax_iterators import SYNTAX_ITERATORS
+
 
 class NorwegianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)

@@ -22,6 +28,7 @@ class NorwegianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     lemma_lookup = LOOKUP
+    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Norwegian(Language):

View File

@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+
+
+def noun_chunks(obj):
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
+    doc = obj.doc  # Ensure works on both Doc and Span.
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add('conj')
+    np_label = doc.vocab.strings.add('NP')
+    seen = set()
+    for i, word in enumerate(obj):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            if any(w.i in seen for w in word.subtree):
+                continue
+            seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+            yield word.left_edge.i, word.right_edge.i+1, np_label
+        elif word.dep == conj:
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                if any(w.i in seen for w in word.subtree):
+                    continue
+                seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+                yield word.left_edge.i, word.right_edge.i+1, np_label
+
+
+SYNTAX_ITERATORS = {
+    'noun_chunks': noun_chunks
+}
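The iterator yields (start, end, label) triples, which spaCy's Doc.noun_chunks property turns into Span objects once the language defaults register it (see the nb/__init__.py hunk above). A rough usage sketch; no pretrained Norwegian model existed at the time of this commit, so the model name below is hypothetical:

import spacy

# 'nb_core_news_sm' is a hypothetical model name; any pipeline that
# POS-tags and dependency-parses Norwegian with UD labels would do.
nlp = spacy.load('nb_core_news_sm')
doc = nlp(u'Jeg liker norsk mat.')

# Doc.noun_chunks looks up 'noun_chunks' in the language's
# syntax_iterators and wraps each yielded triple in a Span.
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.dep_)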

View File

@ -461,7 +461,8 @@ class Language(object):
if hasattr(proc, 'begin_training'): if hasattr(proc, 'begin_training'):
proc.begin_training(get_gold_tuples(), proc.begin_training(get_gold_tuples(),
pipeline=self.pipeline, pipeline=self.pipeline,
sgd=self._optimizer) sgd=self._optimizer,
**cfg)
return self._optimizer return self._optimizer
def evaluate(self, docs_golds, verbose=False): def evaluate(self, docs_golds, verbose=False):
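Forwarding **cfg means keyword arguments given to Language.begin_training now reach every pipe that defines begin_training, where unsupported keys can be rejected rather than silently dropped; the new test below exercises exactly that path. A one-line sketch, with the hidden_depth key taken from that test:

# cfg keys now flow through to each pipe's begin_training, so the
# pipe itself can validate them.
nlp.begin_training(hidden_depth=2)  # raises ValueError from the NER pipe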

View File

@@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...language import Language
+
+
+def test_simple_ner():
+    cfg = {
+        'hidden_depth': 2,  # should error out
+    }
+
+    nlp = Language()
+    nlp.add_pipe(nlp.create_pipe('ner'))
+    nlp.get_pipe('ner').add_label('answer')
+    try:
+        nlp.begin_training(**cfg)
+        assert False  # should error out
+    except ValueError:
+        assert True
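The try/except/assert pattern works, but pytest.raises states the same expectation more directly. An equivalent sketch (the committed test keeps the explicit form):

import pytest

from ...language import Language


def test_simple_ner_with_pytest_raises():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe('ner'))
    nlp.get_pipe('ner').add_label('answer')
    with pytest.raises(ValueError):
        nlp.begin_training(hidden_depth=2)  # unsupported key should error out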

View File

@@ -245,7 +245,9 @@ p Check whether an extension has been registered on the #[code Doc] class.
     +tag method
     +tag-new(2)
 
-p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+p
+    | Create a #[code Span] object from the slice #[code doc.text[start : end]].
+    | Returns #[code None] if the character indices don't map to a valid span.
 
 +aside-code("Example").
     doc = nlp(u'I like New York')

@@ -276,7 +278,7 @@ p Create a #[code Span] object from the slice #[code doc.text[start : end]].
     +row("foot")
         +cell returns
         +cell #[code Span]
-        +cell The newly constructed object.
+        +cell The newly constructed object or #[code None].
 
 +h(2, "similarity") Doc.similarity
     +tag method
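The documented behaviour is easy to check: character indices that line up with token boundaries give a Span, and anything else now gives None instead of raising. A small sketch (any installed English pipeline works; only tokenization matters here):

import spacy

nlp = spacy.load('en_core_web_sm')  # assumed installed
doc = nlp(u'I like New York')

assert doc.char_span(7, 15).text == u'New York'  # aligns with token boundaries
assert doc.char_span(7, 13) is None              # "New Yo" splits a token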

View File

@@ -185,7 +185,7 @@ p
 
 p
     | Install a version of the
-    | #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Bulild Tools] or
+    | #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Build Tools] or
     | #[+a("https://www.visualstudio.com/vs/visual-studio-express/") Visual Studio Express]
     | that matches the version that was used to compile your Python
     | interpreter. For official distributions these are:

View File

@ -74,7 +74,8 @@ p
| #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet]. | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet].
| Simply convert the dependency parse or recognised entities to displaCy's | Simply convert the dependency parse or recognised entities to displaCy's
| format and set #[code manual=True] on either #[code render()] or | format and set #[code manual=True] on either #[code render()] or
| #[code serve()]. | #[code serve()]. When setting #[code ents] manually, make sure to supply
| them in the right order, i.e. starting with the lowest start position.
+aside-code("Example"). +aside-code("Example").
ex = [{'text': 'But Google is starting from behind.', ex = [{'text': 'But Google is starting from behind.',
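A sketch of the manual format with that ordering constraint observed; the dict mirrors the example in the aside above:

from spacy import displacy

ex = [{'text': 'But Google is starting from behind.',
       # entries in 'ents' must be sorted by 'start', lowest first
       'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
       'title': None}]
html = displacy.render(ex, style='ent', manual=True)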