Matthew Honnibal 2018-02-07 01:29:39 +01:00
commit c087a14380
11 changed files with 83 additions and 10 deletions

View File

@@ -87,11 +87,11 @@ U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [ ] I am signing on behalf of myself as an individual and no other person
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
* [x] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details

View File

@@ -218,7 +218,7 @@ then call its ``load()`` method:
import spacy
import en_core_web_sm
nlp = en_core_web_.load()
nlp = en_core_web_sm.load()
doc = nlp(u'This is a sentence.')
📖 **For more info and examples, check out the**

View File

@@ -192,6 +192,7 @@ def setup_package():
'thinc>=6.10.1,<6.11.0',
'plac<1.0.0,>=0.9.6',
'six',
'html5lib==1.0b8',
'pathlib',
'ujson>=1.35',
'dill>=0.2,<0.3',

View File

@@ -36,7 +36,7 @@ def init_model(lang, output_dir, freqs_loc, clusters_loc=None, vectors_loc=None,
vectors_loc = ensure_path(vectors_loc)
probs, oov_prob = read_freqs(freqs_loc)
vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else None, None
vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
clusters = read_clusters(clusters_loc) if clusters_loc else {}
nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors)
@@ -69,7 +69,7 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
lex_added += 1
nlp.vocab.cfg.update({'oov_prob': oov_prob})
if vectors_data:
if len(vectors_data):
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
if prune_vectors >= 1:
nlp.vocab.prune_vectors(prune_vectors)
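
The first hunk above fixes an operator-precedence bug: the conditional expression binds tighter than the comma, so the unparenthesised version unpacks the wrong tuple. A minimal standalone sketch of the pitfall (read_vectors here is a stand-in for the real loader):

def read_vectors(loc):
    return ['data'], ['keys']  # stand-in: returns a (data, keys) pair

vectors_loc = 'vectors.txt'
# Buggy: parses as ((read_vectors(...) if vectors_loc else None), None),
# so vectors_data receives the whole pair and vector_keys is always None.
vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else None, None
assert vector_keys is None
# Fixed: the parenthesised fallback keeps the two-element unpacking intact.
vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
assert vector_keys == ['keys']

The second hunk follows from the first: once vectors_data can be a NumPy array, truth-testing it with "if vectors_data:" raises the "truth value of an array is ambiguous" error for arrays with more than one element, so the length check is used instead.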

View File

@@ -13,6 +13,12 @@ from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
# Borrowing French syntax iterators because both languages use
# Universal Dependencies for tagging/parsing.
# Read here for more:
# https://github.com/explosion/spaCy/pull/1882#issuecomment-361409573
from .syntax_iterators import SYNTAX_ITERATORS
class NorwegianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@@ -22,6 +28,7 @@ class NorwegianDefaults(Language.Defaults):
stop_words = STOP_WORDS
tag_map = TAG_MAP
lemma_lookup = LOOKUP
syntax_iterators = SYNTAX_ITERATORS
class Norwegian(Language):

View File

@@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}
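
Once NorwegianDefaults registers SYNTAX_ITERATORS, the generator above backs Doc.noun_chunks for Norwegian texts. A hedged usage sketch (the model name and example sentence are illustrative; any nb pipeline with a trained dependency parser is assumed):

import spacy

nlp = spacy.load('nb_core_news_sm')  # hypothetical Norwegian model name
doc = nlp(u'Jeg liker store hunder.')
for chunk in doc.noun_chunks:  # served by the noun_chunks generator above
    print(chunk.text, chunk.root.dep_)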

View File

@@ -461,7 +461,8 @@ class Language(object):
if hasattr(proc, 'begin_training'):
proc.begin_training(get_gold_tuples(),
pipeline=self.pipeline,
sgd=self._optimizer)
sgd=self._optimizer,
**cfg)
return self._optimizer
def evaluate(self, docs_golds, verbose=False):
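
With this change, extra keyword arguments passed to Language.begin_training() are forwarded to each component's begin_training() via **cfg, so model settings can be supplied at training time. A minimal sketch mirroring the new test below (token_vector_width is an illustrative cfg key, not a guaranteed one):

from spacy.language import Language

nlp = Language()
nlp.add_pipe(nlp.create_pipe('ner'))
nlp.get_pipe('ner').add_label('ANSWER')
# Keyword arguments now reach the NER component's model configuration;
# unsupported settings raise a ValueError (see the new test below).
optimizer = nlp.begin_training(token_vector_width=64)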

View File

@@ -0,0 +1,19 @@
# coding: utf8
from __future__ import unicode_literals
from ...language import Language
def test_simple_ner():
cfg = {
'hidden_depth': 2, # should error out
}
nlp = Language()
nlp.add_pipe(nlp.create_pipe('ner'))
nlp.get_pipe('ner').add_label('answer')
try:
nlp.begin_training(**cfg)
assert False # should error out
except ValueError:
assert True

View File

@@ -245,7 +245,9 @@ p Check whether an extension has been registered on the #[code Doc] class.
+tag method
+tag-new(2)
p Create a #[code Span] object from the slice #[code doc.text[start : end]].
p
| Create a #[code Span] object from the slice #[code doc.text[start : end]].
| Returns #[code None] if the character indices don't map to a valid span.
+aside-code("Example").
doc = nlp(u'I like New York')
@@ -276,7 +278,7 @@ p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+row("foot")
+cell returns
+cell #[code Span]
+cell The newly constructed object.
+cell The newly constructed object or #[code None].
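
A short sketch of the documented behaviour (assumes the en_core_web_sm model is installed):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I like New York')
assert doc.char_span(7, 15).text == 'New York'  # aligns with token boundaries
assert doc.char_span(8, 15) is None  # starts mid-token, so no valid span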
+h(2, "similarity") Doc.similarity
+tag method

View File

@@ -185,7 +185,7 @@ p
p
| Install a version of the
| #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Bulild Tools] or
| #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Build Tools] or
| #[+a("https://www.visualstudio.com/vs/visual-studio-express/") Visual Studio Express]
| that matches the version that was used to compile your Python
| interpreter. For official distributions these are:

View File

@@ -74,7 +74,8 @@ p
| #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet].
| Simply convert the dependency parse or recognised entities to displaCy's
| format and set #[code manual=True] on either #[code render()] or
| #[code serve()].
| #[code serve()]. When setting #[code ents] manually, make sure to supply
| them in the right order, i.e. starting with the lowest start position.
+aside-code("Example").
ex = [{'text': 'But Google is starting from behind.',
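
A hedged sketch of the ordering requirement described above (entity offsets are illustrative; displacy.render with manual=True is the documented entry point):

from spacy import displacy

ex = [{'text': 'But Google is starting from behind.',
       'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
       'title': None}]
# Sort manually-built entities by their start offset before rendering:
ex[0]['ents'] = sorted(ex[0]['ents'], key=lambda e: e['start'])
html = displacy.render(ex, style='ent', manual=True)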