Merge branch 'master' of https://github.com/explosion/spaCy

commit c087a14380
4  .github/CONTRIBUTOR_AGREEMENT.md  vendored

@@ -87,11 +87,11 @@ U.S. Federal law. Any choice of law rules will not apply.
 7. Please place an “x” on one of the applicable statement below. Please do NOT
    mark both statements:
 
-    * [ ] I am signing on behalf of myself as an individual and no other person
+    * [x] I am signing on behalf of myself as an individual and no other person
     or entity, including my employer, has or will have rights with respect to my
     contributions.
 
-    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    * [x] I am signing on behalf of my employer or a legal entity and I have the
     actual authority to contractually bind that entity.
 
 ## Contributor Details
@@ -218,7 +218,7 @@ then call its ``load()`` method:
     import spacy
     import en_core_web_sm
 
-    nlp = en_core_web_.load()
+    nlp = en_core_web_sm.load()
     doc = nlp(u'This is a sentence.')
 
 📖 **For more info and examples, check out the**
1  setup.py

@@ -192,6 +192,7 @@ def setup_package():
         'thinc>=6.10.1,<6.11.0',
         'plac<1.0.0,>=0.9.6',
         'six',
+        'html5lib==1.0b8',
         'pathlib',
         'ujson>=1.35',
         'dill>=0.2,<0.3',
@@ -36,7 +36,7 @@ def init_model(lang, output_dir, freqs_loc, clusters_loc=None, vectors_loc=None,
     vectors_loc = ensure_path(vectors_loc)
 
     probs, oov_prob = read_freqs(freqs_loc)
-    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else None, None
+    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
     clusters = read_clusters(clusters_loc) if clusters_loc else {}
 
     nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors)
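The one-line fix above addresses a genuine Python precedence bug: the conditional expression binds tighter than the tuple comma, so the old right-hand side always built a 2-tuple whose second element was None. A minimal standalone sketch of the two parses, using a hypothetical read_vectors_stub in place of read_vectors:

# Stand-in for read_vectors(): returns a (data, keys) pair.
def read_vectors_stub(loc):
    return ([0.1, 0.2], ['a', 'b'])

vectors_loc = 'vectors.txt'

# Old line parses as ((read_vectors_stub(...) if vectors_loc else None), None):
vectors_data, vector_keys = read_vectors_stub(vectors_loc) if vectors_loc else None, None
assert vectors_data == ([0.1, 0.2], ['a', 'b'])  # the whole pair, not just the data
assert vector_keys is None                       # the keys are silently dropped

# Fixed line makes the else branch an explicit 2-tuple, so unpacking works:
vectors_data, vector_keys = read_vectors_stub(vectors_loc) if vectors_loc else (None, None)
assert vectors_data == [0.1, 0.2]
assert vector_keys == ['a', 'b']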
@@ -69,7 +69,7 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
             lex_added += 1
     nlp.vocab.cfg.update({'oov_prob': oov_prob})
 
-    if vectors_data:
+    if len(vectors_data):
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)
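Replacing "if vectors_data:" with "if len(vectors_data):" matters because the vectors are typically a NumPy array at this point, and calling bool() on a multi-element array raises instead of answering. A short sketch, assuming vectors_data is an ndarray:

import numpy

vectors_data = numpy.zeros((2, 3))

try:
    if vectors_data:            # old check: ambiguous for arrays
        pass
except ValueError as err:
    print(err)  # "The truth value of an array with more than one element is ambiguous..."

if len(vectors_data):           # new check: counts rows, always well defined
    print('have %d vector rows' % len(vectors_data))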
@@ -13,6 +13,12 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
+# Borrowing french syntax parser because both languages use
+# universal dependencies for tagging/parsing.
+# Read here for more:
+# https://github.com/explosion/spaCy/pull/1882#issuecomment-361409573
+from .syntax_iterators import SYNTAX_ITERATORS
 
 
 class NorwegianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)

@@ -22,6 +28,7 @@ class NorwegianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     lemma_lookup = LOOKUP
+    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Norwegian(Language):
42  spacy/lang/nb/syntax_iterators.py  Normal file

@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+
+
+def noun_chunks(obj):
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
+    doc = obj.doc  # Ensure works on both Doc and Span.
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add('conj')
+    np_label = doc.vocab.strings.add('NP')
+    seen = set()
+    for i, word in enumerate(obj):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            if any(w.i in seen for w in word.subtree):
+                continue
+            seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+            yield word.left_edge.i, word.right_edge.i+1, np_label
+        elif word.dep == conj:
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                if any(w.i in seen for w in word.subtree):
+                    continue
+                seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+                yield word.left_edge.i, word.right_edge.i+1, np_label
+
+
+SYNTAX_ITERATORS = {
+    'noun_chunks': noun_chunks
+}
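Once registered on the language defaults (the syntax_iterators = SYNTAX_ITERATORS line above), the iterator surfaces through the standard Doc.noun_chunks API. A rough usage sketch; it needs a Norwegian pipeline with a trained dependency parser, and the model name below is purely hypothetical:

import spacy

nlp = spacy.load('nb_model')  # hypothetical Norwegian model with a parser
doc = nlp(u'Jeg liker London og Berlin.')
for chunk in doc.noun_chunks:
    # each chunk is a Span covering one base noun phrase
    print(chunk.text, chunk.root.dep_)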
@@ -461,7 +461,8 @@ class Language(object):
             if hasattr(proc, 'begin_training'):
                 proc.begin_training(get_gold_tuples(),
                                     pipeline=self.pipeline,
-                                    sgd=self._optimizer)
+                                    sgd=self._optimizer,
+                                    **cfg)
         return self._optimizer
 
     def evaluate(self, docs_golds, verbose=False):
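The added **cfg means any keyword arguments handed to Language.begin_training now reach every pipeline component, which is what lets a component validate and reject unsupported settings. A generic sketch of the forwarding pattern, with hypothetical names rather than spaCy's actual internals:

def begin_training(get_gold_tuples, pipeline, optimizer, **cfg):
    # Extra keyword arguments are forwarded to each component unchanged,
    # so a component can raise ValueError on config it doesn't understand.
    for name, proc in pipeline:
        if hasattr(proc, 'begin_training'):
            proc.begin_training(get_gold_tuples(), pipeline=pipeline,
                                sgd=optimizer, **cfg)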
19  spacy/tests/regression/test_issue1915.py  Normal file

@@ -0,0 +1,19 @@
+# coding: utf8
+
+from __future__ import unicode_literals
+from ...language import Language
+
+
+def test_simple_ner():
+    cfg = {
+        'hidden_depth': 2,  # should error out
+    }
+
+    nlp = Language()
+    nlp.add_pipe(nlp.create_pipe('ner'))
+    nlp.get_pipe('ner').add_label('answer')
+    try:
+        nlp.begin_training(**cfg)
+        assert False  # should error out
+    except ValueError:
+        assert True
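For reference, the same regression check reads a little tighter with pytest's raises context manager, which the spaCy test suite already depends on; a sketch:

import pytest

from ...language import Language


def test_simple_ner_raises():
    cfg = {'hidden_depth': 2}  # unsupported setting, should error out
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe('ner'))
    nlp.get_pipe('ner').add_label('answer')
    with pytest.raises(ValueError):
        nlp.begin_training(**cfg)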
@@ -245,7 +245,9 @@ p Check whether an extension has been registered on the #[code Doc] class.
 +tag method
 +tag-new(2)
 
-p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+p
+    | Create a #[code Span] object from the slice #[code doc.text[start : end]].
+    | Returns #[code None] if the character indices don't map to a valid span.
 
 +aside-code("Example").
     doc = nlp(u'I like New York')
@@ -276,7 +278,7 @@ p Create a #[code Span] object from the slice #[code doc.text[start : end]].
 +row("foot")
     +cell returns
     +cell #[code Span]
-    +cell The newly constructed object.
+    +cell The newly constructed object or #[code None].
 
 +h(2, "similarity") Doc.similarity
 +tag method
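Concretely, the behaviour documented above is that the character offsets must line up with token boundaries, otherwise Doc.char_span returns None rather than a Span. A short sketch, assuming an English model is installed:

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I like New York')

span = doc.char_span(7, 15)            # aligns with the tokens 'New' and 'York'
assert span.text == u'New York'
assert doc.char_span(7, 14) is None    # 'New Yor' doesn't map to token boundaries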
@@ -185,7 +185,7 @@ p
 
 p
     | Install a version of the
-    | #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Bulild Tools] or
+    | #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Build Tools] or
     | #[+a("https://www.visualstudio.com/vs/visual-studio-express/") Visual Studio Express]
     | that matches the version that was used to compile your Python
     | interpreter. For official distributions these are:
@@ -74,7 +74,8 @@ p
     | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet].
     | Simply convert the dependency parse or recognised entities to displaCy's
     | format and set #[code manual=True] on either #[code render()] or
-    | #[code serve()].
+    | #[code serve()]. When setting #[code ents] manually, make sure to supply
+    | them in the right order, i.e. starting with the lowest start position.
 
 +aside-code("Example").
     ex = [{'text': 'But Google is starting from behind.',
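The aside the docs are extending uses displaCy's manual entity format. A sketch of rendering such a dict with the ents kept in ascending start order, as the new sentence requires; the offsets below are illustrative:

from spacy import displacy

ex = [{'text': 'But Google is starting from behind.',
       'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
       'title': None}]

# Sort the spans by their lowest start position before rendering:
ex[0]['ents'].sort(key=lambda span: span['start'])
html = displacy.render(ex, style='ent', manual=True)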
|
|
Loading…
Reference in New Issue
Block a user