Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-07-22 01:14:28 +02:00
commit c86445bdfd
13 changed files with 22173 additions and 16773 deletions

View File

@@ -73,10 +73,10 @@ def generate_sentence(sent):
    tokens = []
    for i, id in enumerate(id_):
        token = {}
-        token["orth"] = word[id]
-        token["tag"] = tag[id]
-        token["head"] = head[id] - i
-        token["dep"] = dep[id]
+        token["orth"] = word[i]
+        token["tag"] = tag[i]
+        token["head"] = head[i] - id
+        token["dep"] = dep[i]
        tokens.append(token)
    sentence["tokens"] = tokens
    return sentence
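
The change above swaps the lookup index: each token attribute is read by its position `i` in the parallel CoNLL columns, and the head is stored as an offset relative to the token's own id. Below is a minimal standalone sketch of the corrected logic (not the converter itself), using made-up column data:

def tokens_from_columns(ids, words, tags, heads, deps):
    tokens = []
    for i, id_ in enumerate(ids):
        tokens.append({
            "orth": words[i],        # surface form at position i in the sentence
            "tag": tags[i],          # fine-grained tag at the same position
            "head": heads[i] - id_,  # head id stored as an offset from this token's id
            "dep": deps[i],          # dependency label at the same position
        })
    return tokens

# Hypothetical 1-based CoNLL-style columns for "Je mange":
print(tokens_from_columns(
    ids=[1, 2], words=["Je", "mange"], tags=["PRON", "VERB"],
    heads=[2, 0], deps=["nsubj", "ROOT"]))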

View File

@@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
+from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@@ -24,6 +25,7 @@ class FrenchDefaults(Language.Defaults):
    infixes = tuple(TOKENIZER_INFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    token_match = TOKEN_MATCH
+    syntax_iterators = dict(SYNTAX_ITERATORS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):

File diff suppressed because it is too large.

View File

@@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals

from ...symbols import NOUN, PROPN, PRON


def noun_chunks(obj):
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
    """
    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
    doc = obj.doc # Ensure works on both Doc and Span.
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add('conj')
    np_label = doc.vocab.strings.add('NP')
    seen = set()
    for i, word in enumerate(obj):
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.i in seen:
            continue
        if word.dep in np_deps:
            if any(w.i in seen for w in word.subtree):
                continue
            seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
            yield word.left_edge.i, word.right_edge.i+1, np_label
        elif word.dep == conj:
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                if any(w.i in seen for w in word.subtree):
                    continue
                seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
                yield word.left_edge.i, word.right_edge.i+1, np_label


SYNTAX_ITERATORS = {
    'noun_chunks': noun_chunks
}
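
For context, the SYNTAX_ITERATORS table registered here is what backs Doc.noun_chunks for French once the language defaults pick it up. A rough usage sketch, assuming a French pipeline with a dependency parser is installed and loadable as 'fr' (the model name is an assumption of this sketch):

import spacy

nlp = spacy.load('fr')   # assumes a French model with a parser is linked as 'fr'
doc = nlp(u"La grande maison de mes parents est à Paris.")

# Each (start, end, label) tuple yielded by the registered iterator is exposed
# as a Span through the doc.noun_chunks property.
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.dep_)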

View File

@@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals

+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
@@ -15,7 +16,7 @@ class PolishDefaults(Language.Defaults):
    lex_attr_getters[LANG] = lambda text: 'pl'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)

View File

@@ -0,0 +1,23 @@
# encoding: utf8
from __future__ import unicode_literals

from ..symbols import ORTH, LEMMA, POS, ADJ, ADV, NOUN

_exc = {}

for exc_data in [
    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
    _exc[exc_data[ORTH]] = [dict(exc_data)]


for orth in [
    "w.", "r."]:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = dict(_exc)
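
A quick way to see the intended effect of these exception entries, assuming the Polish class is importable as below (the module path is an assumption of this sketch):

from spacy.lang.pl import Polish   # import path assumed for this sketch

nlp = Polish()
doc = nlp(u"Wystąpili m.in. goście z Polski.")
print([t.text for t in doc])
# With the exception table merged in, "m.in." should come through as a single
# token rather than being broken up by the punctuation rules.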

View File

@@ -15,6 +15,7 @@ class Chinese(Language):
            raise ImportError("The Chinese tokenizer requires the Jieba library: "
                              "https://github.com/fxsjy/jieba")
        words = list(jieba.cut(text, cut_all=True))
+        words=[x for x in words if x]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))
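
The new filter line guards against jieba's full mode, which can yield empty strings around punctuation; those are not useful as tokens and are dropped before the Doc is built. A small sketch of the behaviour, assuming the jieba package is installed:

import jieba   # assumes the jieba package is installed

text = u"我来到北京清华大学。"
words = list(jieba.cut(text, cut_all=True))
print(words)                        # full mode may include empty string entries
words = [x for x in words if x]     # drop them before constructing the Doc
print(words)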

View File

@@ -110,5 +110,35 @@ def es_noun_chunks(obj):
        token = next_token(token)

+def french_noun_chunks(obj):
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
+    doc = obj.doc # Ensure works on both Doc and Span.
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add('conj')
+    np_label = doc.vocab.strings.add('NP')
+    seen = set()
+    for i, word in enumerate(obj):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            if any(w.i in seen for w in word.subtree):
+                continue
+            seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+            yield word.left_edge.i, word.right_edge.i+1, np_label
+        elif word.dep == conj:
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                if any(w.i in seen for w in word.subtree):
+                    continue
+                seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+                yield word.left_edge.i, word.right_edge.i+1, np_label

CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
-            'es': es_noun_chunks}
+            'es': es_noun_chunks, 'fr': french_noun_chunks}

View File

@@ -18,7 +18,7 @@ p
# Construction 2
from spacy.tokens import Doc
-doc = doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
+doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
spaces=[True, False, False])

+h(2, "init") Doc.__init__

View File

@@ -18,7 +18,7 @@ p
+cell=cell
p
-| Fist, the raw text is split on whitespace characters, similar to
+| First, the raw text is split on whitespace characters, similar to
| #[code text.split(' ')]. Then, the tokenizer processes the text from
| left to right. On each substring, it performs two checks:
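
As a deliberately simplified illustration of the two checks described above (is there an exception rule for the substring, and can punctuation be split off its edges?), the toy function below mimics that flow; it is not spaCy's actual tokenizer:

EXCEPTIONS = {u"don't": [u"do", u"n't"]}   # toy exception table

def toy_tokenize(text):
    tokens = []
    for substring in text.split(' '):
        suffixes = []
        while substring:
            if substring in EXCEPTIONS:            # check 1: an exception rule wins outright
                tokens.extend(EXCEPTIONS[substring])
                substring = ''
            elif substring[0] in u'("\'':          # check 2: split a punctuation char off the front
                tokens.append(substring[0])
                substring = substring[1:]
            elif substring[-1] in u')"\'.,!?:':    # check 2: split a punctuation char off the end
                suffixes.append(substring[-1])
                substring = substring[:-1]
            else:                                  # nothing left to split: keep the remainder
                tokens.append(substring)
                substring = ''
        tokens.extend(reversed(suffixes))
    return tokens

print(toy_tokenize(u'She said: "don\'t stop!"'))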

View File

@@ -181,7 +181,7 @@ p
from spacy.vocab import Vocab
nlp = spacy.load('en')
-moby_dick = open('moby_dick.txt', 'r')
+moby_dick = open('moby_dick.txt', 'r').read()
doc = nlp(moby_dick)
doc.to_disk('/moby_dick.bin')
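
The fix matters because nlp() expects the text itself, not a file handle. For completeness, a sketch of the reverse direction, assuming the v2 serialization API and that the file written above exists:

from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab()).from_disk('/moby_dick.bin')   # read the binary file back into a Doc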

View File

@@ -4,7 +4,7 @@ include ../../_includes/_mixins
p
| As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy]
-| and #[+a(DEMOS_URL + "displacy-ent") displaCy #[sup ENT]] are finally an
+| and #[+a(DEMOS_URL + "/displacy-ent") displaCy #[sup ENT]] are finally an
| official part of the library. Visualizing a dependency parse or named
| entities in a text is not only a fun NLP demo it can also be incredibly
| helpful in speeding up development and debugging your code and training
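
A minimal sketch of calling the built-in visualizer, assuming an English model is installed ('en' is an assumption here); displacy.serve starts a small local web server:

import spacy
from spacy import displacy

nlp = spacy.load('en')
doc = nlp(u"displaCy renders dependency parses and named entities in the browser.")
displacy.serve(doc, style='dep')    # use style='ent' for the entity visualizer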

View File

@@ -77,7 +77,7 @@ p
+code.
doc1 = nlp(u"Paris is the largest city in France.")
-doc2 = nlp(u"Ljubljana is the capital of Lithuania.")
+doc2 = nlp(u"Vilnius is the capital of Lithuania.")
doc3 = nlp(u"An emu is a large bird.")
for doc in [doc1, doc2, doc3]:
@@ -85,13 +85,13 @@ p
print(doc.similarity(other_doc))
p
-| Even though the sentences about Paris and Ljubljana consist of different
+| Even though the sentences about Paris and Vilnius consist of different
| words and entities, they both describe the same concept and are seen as
| more similar than the sentence about emus. In this case, even a misspelled
-| version of "Ljubljana" would still produce very similar results.
+| version of "Vilnius" would still produce very similar results.
+table
-- var examples = {"Paris is the largest city in France.": [1, 0.84, 0.65], "Ljubljana is the capital of Lithuania.": [0.84, 1, 0.52], "An emu is a large bird.": [0.65, 0.52, 1]}
+- var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
- var counter = 0
+row