Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-07-22 01:14:28 +02:00
commit c86445bdfd
13 changed files with 22173 additions and 16773 deletions

View File

@@ -73,10 +73,10 @@ def generate_sentence(sent):
     tokens = []
     for i, id in enumerate(id_):
         token = {}
-        token["orth"] = word[id]
-        token["tag"] = tag[id]
-        token["head"] = head[id] - i
-        token["dep"] = dep[id]
+        token["orth"] = word[i]
+        token["tag"] = tag[i]
+        token["head"] = head[i] - id
+        token["dep"] = dep[i]
         tokens.append(token)
     sentence["tokens"] = tokens
     return sentence
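
The fix swaps which variable indexes the parallel columns: word, tag, head and dep are lists indexed by position i, while id is the token's CoNLL ID, so the head field becomes a relative offset (head ID minus the token's own ID). A minimal standalone sketch with hypothetical data, not the actual converter:

    id_  = [1, 2, 3]
    word = ["She", "reads", "books"]
    tag  = ["PRP", "VBZ", "NNS"]
    head = [2, 2, 2]   # head IDs; the root points to itself here, so its offset is 0
    dep  = ["nsubj", "ROOT", "obj"]

    tokens = []
    for i, id in enumerate(id_):
        tokens.append({
            "orth": word[i],
            "tag": tag[i],
            "head": head[i] - id,   # e.g. "She": 2 - 1 = +1, one token to the right
            "dep": dep[i],
        })
    print(tokens)   # head offsets come out as [+1, 0, -1]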

View File

@@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
+from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS

@@ -24,6 +25,7 @@ class FrenchDefaults(Language.Defaults):
     infixes = tuple(TOKENIZER_INFIXES)
     suffixes = tuple(TOKENIZER_SUFFIXES)
     token_match = TOKEN_MATCH
+    syntax_iterators = dict(SYNTAX_ITERATORS)

     @classmethod
     def create_lemmatizer(cls, nlp=None):

File diff suppressed because it is too large

View File

@@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals

from ...symbols import NOUN, PROPN, PRON


def noun_chunks(obj):
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
    """
    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
    doc = obj.doc  # Ensure works on both Doc and Span.
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add('conj')
    np_label = doc.vocab.strings.add('NP')
    seen = set()
    for i, word in enumerate(obj):
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.i in seen:
            continue
        if word.dep in np_deps:
            if any(w.i in seen for w in word.subtree):
                continue
            seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
            yield word.left_edge.i, word.right_edge.i+1, np_label
        elif word.dep == conj:
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                if any(w.i in seen for w in word.subtree):
                    continue
                seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
                yield word.left_edge.i, word.right_edge.i+1, np_label


SYNTAX_ITERATORS = {
    'noun_chunks': noun_chunks
}
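
Once FrenchDefaults registers this dict (see the fr/__init__.py change above), the iterator backs doc.noun_chunks for French. A rough usage sketch, assuming a French pipeline with a dependency parser is installed under the shortcut 'fr' (the model name is an assumption):

    import spacy

    # Assumes a parsed French pipeline is available as 'fr'.
    nlp = spacy.load('fr')
    doc = nlp(u"La grande maison de mon voisin est en vente.")
    for chunk in doc.noun_chunks:
        # Each chunk is a Span over a base noun phrase.
        print(chunk.text, chunk.root.dep_)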

View File

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS

@@ -15,7 +16,7 @@ class PolishDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'pl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

View File

@@ -0,0 +1,23 @@
# encoding: utf8
from __future__ import unicode_literals

from ..symbols import ORTH, LEMMA, POS, ADJ, ADV, NOUN


_exc = {}

# Abbreviations that keep their trailing period and carry an expanded lemma.
for exc_data in [
    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
    _exc[exc_data[ORTH]] = [dict(exc_data)]

# Bare abbreviations without extra attributes.
for orth in [
        "w.", "r."]:
    _exc[orth] = [{ORTH: orth}]

TOKENIZER_EXCEPTIONS = dict(_exc)
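
A quick sketch of the effect once PolishDefaults merges these into its tokenizer exceptions (uses spacy.blank from the v2 API; no trained Polish model is needed for tokenization):

    import spacy

    nlp = spacy.blank('pl')
    doc = nlp(u"Mieszka tu od 1990 r., tzn. od dawna.")
    print([t.text for t in doc])
    # With the exceptions above, "r." and "tzn." should survive as single tokens
    # instead of having their trailing period split off.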

View File

@@ -15,6 +15,7 @@ class Chinese(Language):
             raise ImportError("The Chinese tokenizer requires the Jieba library: "
                               "https://github.com/fxsjy/jieba")
         words = list(jieba.cut(text, cut_all=True))
+        words = [x for x in words if x]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))
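
The added filter drops empty strings, which jieba's full mode (cut_all=True) can emit around punctuation and whitespace; passing zero-length words to Doc would otherwise cause problems. A small sketch of the same guard outside spaCy, assuming the jieba package is installed:

    import jieba

    text = u"我爱北京天安门。"
    words = list(jieba.cut(text, cut_all=True))
    print(words)                       # full mode may include '' entries
    words = [x for x in words if x]    # same guard as in the patch
    print(words)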

View File

@@ -110,5 +110,35 @@ def es_noun_chunks(obj):
         token = next_token(token)

+
+def french_noun_chunks(obj):
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
+    doc = obj.doc  # Ensure works on both Doc and Span.
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add('conj')
+    np_label = doc.vocab.strings.add('NP')
+    seen = set()
+    for i, word in enumerate(obj):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            if any(w.i in seen for w in word.subtree):
+                continue
+            seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+            yield word.left_edge.i, word.right_edge.i+1, np_label
+        elif word.dep == conj:
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                if any(w.i in seen for w in word.subtree):
+                    continue
+                seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+                yield word.left_edge.i, word.right_edge.i+1, np_label
+
+
 CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
-            'es': es_noun_chunks}
+            'es': es_noun_chunks, 'fr': french_noun_chunks}

View File

@@ -18,7 +18,7 @@ p
         # Construction 2
         from spacy.tokens import Doc
-        doc = doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
+        doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
                   spaces=[True, False, False])

 +h(2, "init") Doc.__init__

View File

@@ -18,7 +18,7 @@ p
         +cell=cell

 p
-    | Fist, the raw text is split on whitespace characters, similar to
+    | First, the raw text is split on whitespace characters, similar to
     | #[code text.split(' ')]. Then, the tokenizer processes the text from
     | left to right. On each substring, it performs two checks:
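
The contrast described here can be seen directly in a rough sketch (assumes an installed English model; the shortcut name is an assumption):

    import spacy

    nlp = spacy.load('en')
    text = u"Let's go to N.Y.!"
    print(text.split(' '))              # ["Let's", 'go', 'to', 'N.Y.!']
    print([t.text for t in nlp(text)])  # ['Let', "'s", 'go', 'to', 'N.Y.', '!']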

View File

@@ -181,7 +181,7 @@ p
     from spacy.vocab import Vocab

     nlp = spacy.load('en')
-    moby_dick = open('moby_dick.txt', 'r')
+    moby_dick = open('moby_dick.txt', 'r').read()
     doc = nlp(moby_dick)
     doc.to_disk('/moby_dick.bin')
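
The Vocab import in the surrounding snippet hints at the matching load step; a sketch of the round trip under spaCy v2's Doc serialization API:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    # The vocab has to be supplied explicitly when deserializing.
    doc = Doc(Vocab()).from_disk('/moby_dick.bin')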

View File

@@ -4,7 +4,7 @@ include ../../_includes/_mixins
 p
     | As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy]
-    | and #[+a(DEMOS_URL + "displacy-ent") displaCy #[sup ENT]] are finally an
+    | and #[+a(DEMOS_URL + "/displacy-ent") displaCy #[sup ENT]] are finally an
     | official part of the library. Visualizing a dependency parse or named
     | entities in a text is not only a fun NLP demo, it can also be incredibly
     | helpful in speeding up development and debugging your code and training
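
As a sketch of what "an official part of the library" means in practice, assuming spaCy v2's displacy module and an installed English model:

    import spacy
    from spacy import displacy

    nlp = spacy.load('en')
    doc = nlp(u"displaCy renders dependency parses and named entities.")
    # style='dep' draws the parse; style='ent' highlights named entities.
    displacy.serve(doc, style='dep')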

View File

@@ -77,7 +77,7 @@ p
 +code.
     doc1 = nlp(u"Paris is the largest city in France.")
-    doc2 = nlp(u"Ljubljana is the capital of Lithuania.")
+    doc2 = nlp(u"Vilnius is the capital of Lithuania.")
     doc3 = nlp(u"An emu is a large bird.")

     for doc in [doc1, doc2, doc3]:

@@ -85,13 +85,13 @@ p
         print(doc.similarity(other_doc))

 p
-    | Even though the sentences about Paris and Ljubljana consist of different
+    | Even though the sentences about Paris and Vilnius consist of different
     | words and entities, they both describe the same concept and are seen as
     | more similar than the sentence about emus. In this case, even a misspelled
-    | version of "Ljubljana" would still produce very similar results.
+    | version of "Vilnius" would still produce very similar results.

 +table
-    - var examples = {"Paris is the largest city in France.": [1, 0.84, 0.65], "Ljubljana is the capital of Lithuania.": [0.84, 1, 0.52], "An emu is a large bird.": [0.65, 0.52, 1]}
+    - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
     - var counter = 0
     +row