mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
c86445bdfd
|
@ -73,10 +73,10 @@ def generate_sentence(sent):
|
||||||
tokens = []
|
tokens = []
|
||||||
for i, id in enumerate(id_):
|
for i, id in enumerate(id_):
|
||||||
token = {}
|
token = {}
|
||||||
token["orth"] = word[id]
|
token["orth"] = word[i]
|
||||||
token["tag"] = tag[id]
|
token["tag"] = tag[i]
|
||||||
token["head"] = head[id] - i
|
token["head"] = head[i] - id
|
||||||
token["dep"] = dep[id]
|
token["dep"] = dep[i]
|
||||||
tokens.append(token)
|
tokens.append(token)
|
||||||
sentence["tokens"] = tokens
|
sentence["tokens"] = tokens
|
||||||
return sentence
|
return sentence
|
||||||
|
|
|
@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lemmatizer import LOOKUP
|
from .lemmatizer import LOOKUP
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
|
@ -24,6 +25,7 @@ class FrenchDefaults(Language.Defaults):
|
||||||
infixes = tuple(TOKENIZER_INFIXES)
|
infixes = tuple(TOKENIZER_INFIXES)
|
||||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||||
token_match = TOKEN_MATCH
|
token_match = TOKEN_MATCH
|
||||||
|
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_lemmatizer(cls, nlp=None):
|
def create_lemmatizer(cls, nlp=None):
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
42
spacy/lang/fr/syntax_iterators.py
Normal file
42
spacy/lang/fr/syntax_iterators.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
|
|
||||||
|
|
||||||
|
def noun_chunks(obj):
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.

    Yields (start, end, label) tuples, where start is the index of the
    candidate's left edge, end is one past the index of its right edge, and
    label is the string-store id of 'NP'.
    """
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    # Dependency labels whose nominal head is taken to start a noun phrase.
    np_deps = {
        strings[label]
        for label in ('nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT',
                      'appos', 'nmod', 'nmod:poss')
    }
    conj = strings.add('conj')
    np_label = strings.add('NP')
    # Token indices already covered by a chunk that has been yielded.
    seen = set()
    for token in obj:
        if token.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if token.i in seen:
            continue
        if token.dep in np_deps:
            heads_chunk = True
        elif token.dep == conj:
            # Follow the chain of conjuncts back through heads that precede
            # the current node.
            anchor = token.head
            while anchor.dep == conj and anchor.head.i < anchor.i:
                anchor = anchor.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            heads_chunk = anchor.dep in np_deps
        else:
            heads_chunk = False
        if not heads_chunk:
            continue
        # Skip candidates whose subtree overlaps an earlier chunk.
        if any(child.i in seen for child in token.subtree):
            continue
        start = token.left_edge.i
        end = token.right_edge.i + 1
        seen.update(range(start, end))
        yield start, end, np_label
|
||||||
|
|
||||||
|
|
||||||
|
# Registry mapping iterator names to their implementations in this module.
SYNTAX_ITERATORS = {'noun_chunks': noun_chunks}
|
|
@ -1,6 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
@ -15,7 +16,7 @@ class PolishDefaults(Language.Defaults):
|
||||||
lex_attr_getters[LANG] = lambda text: 'pl'
|
lex_attr_getters[LANG] = lambda text: 'pl'
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||||
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = set(STOP_WORDS)
|
stop_words = set(STOP_WORDS)
|
||||||
|
|
||||||
|
|
||||||
|
|
23
spacy/lang/pl/tokenizer_exceptions.py
Normal file
23
spacy/lang/pl/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
# coding: utf8
from __future__ import unicode_literals

# ADV, ADJ and NOUN are used as POS values below, so they must be imported
# alongside the attribute ids. This file lives in spacy/lang/pl/, so the
# symbols module is three package levels up.
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN


_exc = {}

# Abbreviations that carry an explicit lemma and part-of-speech.
for exc_data in [
        {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
        {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
        {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
        {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
        {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
        {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
    # Each value is a plain list of token dicts, matching the entries added
    # by the loop below. (A trailing comma here would store a 1-tuple
    # wrapping the list instead.)
    _exc[exc_data[ORTH]] = [dict(exc_data)]

# Abbreviations that only need to be kept together as a single token.
for orth in ["w.", "r."]:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = dict(_exc)
|
|
@ -15,6 +15,7 @@ class Chinese(Language):
|
||||||
raise ImportError("The Chinese tokenizer requires the Jieba library: "
|
raise ImportError("The Chinese tokenizer requires the Jieba library: "
|
||||||
"https://github.com/fxsjy/jieba")
|
"https://github.com/fxsjy/jieba")
|
||||||
words = list(jieba.cut(text, cut_all=True))
|
words = list(jieba.cut(text, cut_all=True))
|
||||||
|
words=[x for x in words if x]
|
||||||
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -110,5 +110,35 @@ def es_noun_chunks(obj):
|
||||||
token = next_token(token)
|
token = next_token(token)
|
||||||
|
|
||||||
|
|
||||||
|
def french_noun_chunks(obj):
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.

    Yields (start, end, label) tuples, where start is the index of the
    candidate's left edge, end is one past the index of its right edge, and
    label is the string-store id of 'NP'.
    """
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    # Dependency labels whose nominal head is taken to start a noun phrase.
    np_deps = {
        strings[label]
        for label in ('nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT',
                      'appos', 'nmod', 'nmod:poss')
    }
    conj = strings.add('conj')
    np_label = strings.add('NP')
    # Token indices already covered by a chunk that has been yielded.
    seen = set()
    for token in obj:
        if token.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if token.i in seen:
            continue
        if token.dep in np_deps:
            heads_chunk = True
        elif token.dep == conj:
            # Follow the chain of conjuncts back through heads that precede
            # the current node.
            anchor = token.head
            while anchor.dep == conj and anchor.head.i < anchor.i:
                anchor = anchor.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            heads_chunk = anchor.dep in np_deps
        else:
            heads_chunk = False
        if not heads_chunk:
            continue
        # Skip candidates whose subtree overlaps an earlier chunk.
        if any(child.i in seen for child in token.subtree):
            continue
        start = token.left_edge.i
        end = token.right_edge.i + 1
        seen.update(range(start, end))
        yield start, end, np_label
|
||||||
|
|
||||||
|
|
||||||
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
|
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
|
||||||
'es': es_noun_chunks}
|
'es': es_noun_chunks, 'fr': french_noun_chunks}
|
||||||
|
|
|
@ -18,7 +18,7 @@ p
|
||||||
|
|
||||||
# Construction 2
|
# Construction 2
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
doc = doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
|
doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
|
||||||
spaces=[True, False, False])
|
spaces=[True, False, False])
|
||||||
|
|
||||||
+h(2, "init") Doc.__init__
|
+h(2, "init") Doc.__init__
|
||||||
|
|
|
@ -18,7 +18,7 @@ p
|
||||||
+cell=cell
|
+cell=cell
|
||||||
|
|
||||||
p
|
p
|
||||||
| Fist, the raw text is split on whitespace characters, similar to
|
| First, the raw text is split on whitespace characters, similar to
|
||||||
| #[code text.split(' ')]. Then, the tokenizer processes the text from
|
| #[code text.split(' ')]. Then, the tokenizer processes the text from
|
||||||
| left to right. On each substring, it performs two checks:
|
| left to right. On each substring, it performs two checks:
|
||||||
|
|
||||||
|
|
|
@ -181,7 +181,7 @@ p
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
nlp = spacy.load('en')
|
nlp = spacy.load('en')
|
||||||
moby_dick = open('moby_dick.txt', 'r')
|
moby_dick = open('moby_dick.txt', 'r').read()
|
||||||
doc = nlp(moby_dick)
|
doc = nlp(moby_dick)
|
||||||
doc.to_disk('/moby_dick.bin')
|
doc.to_disk('/moby_dick.bin')
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@ include ../../_includes/_mixins
|
||||||
|
|
||||||
p
|
p
|
||||||
| As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy]
|
| As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy]
|
||||||
| and #[+a(DEMOS_URL + "displacy-ent") displaCy #[sup ENT]] are finally an
|
| and #[+a(DEMOS_URL + "/displacy-ent") displaCy #[sup ENT]] are finally an
|
||||||
| official part of the library. Visualizing a dependency parse or named
|
| official part of the library. Visualizing a dependency parse or named
|
||||||
| entities in a text is not only a fun NLP demo – it can also be incredibly
|
| entities in a text is not only a fun NLP demo – it can also be incredibly
|
||||||
| helpful in speeding up development and debugging your code and training
|
| helpful in speeding up development and debugging your code and training
|
||||||
|
|
|
@ -77,7 +77,7 @@ p
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
doc1 = nlp(u"Paris is the largest city in France.")
|
doc1 = nlp(u"Paris is the largest city in France.")
|
||||||
doc2 = nlp(u"Ljubljana is the capital of Lithuania.")
|
doc2 = nlp(u"Vilnius is the capital of Lithuania.")
|
||||||
doc3 = nlp(u"An emu is a large bird.")
|
doc3 = nlp(u"An emu is a large bird.")
|
||||||
|
|
||||||
for doc in [doc1, doc2, doc3]:
|
for doc in [doc1, doc2, doc3]:
|
||||||
|
@ -85,13 +85,13 @@ p
|
||||||
print(doc.similarity(other_doc))
|
print(doc.similarity(other_doc))
|
||||||
|
|
||||||
p
|
p
|
||||||
| Even though the sentences about Paris and Ljubljana consist of different
|
| Even though the sentences about Paris and Vilnius consist of different
|
||||||
| words and entities, they both describe the same concept and are seen as
|
| words and entities, they both describe the same concept and are seen as
|
||||||
| more similar than the sentence about emus. In this case, even a misspelled
|
| more similar than the sentence about emus. In this case, even a misspelled
|
||||||
| version of "Ljubljana" would still produce very similar results.
|
| version of "Vilnius" would still produce very similar results.
|
||||||
|
|
||||||
+table
|
+table
|
||||||
- var examples = {"Paris is the largest city in France.": [1, 0.84, 0.65], "Ljubljana is the capital of Lithuania.": [0.84, 1, 0.52], "An emu is a large bird.": [0.65, 0.52, 1]}
|
- var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
|
||||||
- var counter = 0
|
- var counter = 0
|
||||||
|
|
||||||
+row
|
+row
|
||||||
|
|
Loading…
Reference in New Issue
Block a user