mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
c86445bdfd
|
@ -73,10 +73,10 @@ def generate_sentence(sent):
|
|||
tokens = []
|
||||
for i, id in enumerate(id_):
|
||||
token = {}
|
||||
token["orth"] = word[id]
|
||||
token["tag"] = tag[id]
|
||||
token["head"] = head[id] - i
|
||||
token["dep"] = dep[id]
|
||||
token["orth"] = word[i]
|
||||
token["tag"] = tag[i]
|
||||
token["head"] = head[i] - id
|
||||
token["dep"] = dep[i]
|
||||
tokens.append(token)
|
||||
sentence["tokens"] = tokens
|
||||
return sentence
|
||||
|
|
|
@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
|||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lemmatizer import LOOKUP
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
|
@ -24,6 +25,7 @@ class FrenchDefaults(Language.Defaults):
|
|||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
token_match = TOKEN_MATCH
|
||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
|
|
File diff suppressed because it is too large
Load Diff
42
spacy/lang/fr/syntax_iterators.py
Normal file
42
spacy/lang/fr/syntax_iterators.py
Normal file
|
@ -0,0 +1,42 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
|
||||
|
||||
def noun_chunks(obj):
    """
    Yield base noun phrases found in a dependency parse.

    Accepts either a Doc or a Span (anything exposing ``.doc`` and iterating
    over tokens). Each yielded item is a ``(start, end, label)`` tuple, where
    ``start`` is the index of the token's left edge, ``end`` is one past the
    index of its right edge, and ``label`` is the string-store value for 'NP'.
    """
    dep_names = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    np_deps = [strings[name] for name in dep_names]
    conj = strings.add('conj')
    np_label = strings.add('NP')
    covered = set()  # indices of tokens already emitted as part of a chunk
    for token in obj:
        # Only nominal tokens that are not already inside an emitted chunk
        # can start a new one (this prevents nested chunks).
        if token.pos not in (NOUN, PROPN, PRON) or token.i in covered:
            continue
        if token.dep in np_deps:
            heads_chunk = True
        elif token.dep == conj:
            # Climb the chain of leftward conjuncts; if the token we end up
            # on carries an NP dependency, this conjunct is an NP as well.
            anchor = token.head
            while anchor.dep == conj and anchor.head.i < anchor.i:
                anchor = anchor.head
            heads_chunk = anchor.dep in np_deps
        else:
            heads_chunk = False
        if not heads_chunk:
            continue
        # Skip candidates whose subtree overlaps a chunk produced earlier.
        if any(w.i in covered for w in token.subtree):
            continue
        start = token.left_edge.i
        end = token.right_edge.i + 1
        covered.update(range(start, end))
        yield start, end, np_label
|
||||
|
||||
|
||||
# Syntax iterators exported by this module, keyed by name.
SYNTAX_ITERATORS = {'noun_chunks': noun_chunks}
|
|
@ -1,6 +1,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
@ -15,7 +16,7 @@ class PolishDefaults(Language.Defaults):
|
|||
lex_attr_getters[LANG] = lambda text: 'pl'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
|
|
23
spacy/lang/pl/tokenizer_exceptions.py
Normal file
23
spacy/lang/pl/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,23 @@
|
|||
# coding: utf8
from __future__ import unicode_literals

# This module lives in spacy/lang/pl/, so the top-level ``symbols`` module is
# three package levels up (same as the other language sub-packages). The POS
# constants used below must be imported as well.
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN


# Maps an exact surface string to the list of token attribute dicts it
# should be tokenized into.
_exc = {}

# Abbreviations kept as a single token, with a lemma and a part-of-speech.
for exc_data in [
        {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
        {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
        {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
        {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
        {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
        {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
    # The value must be a list of dicts, in the same shape as the
    # ``[{ORTH: orth}]`` entries below. A trailing comma here would turn
    # every value into a 1-tuple wrapping the list.
    _exc[exc_data[ORTH]] = [dict(exc_data)]

# Abbreviations kept as a single token without extra attributes.
for orth in ["w.", "r."]:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = dict(_exc)
|
|
@ -15,6 +15,7 @@ class Chinese(Language):
|
|||
raise ImportError("The Chinese tokenizer requires the Jieba library: "
|
||||
"https://github.com/fxsjy/jieba")
|
||||
words = list(jieba.cut(text, cut_all=True))
|
||||
words=[x for x in words if x]
|
||||
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||
|
||||
|
||||
|
|
|
@ -110,5 +110,35 @@ def es_noun_chunks(obj):
|
|||
token = next_token(token)
|
||||
|
||||
|
||||
def french_noun_chunks(obj):
    """
    Yield base noun phrases found in a dependency parse.

    Accepts either a Doc or a Span (anything exposing ``.doc`` and iterating
    over tokens). Each yielded item is a ``(start, end, label)`` tuple, where
    ``start`` is the index of the token's left edge, ``end`` is one past the
    index of its right edge, and ``label`` is the string-store value for 'NP'.
    """
    dep_names = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    np_deps = [strings[name] for name in dep_names]
    conj = strings.add('conj')
    np_label = strings.add('NP')
    covered = set()  # indices of tokens already emitted as part of a chunk
    for token in obj:
        # Only nominal tokens that are not already inside an emitted chunk
        # can start a new one (this prevents nested chunks).
        if token.pos not in (NOUN, PROPN, PRON) or token.i in covered:
            continue
        if token.dep in np_deps:
            heads_chunk = True
        elif token.dep == conj:
            # Climb the chain of leftward conjuncts; if the token we end up
            # on carries an NP dependency, this conjunct is an NP as well.
            anchor = token.head
            while anchor.dep == conj and anchor.head.i < anchor.i:
                anchor = anchor.head
            heads_chunk = anchor.dep in np_deps
        else:
            heads_chunk = False
        if not heads_chunk:
            continue
        # Skip candidates whose subtree overlaps a chunk produced earlier.
        if any(w.i in covered for w in token.subtree):
            continue
        start = token.left_edge.i
        end = token.right_edge.i + 1
        covered.update(range(start, end))
        yield start, end, np_label
|
||||
|
||||
|
||||
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
|
||||
'es': es_noun_chunks}
|
||||
'es': es_noun_chunks, 'fr': french_noun_chunks}
|
||||
|
|
|
@ -18,7 +18,7 @@ p
|
|||
|
||||
# Construction 2
|
||||
from spacy.tokens import Doc
|
||||
doc = doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
|
||||
doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
|
||||
spaces=[True, False, False])
|
||||
|
||||
+h(2, "init") Doc.__init__
|
||||
|
|
|
@ -18,7 +18,7 @@ p
|
|||
+cell=cell
|
||||
|
||||
p
|
||||
| Fist, the raw text is split on whitespace characters, similar to
|
||||
| First, the raw text is split on whitespace characters, similar to
|
||||
| #[code text.split(' ')]. Then, the tokenizer processes the text from
|
||||
| left to right. On each substring, it performs two checks:
|
||||
|
||||
|
|
|
@ -181,7 +181,7 @@ p
|
|||
from spacy.vocab import Vocab
|
||||
|
||||
nlp = spacy.load('en')
|
||||
moby_dick = open('moby_dick.txt', 'r')
|
||||
moby_dick = open('moby_dick.txt', 'r').read()
|
||||
doc = nlp(moby_dick)
|
||||
doc.to_disk('/moby_dick.bin')
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@ include ../../_includes/_mixins
|
|||
|
||||
p
|
||||
| As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy]
|
||||
| and #[+a(DEMOS_URL + "displacy-ent") displaCy #[sup ENT]] are finally an
|
||||
| and #[+a(DEMOS_URL + "/displacy-ent") displaCy #[sup ENT]] are finally an
|
||||
| official part of the library. Visualizing a dependency parse or named
|
||||
| entities in a text is not only a fun NLP demo – it can also be incredibly
|
||||
| helpful in speeding up development and debugging your code and training
|
||||
|
|
|
@ -77,7 +77,7 @@ p
|
|||
|
||||
+code.
|
||||
doc1 = nlp(u"Paris is the largest city in France.")
|
||||
doc2 = nlp(u"Ljubljana is the capital of Lithuania.")
|
||||
doc2 = nlp(u"Vilnius is the capital of Lithuania.")
|
||||
doc3 = nlp(u"An emu is a large bird.")
|
||||
|
||||
for doc in [doc1, doc2, doc3]:
|
||||
|
@ -85,13 +85,13 @@ p
|
|||
print(doc.similarity(other_doc))
|
||||
|
||||
p
|
||||
| Even though the sentences about Paris and Ljubljana consist of different
|
||||
| Even though the sentences about Paris and Vilnius consist of different
|
||||
| words and entities, they both describe the same concept and are seen as
|
||||
| more similar than the sentence about emus. In this case, even a misspelled
|
||||
| version of "Ljubljana" would still produce very similar results.
|
||||
| version of "Vilnius" would still produce very similar results.
|
||||
|
||||
+table
|
||||
- var examples = {"Paris is the largest city in France.": [1, 0.84, 0.65], "Ljubljana is the capital of Lithuania.": [0.84, 1, 0.52], "An emu is a large bird.": [0.65, 0.52, 1]}
|
||||
- var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
|
||||
- var counter = 0
|
||||
|
||||
+row
|
||||
|
|
Loading…
Reference in New Issue
Block a user