Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-11-23 19:26:03 +03:00 · 2017-07-22 01:14:28 +02:00 · 2017-07-22 01:14:28 +02:00 · c86445bdfd
commit c86445bdfd
parent b3a749610e c91642efd5
13 changed files with 22173 additions and 16773 deletions
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@ -73,10 +73,10 @@ def generate_sentence(sent):
    tokens = []
    for i, id in enumerate(id_):
        token = {}
-        token["orth"] = word[id]
-        token["tag"] = tag[id]
-        token["head"] = head[id] - i
-        token["dep"] = dep[id]
+        token["orth"] = word[i]
+        token["tag"] = tag[i]
+        token["head"] = head[i] - id
+        token["dep"] = dep[i]
        tokens.append(token)
    sentence["tokens"] = tokens
    return sentence
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
+from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@ -24,6 +25,7 @@ class FrenchDefaults(Language.Defaults):
    infixes = tuple(TOKENIZER_INFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    token_match = TOKEN_MATCH
+    syntax_iterators = dict(SYNTAX_ITERATORS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
--- a/spacy/lang/fr/_tokenizer_exceptions_list.py
+++ b/spacy/lang/fr/_tokenizer_exceptions_list.py
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+
+
+def noun_chunks(obj):
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+
+    obj: The object to scan. It is iterated for tokens, and its `.doc`
+        attribute supplies the vocab used to resolve label strings.
+    YIELDS (tuple): `(start, end, label)`, where `start` is the index of the
+        matched word's left edge, `end` is one past the index of its right
+        edge, and `label` is the value returned by
+        `doc.vocab.strings.add('NP')`.
+    """
+    # Dependency labels under which a nominal word is reported as a chunk.
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
+    doc = obj.doc  # Ensure works on both Doc and Span.
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add('conj')
+    np_label = doc.vocab.strings.add('NP')
+    # Indices of tokens already covered by a chunk that has been yielded.
+    seen = set()
+    # NOTE(review): the enumerate index `i` is never used below.
+    for i, word in enumerate(obj):
+        # Only nouns, proper nouns and pronouns can anchor a chunk.
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            # Skip if any token in this word's subtree is already in a chunk.
+            if any(w.i in seen for w in word.subtree):
+                continue
+            seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+            yield word.left_edge.i, word.right_edge.i+1, np_label
+        elif word.dep == conj:
+            # Climb through chained conjuncts for as long as each head
+            # appears earlier in the document than its dependent.
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                if any(w.i in seen for w in word.subtree):
+                    continue
+                seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+                yield word.left_edge.i, word.right_edge.i+1, np_label
+
+
+SYNTAX_ITERATORS = {
+    'noun_chunks': noun_chunks
+}
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
@ -15,7 +16,7 @@ class PolishDefaults(Language.Defaults):
    lex_attr_getters[LANG] = lambda text: 'pl'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)


--- a/spacy/lang/pl/tokenizer_exceptions.py
+++ b/spacy/lang/pl/tokenizer_exceptions.py
@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+# Three dots: this module sits two packages below the one that holds
+# `symbols`. ADV, ADJ and NOUN are imported because the entries below use
+# them as POS values.
+from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
+
+
+# Maps each exception string to a list of token attribute dicts.
+_exc = {}
+
+# Abbreviations registered with a fixed lemma and part-of-speech value.
+for exc_data in [
+    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
+    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
+    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
+    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
+    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
+    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
+    # No trailing comma after the list: a comma here would store a 1-tuple
+    # wrapping the list instead of the list itself, unlike the loop below.
+    _exc[exc_data[ORTH]] = [dict(exc_data)]
+
+# Abbreviations registered with only their ORTH and no further attributes.
+for orth in [
+    "w.", "r."]:
+    _exc[orth] = [{ORTH: orth}]
+
+
+TOKENIZER_EXCEPTIONS = dict(_exc)
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@ -15,6 +15,7 @@ class Chinese(Language):
            raise ImportError("The Chinese tokenizer requires the Jieba library: "
                              "https://github.com/fxsjy/jieba")
        words = list(jieba.cut(text, cut_all=True))
+        words=[x for x in words if x]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))


--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@ -110,5 +110,35 @@ def es_noun_chunks(obj):
        token = next_token(token)


+def french_noun_chunks(obj):
+    """
+    Detect base noun phrases from a dependency parse, using `obj.doc` for
+    vocab lookups while iterating over `obj` itself.
+
+    obj: The object to scan. It is iterated for tokens, and its `.doc`
+        attribute supplies the vocab used to resolve label strings.
+    YIELDS (tuple): `(start, end, label)`, where `start` is the index of the
+        matched word's left edge, `end` is one past the index of its right
+        edge, and `label` is the value returned by
+        `doc.vocab.strings.add('NP')`.
+    """
+    # Dependency labels under which a nominal word is reported as a chunk.
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
+    doc = obj.doc  # Ensure works on both Doc and Span.
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add('conj')
+    np_label = doc.vocab.strings.add('NP')
+    # Indices of tokens already covered by a chunk that has been yielded.
+    seen = set()
+    # NOTE(review): the enumerate index `i` is never used below.
+    for i, word in enumerate(obj):
+        # Only nouns, proper nouns and pronouns can anchor a chunk.
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            # Skip if any token in this word's subtree is already in a chunk.
+            if any(w.i in seen for w in word.subtree):
+                continue
+            seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+            yield word.left_edge.i, word.right_edge.i+1, np_label
+        elif word.dep == conj:
+            # Climb through chained conjuncts for as long as each head
+            # appears earlier in the document than its dependent.
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                if any(w.i in seen for w in word.subtree):
+                    continue
+                seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+                yield word.left_edge.i, word.right_edge.i+1, np_label
+
+
 CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
-            'es': es_noun_chunks}
+            'es': es_noun_chunks, 'fr': french_noun_chunks}
--- a/website/docs/api/doc.jade
+++ b/website/docs/api/doc.jade
@ -18,7 +18,7 @@ p

    # Construction 2
    from spacy.tokens import Doc
-    doc = doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
+    doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
                               spaces=[True, False, False])

 +h(2, "init") Doc.__init__
--- a/website/docs/usage/_spacy-101/_tokenization.jade
+++ b/website/docs/usage/_spacy-101/_tokenization.jade
@ -18,7 +18,7 @@ p
            +cell=cell

 p
-    |  Fist, the raw text is split on whitespace characters, similar to
+    |  First, the raw text is split on whitespace characters, similar to
    |  #[code text.split(' ')]. Then, the tokenizer processes the text from
    |  left to right. On each substring, it performs two checks:

--- a/website/docs/usage/lightning-tour.jade
+++ b/website/docs/usage/lightning-tour.jade
@ -181,7 +181,7 @@ p
    from spacy.vocab import Vocab

    nlp = spacy.load('en')
-    moby_dick = open('moby_dick.txt', 'r')
+    moby_dick = open('moby_dick.txt', 'r').read()
    doc = nlp(moby_dick)
    doc.to_disk('/moby_dick.bin')

--- a/website/docs/usage/visualizers.jade
+++ b/website/docs/usage/visualizers.jade
@ -4,7 +4,7 @@ include ../../_includes/_mixins

 p
    |  As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy]
-    |  and #[+a(DEMOS_URL + "displacy-ent") displaCy #[sup ENT]] are finally an
+    |  and #[+a(DEMOS_URL + "/displacy-ent") displaCy #[sup ENT]] are finally an
    |  official part of the library. Visualizing a dependency parse or named
    |  entities in a text is not only a fun NLP demo – it can also be incredibly
    |  helpful in speeding up development and debugging your code and training
--- a/website/docs/usage/word-vectors-similarities.jade
+++ b/website/docs/usage/word-vectors-similarities.jade
@ -77,7 +77,7 @@ p

 +code.
    doc1 = nlp(u"Paris is the largest city in France.")
-    doc2 = nlp(u"Ljubljana is the capital of Lithuania.")
+    doc2 = nlp(u"Vilnius is the capital of Lithuania.")
    doc3 = nlp(u"An emu is a large bird.")

    for doc in [doc1, doc2, doc3]:
@ -85,13 +85,13 @@ p
            print(doc.similarity(other_doc))

 p
-    |  Even though the sentences about Paris and Ljubljana consist of different
+    |  Even though the sentences about Paris and Vilnius consist of different
    |  words and entities, they both describe the same concept and are seen as
    |  more similar than the sentence about emus. In this case, even a misspelled
-    |  version of "Ljubljana" would still produce very similar results.
+    |  version of "Vilnius" would still produce very similar results.

 +table
-    - var examples = {"Paris is the largest city in France.": [1, 0.84, 0.65], "Ljubljana is the capital of Lithuania.": [0.84, 1, 0.52], "An emu is a large bird.": [0.65, 0.52, 1]}
+    - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
    - var counter = 0

    +row