introduce lang field for LexemeC to hold language id

put noun_chunk logic into iterators.py for each language separately
Wolfgang Seeker 2016-03-10 13:01:34 +01:00
parent bc9c62e279
commit 03fb498dbe
16 changed files with 103 additions and 70 deletions

View File

@ -109,7 +109,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
else:
file_ = loc.open()
for i, line in enumerate(file_):
freq, doc_freq, key = line.split('\t', 2)
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq = int(freq)
counts.inc(i+1, freq)
total += freq
@ -121,7 +121,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
file_ = loc.open()
probs = {}
for line in file_:
freq, doc_freq, key = line.split('\t', 2)
freq, doc_freq, key = line.rstrip().split('\t', 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
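The added rstrip() matters because split('\t', 2) splits at most twice, so without it the trailing newline stays attached to key and corrupts every string lookup. A minimal illustration (the values are made up):

line = '1028\t37\tthe\n'
freq, doc_freq, key = line.split('\t', 2)           # key == 'the\n'
freq, doc_freq, key = line.rstrip().split('\t', 2)  # key == 'the'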

View File

@ -56,14 +56,15 @@ MOD_NAMES = [
'spacy.tokens.doc',
'spacy.tokens.span',
'spacy.tokens.token',
'spacy.tokens.npchunks',
'spacy.serialize.packer',
'spacy.serialize.huffman',
'spacy.serialize.bits',
'spacy.cfile',
'spacy.matcher',
'spacy.syntax.ner',
'spacy.symbols']
'spacy.symbols',
'spacy.en.iterators',
'spacy.de.iterators']
# By subclassing build_extensions we have the actual compiler that will be used

View File

@ -14,12 +14,12 @@ cpdef enum attr_id_t:
LIKE_EMAIL
IS_STOP
IS_OOV
IS_BRACKET
IS_QUOTE
IS_LEFT_PUNCT
IS_RIGHT_PUNCT
FLAG14 = 14
FLAG15
FLAG16
FLAG17
FLAG18
FLAG18 = 18
FLAG19
FLAG20
FLAG21
@ -86,10 +86,6 @@ cpdef enum attr_id_t:
SPACY
PROB
# Move these up to FLAG14--FLAG18 once we finish the functionality and
# are ready to regenerate the model
#IS_BRACKET
#IS_QUOTE
#IS_LEFT_PUNCT
#IS_RIGHT_PUNCT
LANG

View File

@ -13,10 +13,10 @@ IDS = {
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV": IS_OOV,
"FLAG14": FLAG14,
"FLAG15": FLAG15,
"FLAG16": FLAG16,
"FLAG17": FLAG17,
"IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"FLAG18": FLAG18,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
@ -83,6 +83,7 @@ IDS = {
"HEAD": HEAD,
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,
}
# ATTR IDs, in order of the symbol
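With the flags promoted to named attributes, the table above lets string names resolve to the same ids as the cimported symbols. A small sketch using the IDS mapping this module defines (import path assumed from the package layout):

from spacy.attrs import IDS
bracket_id = IDS['IS_BRACKET']  # same id as the IS_BRACKET symbol
lang_id = IDS['LANG']           # the newly registered LANG attribute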

View File

@ -1,31 +1,9 @@
from spacy.structs cimport TokenC
from spacy.tokens.span cimport Span
from ..structs cimport TokenC
from .doc cimport Doc
from .span cimport Span
from spacy.parts_of_speech cimport NOUN
from ..parts_of_speech cimport NOUN, PROPN, PRON
def english(Span sent):
cdef const TokenC* word
strings = sent.doc.vocab.strings
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
np_deps = [strings[label] for label in labels]
conj = strings['conj']
np_label = strings['NP']
for i in range(sent.start, sent.end):
word = &sent.doc.c[i]
if word.pos == NOUN and word.dep in np_deps:
yield Span(sent.doc, word.l_edge, i+1, label=np_label)
elif word.pos == NOUN and word.dep == conj:
head = word+word.head
while head.dep == conj and head.head < 0:
head += head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
yield Span(sent.doc, word.l_edge, i+1, label=np_label)
def german(Span sent):
def noun_chunks(Span sent):
# this function extracts spans headed by NOUNs starting from the left-most
# syntactic dependent until the NOUN itself
# for close apposition and measurement construction, the span is sometimes
@ -48,7 +26,3 @@ def german(Span sent):
if rdep.pos == NOUN and rdep.dep == close_app:
rbracket = rdep.i+1
yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
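To make the rbracket extension concrete: in a measurement construction such as "eine Tasse Kaffee" ("a cup of coffee"), the chunk headed by "Tasse" is extended rightwards over the close-apposition dependent, so the yielded span is "eine Tasse Kaffee" rather than just "eine Tasse" (illustrative; the exact spans depend on the parser's annotation scheme).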

spacy/en/iterators.pxd (new file, 0 lines)
View File

spacy/en/iterators.pyx (new file, 24 lines)
View File

@ -0,0 +1,24 @@
from spacy.structs cimport TokenC
from spacy.tokens.span cimport Span
from spacy.parts_of_speech cimport NOUN
def noun_chunks(Span sent):
cdef const TokenC* word
strings = sent.doc.vocab.strings
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
np_deps = [strings[label] for label in labels]
conj = strings['conj']
np_label = strings['NP']
for i in range(sent.start, sent.end):
word = &sent.doc.c[i]
if word.pos == NOUN and word.dep in np_deps:
yield Span(sent.doc, word.l_edge, i+1, label=np_label)
elif word.pos == NOUN and word.dep == conj:
head = word+word.head
while head.dep == conj and head.head < 0:
head += head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
yield Span(sent.doc, word.l_edge, i+1, label=np_label)
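The conj chain here is climbed with pointer arithmetic: TokenC stores the head as a relative offset, so word + word.head addresses the head's struct, and head.head < 0 means that head itself attaches leftwards. A pure-Python analogue over a hypothetical parse, to make the walk concrete:

# deps/heads are hypothetical; heads[i] is the relative offset from token i to its head
deps  = ['det', 'ROOT', 'conj', 'conj']
heads = [1, 0, -1, -1]
i = 3
head = i + heads[i]                              # mirrors `word + word.head`
while deps[head] == 'conj' and heads[head] < 0:
    head += heads[head]                          # keep climbing leftwards
# head is now 1, the first non-conj ancestor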

View File

@ -69,6 +69,7 @@ class Language(object):
attrs.SUFFIX: cls.suffix,
attrs.CLUSTER: cls.cluster,
attrs.PROB: lambda string: oov_prob,
attrs.LANG: lambda string: cls.lang,
attrs.IS_ALPHA: orth.is_alpha,
attrs.IS_ASCII: orth.is_ascii,
attrs.IS_DIGIT: cls.is_digit,
@ -77,10 +78,10 @@ class Language(object):
attrs.IS_SPACE: cls.is_space,
attrs.IS_TITLE: orth.is_title,
attrs.IS_UPPER: orth.is_upper,
attrs.FLAG14: orth.is_bracket,
attrs.FLAG15: orth.is_quote,
attrs.FLAG16: orth.is_left_punct,
attrs.FLAG17: orth.is_right_punct,
attrs.IS_BRACKET: orth.is_bracket,
attrs.IS_QUOTE: orth.is_quote,
attrs.IS_LEFT_PUNCT: orth.is_left_punct,
attrs.IS_RIGHT_PUNCT: orth.is_right_punct,
attrs.LIKE_URL: orth.like_url,
attrs.LIKE_NUM: orth.like_number,
attrs.LIKE_EMAIL: orth.like_email,
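The LANG getter closes over the class attribute, so every lexeme the vocab creates is stamped with its language id up front. A stripped-down sketch of the pattern (names are illustrative):

get_lang = lambda string: 'en'  # stands in for `lambda string: cls.lang`
get_lang(u'apple')              # 'en', regardless of the string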

View File

@ -1,6 +1,6 @@
from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG
from .structs cimport LexemeC
from .strings cimport StringStore
@ -41,6 +41,8 @@ cdef class Lexeme:
lex.suffix = value
elif name == CLUSTER:
lex.cluster = value
elif name == LANG:
lex.lang = value
@staticmethod
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@ -67,6 +69,8 @@ cdef class Lexeme:
return lex.length
elif feat_name == CLUSTER:
return lex.cluster
elif feat_name == LANG:
return lex.lang
else:
return 0

View File

@ -18,10 +18,10 @@ import numpy
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport FLAG14 as IS_BRACKET
from .attrs cimport FLAG15 as IS_QUOTE
from .attrs cimport FLAG16 as IS_LEFT_PUNCT
from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
from .attrs cimport IS_BRACKET
from .attrs cimport IS_QUOTE
from .attrs cimport IS_LEFT_PUNCT
from .attrs cimport IS_RIGHT_PUNCT
from .attrs cimport IS_OOV
@ -123,6 +123,10 @@ cdef class Lexeme:
def __get__(self): return self.c.cluster
def __set__(self, int x): self.c.cluster = x
property lang:
def __get__(self): return self.c.lang
def __set__(self, int x): self.c.lang = x
property prob:
def __get__(self): return self.c.prob
def __set__(self, float x): self.c.prob = x
@ -147,6 +151,10 @@ cdef class Lexeme:
def __get__(self): return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
property lang_:
def __get__(self): return self.vocab.strings[self.c.lang]
def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]
property flags:
def __get__(self): return self.c.flags
def __set__(self, flags_t x): self.c.flags = x

View File

@ -40,17 +40,17 @@ cpdef bint is_bracket(unicode string):
cpdef bint is_quote(unicode string):
quotes = ('"',"'",'`','«','»','','','','','','','','','','','','')
quotes = ('"',"'",'`','«','»','','','','','','','','','','','','',"''",'``')
return string in quotes
cpdef bint is_left_punct(unicode string):
left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮')
left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮','``')
return string in left_punct
cpdef bint is_right_punct(unicode string):
right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯')
right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯',"''")
return string in right_punct

View File

@ -9,6 +9,8 @@ cdef struct LexemeC:
flags_t flags
attr_t lang
attr_t id
attr_t length

View File

@ -8,6 +8,7 @@ import struct
cimport numpy as np
import math
import six
import warnings
from ..lexeme cimport Lexeme
from ..lexeme cimport EMPTY_LEXEME
@ -23,7 +24,6 @@ from .token cimport Token
from ..serialize.bits cimport BitArray
from ..util import normalize_slice
import npchunks
DEF PADDING = 5
@ -241,11 +241,23 @@ cdef class Doc:
"\npython -m spacy.en.download all\n"
"to install the data")
chunk_rules = {'en':npchunks.english, 'de':npchunks.german}
from spacy.en.iterators import noun_chunks as en_noun_chunks
from spacy.de.iterators import noun_chunks as de_noun_chunks
chunk_rules = {'en':en_noun_chunks,
'de':de_noun_chunks,
}
for sent in self.sents:
lang = 'en' # todo: make dependent on language of root token
for chunk in chunk_rules.get(lang)(sent):
lang = sent.root.lang_
chunker = chunk_rules.get(lang, None)
if chunker is None:
warnings.warn("noun_chunks is not available for language %s." % lang)
continue
for chunk in chunker(sent):
yield chunk
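With this dispatch in place, noun_chunks picks a chunker per sentence from the language of the sentence's root token. A hedged usage sketch, assuming the English data is installed:

from spacy.en import English
nlp = English()
doc = nlp(u'The quick brown fox jumped over the lazy dog.')
for chunk in doc.noun_chunks:
    print(chunk)  # e.g. 'The quick brown fox', then 'the lazy dog' (illustrative)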

View File

@ -18,10 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
from ..parts_of_speech cimport CONJ, PUNCT
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport FLAG14 as IS_BRACKET
from ..attrs cimport FLAG15 as IS_QUOTE
from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
from ..attrs cimport IS_BRACKET
from ..attrs cimport IS_QUOTE
from ..attrs cimport IS_LEFT_PUNCT
from ..attrs cimport IS_RIGHT_PUNCT
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport IS_OOV
@ -95,6 +95,10 @@ cdef class Token:
def __get__(self):
return self.c.lex.prob
property lang:
def __get__(self):
return self.c.lex.lang
property idx:
def __get__(self):
return self.c.idx
@ -310,6 +314,10 @@ cdef class Token:
def __get__(self):
return self.vocab.strings[self.c.lex.suffix]
property lang_:
def __get__(self):
return self.vocab.strings[self.c.lex.lang]
property lemma_:
def __get__(self):
return self.vocab.strings[self.c.lemma]

View File

@ -246,6 +246,7 @@ cdef class Vocab:
fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1)
fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1)
fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
fp.close()
def load_lexemes(self, loc):
@ -278,6 +279,7 @@ cdef class Vocab:
fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))
lexeme.vector = EMPTY_VEC
py_str = self.strings[lexeme.orth]
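The writer above and load_lexemes must agree exactly on field order and width, so lang is appended at the same position on both paths; note that previously dumped lexemes.bin files become incompatible, since a reader expecting the extra field would misalign everything that follows. The invariant, illustrated with Python's struct module (not the actual file layout):

import struct
prob, sentiment, l2_norm, lang = -7.5, 0.0, 1.0, 3.0
record = struct.pack('<ffff', prob, sentiment, l2_norm, lang)              # writer
assert struct.unpack('<ffff', record) == (prob, sentiment, l2_norm, lang)  # reader must match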