Merge pull request #306 from wbwseeker/german_noun_chunks

add German noun chunk functionality
2025-12-08 02:34:17 +03:00 · 2016-04-08 00:54:24 +10:00 · 2016-04-08 00:54:24 +10:00 · 872695759d
commit 872695759d
parent c628908479 d65ef41d08
15 changed files with 228 additions and 162 deletions
--- a/setup.py
+++ b/setup.py
@ -63,7 +63,8 @@ MOD_NAMES = [
    'spacy.cfile',
    'spacy.matcher',
    'spacy.syntax.ner',
-    'spacy.symbols']
+    'spacy.symbols',
+    'spacy.syntax.iterators']


 # By subclassing build_extensions we have the actual compiler that will be used
@ -213,3 +214,4 @@ def setup_package():

 if __name__ == '__main__':
    setup_package()
+
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@ -14,12 +14,12 @@ cpdef enum attr_id_t:
    LIKE_EMAIL
    IS_STOP
    IS_OOV
-   
-    FLAG14 = 14
-    FLAG15
-    FLAG16
-    FLAG17
-    FLAG18
+    IS_BRACKET
+    IS_QUOTE
+    IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT
+
+    FLAG18 = 18
    FLAG19
    FLAG20
    FLAG21
@ -85,11 +85,7 @@ cpdef enum attr_id_t:
    HEAD
    SPACY
    PROB
+
+    LANG
    
-# Move these up to FLAG14--FLAG18 once we finish the functionality and
-# are ready to regenerate the model
-#IS_BRACKET
-#IS_QUOTE
-#IS_LEFT_PUNCT
-#IS_RIGHT_PUNCT
 
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -13,10 +13,10 @@ IDS = {
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV": IS_OOV,
-    "FLAG14": FLAG14,
-    "FLAG15": FLAG15,
-    "FLAG16": FLAG16,
-    "FLAG17": FLAG17,
+    "IS_BRACKET": IS_BRACKET,
+    "IS_QUOTE": IS_QUOTE,
+    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
+    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
    "FLAG18": FLAG18,
    "FLAG19": FLAG19,
    "FLAG20": FLAG20,
@ -83,6 +83,7 @@ IDS = {
    "HEAD": HEAD,
    "SPACY": SPACY,
    "PROB": PROB,
+    "LANG": LANG,
 }

 # ATTR IDs, in order of the symbol
--- a/spacy/language.py
+++ b/spacy/language.py
@ -33,10 +33,6 @@ class Language(object):
    @staticmethod
    def norm(string):
        return string
-    
-    @staticmethod
-    def shape(string):
-        return orth.word_shape(string)

    @staticmethod
    def prefix(string):
@ -50,66 +46,14 @@ class Language(object):
    def cluster(string):
        return 0

-    @staticmethod
-    def is_alpha(string):
-        return orth.is_alpha(string)
-
-    @staticmethod
-    def is_ascii(string):
-        return orth.is_ascii(string)
-
    @staticmethod
    def is_digit(string):
        return string.isdigit()

-    @staticmethod
-    def is_lower(string):
-        return orth.is_lower(string)
-
-    @staticmethod
-    def is_punct(string):
-        return orth.is_punct(string)
-
    @staticmethod
    def is_space(string):
        return string.isspace()

-    @staticmethod
-    def is_title(string):
-        return orth.is_title(string)
-
-    @staticmethod
-    def is_bracket(string):
-        return orth.is_bracket(string)
-
-    @staticmethod
-    def is_quote(string):
-        return orth.is_quote(string)
-
-    @staticmethod
-    def is_left_punct(string):
-        return orth.is_left_punct(string)
-
-    @staticmethod
-    def is_right_punct(string):
-        return orth.is_right_punct(string)
-
-    @staticmethod
-    def is_upper(string):
-        return orth.is_upper(string)
-
-    @staticmethod
-    def like_url(string):
-        return orth.like_url(string)
-
-    @staticmethod
-    def like_num(string):
-        return orth.like_number(string)
-
-    @staticmethod
-    def like_email(string):
-        return orth.like_email(string)
-
    @staticmethod
    def is_stop(string):
        return 0
@ -120,26 +64,27 @@ class Language(object):
        return {
            attrs.LOWER: cls.lower,
            attrs.NORM: cls.norm,
-            attrs.SHAPE: cls.shape,
+            attrs.SHAPE: orth.word_shape,
            attrs.PREFIX: cls.prefix,
            attrs.SUFFIX: cls.suffix,
            attrs.CLUSTER: cls.cluster,
            attrs.PROB: lambda string: oov_prob,
-            attrs.IS_ALPHA: cls.is_alpha,
-            attrs.IS_ASCII: cls.is_ascii,
+            attrs.LANG: lambda string: cls.lang,
+            attrs.IS_ALPHA: orth.is_alpha,
+            attrs.IS_ASCII: orth.is_ascii,
            attrs.IS_DIGIT: cls.is_digit,
-            attrs.IS_LOWER: cls.is_lower,
-            attrs.IS_PUNCT: cls.is_punct,
+            attrs.IS_LOWER: orth.is_lower,
+            attrs.IS_PUNCT: orth.is_punct,
            attrs.IS_SPACE: cls.is_space,
-            attrs.IS_TITLE: cls.is_title,
-            attrs.IS_UPPER: cls.is_upper,
-            attrs.FLAG14: cls.is_bracket,
-            attrs.FLAG15: cls.is_quote,
-            attrs.FLAG16: cls.is_left_punct,
-            attrs.FLAG17: cls.is_right_punct,
-            attrs.LIKE_URL: cls.like_url,
-            attrs.LIKE_NUM: cls.like_num,
-            attrs.LIKE_EMAIL: cls.like_email,
+            attrs.IS_TITLE: orth.is_title,
+            attrs.IS_UPPER: orth.is_upper,
+            attrs.IS_BRACKET: orth.is_bracket,
+            attrs.IS_QUOTE: orth.is_quote,
+            attrs.IS_LEFT_PUNCT: orth.is_left_punct,
+            attrs.IS_RIGHT_PUNCT: orth.is_right_punct,
+            attrs.LIKE_URL: orth.like_url,
+            attrs.LIKE_NUM: orth.like_number,
+            attrs.LIKE_EMAIL: orth.like_email,
            attrs.IS_STOP: cls.is_stop,
            attrs.IS_OOV: lambda string: True
        }
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -1,6 +1,6 @@
 from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
 from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG

 from .structs cimport LexemeC
 from .strings cimport StringStore
@ -41,6 +41,8 @@ cdef class Lexeme:
            lex.suffix = value
        elif name == CLUSTER:
            lex.cluster = value
+        elif name == LANG:
+            lex.lang = value

    @staticmethod
    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@ -67,6 +69,8 @@ cdef class Lexeme:
            return lex.length
        elif feat_name == CLUSTER:
            return lex.cluster
+        elif feat_name == LANG:
+            return lex.lang
        else:
            return 0
    
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -18,10 +18,10 @@ import numpy

 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from .attrs cimport FLAG14 as IS_BRACKET
-from .attrs cimport FLAG15 as IS_QUOTE
-from .attrs cimport FLAG16 as IS_LEFT_PUNCT
-from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from .attrs cimport IS_BRACKET
+from .attrs cimport IS_QUOTE
+from .attrs cimport IS_LEFT_PUNCT
+from .attrs cimport IS_RIGHT_PUNCT
 from .attrs cimport IS_OOV


@ -74,8 +74,8 @@ cdef class Lexeme:
                raise ValueError(
                    "Word vectors set to length 0. This may be because the "
                    "data is not installed. If you haven't already, run"
-                    "\npython -m spacy.en.download all\n"
-                    "to install the data."
+                    "\npython -m spacy.%s.download all\n"
+                    "to install the data." % self.vocab.lang
                )
 
            vector_view = <float[:length,]>self.c.vector
@ -123,6 +123,10 @@ cdef class Lexeme:
        def __get__(self): return self.c.cluster
        def __set__(self, int x): self.c.cluster = x
 
+    property lang:
+        # Raw integer value of the `lang` field on the underlying LexemeC.
+        def __get__(self):
+            return self.c.lang
+
+        def __set__(self, int x):
+            self.c.lang = x
+
    property prob:
        def __get__(self): return self.c.prob
        def __set__(self, float x): self.c.prob = x
@ -147,6 +151,10 @@ cdef class Lexeme:
        def __get__(self): return self.vocab.strings[self.c.suffix]
        def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]

+    property lang_:
+        # Unicode view of `lang`: values are converted through the vocab's
+        # string store in both directions.
+        def __get__(self):
+            return self.vocab.strings[self.c.lang]
+
+        def __set__(self, unicode x):
+            self.c.lang = self.vocab.strings[x]
+
    property flags:
        def __get__(self): return self.c.flags
        def __set__(self, flags_t x): self.c.flags = x
--- a/spacy/orth.pyx
+++ b/spacy/orth.pyx
@ -5,9 +5,6 @@ import unicodedata
 import re


-TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()
-
-
 # Binary string features
 cpdef bint is_alpha(unicode string):
    return string.isalpha()
@ -36,20 +33,25 @@ cpdef bint is_ascii(unicode string):
    else:
        return True

+
 cpdef bint is_bracket(unicode string):
-    return False
+    brackets = ('(',')','[',']','{','}','<','>')
+    return string in brackets
+

 cpdef bint is_quote(unicode string):
-    if string in ('"', "'"):
-        return True
-    else:
-        return False
+    quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯',"''",'``')
+    return string in quotes
+

 cpdef bint is_left_punct(unicode string):
-    return False
+    left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮','``')
+    return string in left_punct
+

 cpdef bint is_right_punct(unicode string):
-    return False
+    right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯',"''")
+    return string in right_punct


 cpdef bint is_title(unicode string):
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@ -9,6 +9,8 @@ cdef struct LexemeC:

    flags_t flags

+    attr_t lang
+
    attr_t id
    attr_t length

--- a/spacy/syntax/iterators.pxd
+++ b/spacy/syntax/iterators.pxd
@ -0,0 +1,16 @@
+
+from spacy.tokens.doc cimport Doc
+
+# Declaration of the base extension type shared by all document iterators.
+cdef class DocIterator:
+    # the Doc instance handed to the iterator's constructor
+    cdef Doc _doc
+
+cdef class EnglishNounChunks(DocIterator):
+    # Every attribute assigned in __init__ needs a cdef declaration here:
+    # extension types have no instance __dict__, so an undeclared attribute
+    # cannot be set at runtime.
+    cdef int i            # index of the next token to inspect
+    cdef int _np_label    # string-store ID of the 'NP' span label
+    cdef set _np_deps     # string-store IDs of the NP-heading dependency labels
+    cdef int _conjunct    # string-store ID of the 'conj' dependency label
+
+cdef class GermanNounChunks(DocIterator):
+    # index of the next token to inspect
+    cdef int i
+    # string-store ID of the 'NP' span label
+    cdef int _np_label
+    # string-store IDs of the dependency labels that may head a noun chunk
+    cdef set _np_deps
+    # string-store ID of the 'nk' label used to extend a chunk to the right
+    cdef int _close_app
--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@ -0,0 +1,82 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+from spacy.tokens.doc cimport Doc
+from spacy.tokens.token cimport Token
+
+from spacy.parts_of_speech cimport NOUN
+
+# base class for document iterators
+cdef class DocIterator:
+    """Base class for iterators over a Doc.
+
+    The base class only stores the Doc; subclasses provide `__next__`.
+    """
+    def __init__(self, Doc doc):
+        """Remember the Doc that this iterator walks over."""
+        self._doc = doc
+
+    def __iter__(self):
+        """Return the iterator itself (the object is its own iterator)."""
+        return self
+
+    def __next__(self):
+        """Not implemented in the base class; always raises NotImplementedError."""
+        raise NotImplementedError
+
+
+cdef class EnglishNounChunks(DocIterator):
+    """Iterate over base noun phrases of a parsed Doc as Spans labelled 'NP'.
+
+    A NOUN token produces a span running from its left edge (`l_edge`)
+    through the token itself when either
+      * its dependency label is one of the NP-heading labels, or
+      * it is a 'conj' dependent whose chain of 'conj' heads, followed
+        leftwards, ends in a token carrying one of those labels.
+    """
+    def __init__(self, Doc doc):
+        """Resolve the labels to string-store IDs once and reset the cursor."""
+        super(EnglishNounChunks,self).__init__(doc)
+        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
+        # _conjunct requires a matching cdef attribute declaration in the .pxd
+        self._conjunct = self._doc.vocab.strings['conj']
+        self.i = 0
+
+    def __next__(self):
+        """Return the next noun chunk Span, or raise StopIteration at the end."""
+        cdef const TokenC* word
+        cdef const TokenC* head
+        # typed as a C int so indexing and arithmetic stay at the C level
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            # advance the cursor first so every return path resumes correctly
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN:
+                if word.dep in self._np_deps:
+                    return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+                elif word.dep == self._conjunct:
+                    # `head` fields are offsets relative to the token itself
+                    head = word+word.head
+                    while head.dep == self._conjunct and head.head < 0:
+                        head += head.head
+                    # If the head is an NP, and we're coordinated to it, we're an NP
+                    if head.dep in self._np_deps:
+                        return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+        raise StopIteration
+
+
+# this iterator extracts spans headed by NOUNs starting from the left-most
+# syntactic dependent until the NOUN itself
+# for close apposition and measurement construction, the span is sometimes
+# extended to the right of the NOUN
+# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
+# just "eine Tasse", same for "das Thema Familie"
+cdef class GermanNounChunks(DocIterator):
+    """Iterate over noun chunks of a parsed German Doc as Spans labelled 'NP'.
+
+    A NOUN token whose dependency label is one of the NP-heading labels
+    produces a span that starts at the token's left edge (`l_edge`). The span
+    ends after the token itself, or after its last right dependent that is a
+    NOUN attached with the 'nk' label, if there is one.
+    """
+    def __init__(self, Doc doc):
+        """Resolve the labels to string-store IDs once and reset the cursor."""
+        super(GermanNounChunks,self).__init__(doc)
+        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
+        self._close_app = self._doc.vocab.strings['nk']
+        self.i = 0
+
+    def __next__(self):
+        """Return the next noun chunk Span, or raise StopIteration at the end."""
+        cdef const TokenC* word
+        cdef int rbracket
+        cdef Token rdep
+        # typed as a C int so indexing and arithmetic stay at the C level
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            # advance the cursor first so the return below resumes correctly
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN and word.dep in self._np_deps:
+                rbracket = widx+1
+                # try to extend the span to the right
+                # to capture close apposition/measurement constructions
+                for rdep in self._doc[widx].rights:
+                    if rdep.pos == NOUN and rdep.dep == self._close_app:
+                        rbracket = rdep.i+1
+                return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
+        raise StopIteration
+
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@ -47,6 +47,8 @@ from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC

+from spacy.syntax.iterators cimport DocIterator, EnglishNounChunks, GermanNounChunks
+CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks}


 DEBUG = False
@ -113,12 +115,9 @@ cdef class Parser:
        cdef int nr_feat = self.model.nr_feat
        with nogil:
            self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
-            tokens.is_parsed = True
        # Check for KeyboardInterrupt etc. Untested
        PyErr_CheckSignals()
-        # projectivize output
-        if self._projectivize:
-            PseudoProjectivity.deprojectivize(tokens)
+        self._finalize(tokens)

    def pipe(self, stream, int batch_size=1000, int n_threads=2):
        cdef Pool mem = Pool()
@ -144,7 +143,7 @@ cdef class Parser:
                                raise ValueError("Error parsing doc: %s" % sent_str)
                PyErr_CheckSignals()
                for doc in queue:
-                    doc.is_parsed = True
+                    self._finalize(doc)
                    yield doc
                queue = []
        batch_size = len(queue)
@ -155,10 +154,19 @@ cdef class Parser:
                    with gil:
                        sent_str = queue[i].text
                        raise ValueError("Error parsing doc: %s" % sent_str)
-        for doc in queue:
-            doc.is_parsed = True
-            yield doc
        PyErr_CheckSignals()
+        for doc in queue:
+            self._finalize(doc)
+            yield doc
+
+    def _finalize(self, Doc doc):
+        """Post-process a Doc after its tokens have been parsed.
+
+        Deprojectivizes the parse when `self._projectivize` is set, selects
+        the noun chunk iterator class registered in CHUNKERS for
+        `doc.vocab.lang` (falling back to the base `DocIterator` when the
+        language has no entry), and finally flags the doc as parsed.
+        """
+        # deprojectivize output
+        if self._projectivize:
+            PseudoProjectivity.deprojectivize(doc)
+        # set annotation-specific iterators
+        # (a class is assigned here, not an instance)
+        doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator)
+        # mark doc as parsed
+        doc.is_parsed = True

    cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
        cdef ExampleC eg
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@ -7,6 +7,8 @@ from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t

+from spacy.syntax.iterators cimport DocIterator
+

 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil

@ -42,6 +44,8 @@ cdef class Doc:
    cdef int length
    cdef int max_length

+    cdef DocIterator noun_chunks_iterator
+
    cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1

    cpdef np.ndarray to_array(self, object features)
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -8,6 +8,7 @@ import struct
 cimport numpy as np
 import math
 import six
+import warnings

 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
@ -80,6 +81,7 @@ cdef class Doc:
        self.is_parsed = False
        self._py_tokens = []
        self._vector = None
+        self.noun_chunks_iterator = DocIterator(self)

    def __getitem__(self, object i):
        """Get a Token or a Span from the Doc.
@ -230,33 +232,22 @@ cdef class Doc:
                    # Set start as B
                    self.c[start].ent_iob = 3

-    @property
-    def noun_chunks(self):
-        """Yield spans for base noun phrases."""
-        if not self.is_parsed:
-            raise ValueError(
-                "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.en.download all\n"
-                "to install the data")
- 
-        cdef const TokenC* word
-        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
-                  'attr', 'root']
-        np_deps = [self.vocab.strings[label] for label in labels]
-        conj = self.vocab.strings['conj']
-        np_label = self.vocab.strings['NP']
-        for i in range(self.length):
-            word = &self.c[i]
-            if word.pos == NOUN and word.dep in np_deps:
-                yield Span(self, word.l_edge, i+1, label=np_label)
-            elif word.pos == NOUN and word.dep == conj:
-                head = word+word.head
-                while head.dep == conj and head.head < 0:
-                    head += head.head
-                # If the head is an NP, and we're coordinated to it, we're an NP
-                if head.dep in np_deps:
-                    yield Span(self, word.l_edge, i+1, label=np_label)
+
+    property noun_chunks:
+        def __get__(self):
+            """Yield spans for base noun phrases.
+
+            Raises ValueError (on first iteration) if the document has not
+            been parsed. Every access iterates from the first token again.
+            """
+            if not self.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.%s.download all\n"
+                    "to install the data" % self.vocab.lang)
+
+            # The stored iterator keeps a cursor that is never rewound and its
+            # __iter__ returns itself, so reusing it would yield nothing on a
+            # second pass. Walk a fresh instance of the same class instead.
+            yield from type(self.noun_chunks_iterator)(self)
+
+        def __set__(self, iterator_class):
+            # Receives a DocIterator (sub)class, not an instance; it is
+            # instantiated for this document. The parameter is deliberately not
+            # named `DocIterator`, which would shadow the cimported type.
+            self.noun_chunks_iterator = iterator_class(self)
+

    @property
    def sents(self):
@ -267,8 +258,8 @@ cdef class Doc:
            raise ValueError(
                "sentence boundary detection requires the dependency parse, which "
                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.en.download all\n"
-                "to install the data")
+                "\npython -m spacy.%s.download all\n"
+                "to install the data" % self.vocab.lang)
        cdef int i
        start = 0
        for i in range(1, self.length):
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -18,10 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CONJ, PUNCT

 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport FLAG14 as IS_BRACKET
-from ..attrs cimport FLAG15 as IS_QUOTE
-from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
-from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from ..attrs cimport IS_BRACKET
+from ..attrs cimport IS_QUOTE
+from ..attrs cimport IS_LEFT_PUNCT
+from ..attrs cimport IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV

@ -95,6 +95,10 @@ cdef class Token:
        def __get__(self):
            return self.c.lex.prob

+    property lang:
+        # Read-only here: the `lang` field of this token's lexeme.
+        def __get__(self):
+            return self.c.lex.lang
+
    property idx:
        def __get__(self):
            return self.c.idx
@ -161,8 +165,8 @@ cdef class Token:
                raise ValueError(
                    "Word vectors set to length 0. This may be because the "
                    "data is not installed. If you haven't already, run"
-                    "\npython -m spacy.en.download all\n"
-                    "to install the data."
+                    "\npython -m spacy.%s.download all\n"
+                    "to install the data." % self.vocab.lang
                )
            vector_view = <float[:length,]>self.c.lex.vector
            return numpy.asarray(vector_view)
@ -177,23 +181,11 @@ cdef class Token:

    property n_lefts:
        def __get__(self):
-            cdef int n = 0
-            cdef const TokenC* ptr = self.c - self.i
-            while ptr != self.c:
-                if ptr + ptr.head == self.c:
-                    n += 1
-                ptr += 1
-            return n
+            return self.c.l_kids

    property n_rights:
        def __get__(self):
-            cdef int n = 0
-            cdef const TokenC* ptr = self.c + (self.array_len - self.i)
-            while ptr != self.c:
-                if ptr + ptr.head == self.c:
-                    n += 1
-                ptr -= 1
-            return n
+            return self.c.r_kids

    property lefts:
        def __get__(self):
@ -415,6 +407,10 @@ cdef class Token:
        def __get__(self):
            return self.vocab.strings[self.c.lex.suffix]

+    property lang_:
+        # Read-only here: the lexeme's `lang` value looked up in the vocab's
+        # string store.
+        def __get__(self):
+            return self.vocab.strings[self.c.lex.lang]
+
    property lemma_:
        def __get__(self):
            return self.vocab.strings[self.c.lemma]
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -26,7 +26,7 @@ from . import symbols

 from cymem.cymem cimport Address
 from .serialize.packer cimport Packer
-from .attrs cimport PROB
+from .attrs cimport PROB, LANG

 try:
    import copy_reg
@ -104,6 +104,13 @@ cdef class Vocab:
                self._serializer = Packer(self, self.serializer_freqs)
            return self._serializer

+    property lang:
+        def __get__(self):
+            """Return the value of the LANG lexical attribute getter.
+
+            The getter registered under LANG in `get_lex_attr` is called with
+            the placeholder string '_'. An empty string is returned when no
+            getters are configured or none is registered for LANG.
+            """
+            getters = self.get_lex_attr
+            if not getters:
+                return ''
+            lang_getter = getters.get(LANG)
+            if not lang_getter:
+                return ''
+            return lang_getter('_')
+
    def __len__(self):
        """The current number of lexemes stored."""
        return self.length
@ -245,6 +252,7 @@ cdef class Vocab:
            fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1)
            fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1)
            fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
+            fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
        fp.close()

    def load_lexemes(self, loc):
@ -277,6 +285,7 @@ cdef class Vocab:
            fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
            fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
            fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
+            fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))

            lexeme.vector = EMPTY_VEC
            py_str = self.strings[lexeme.orth]