* Work on train

2025-11-17 00:06:04 +03:00 · 2014-12-22 07:25:43 +11:00 · 2014-12-22 07:25:43 +11:00 · 4c4aa2c5c9
commit 4c4aa2c5c9
parent 4d4d2c0db4
13 changed files with 214 additions and 128 deletions
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@ -0,0 +1,44 @@
 from __future__ import unicode_literals
 from os import path
 from .. import orth
 from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..syntax.parser import GreedyParser
 from ..tokens import Tokens
 from ..morphology import Morphologizer
 from .lemmatizer import Lemmatizer
 from .pos import EnPosTagger
 from .attrs import get_flags
 def get_lex_props(string):
    """Return the lexeme property dict for ``string``.

    The dict carries the flags computed by ``get_flags`` under 'flags' and
    the constant 1 under 'dense'.
    """
    props = {'flags': get_flags(string)}
    props['dense'] = 1
    return props
 class English(object):
    """English pipeline: a vocab and tokenizer plus an optional tagger and parser.

    Args:
        data_dir: Directory the components are loaded from. Defaults to the
            'data' directory next to this module.
        pos_tag: Load the POS tagger (EnPosTagger) when true.
        parse: Load the parser (GreedyParser) when true.
    """
    def __init__(self, data_dir=None, pos_tag=True, parse=False):
        if data_dir is None:
            data_dir = path.join(path.dirname(__file__), 'data')
        self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)
        self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
        # Components that are switched off stay None so __call__ can skip them.
        self.pos_tagger = None
        if pos_tag:
            lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'))
            morphologizer = Morphologizer.from_dir(self.vocab.strings,
                                                   lemmatizer, data_dir)
            self.pos_tagger = EnPosTagger(data_dir, morphologizer)
        self.parser = None
        if parse:
            self.parser = GreedyParser(data_dir)
    def __call__(self, text, pos_tag=True, parse=True):
        """Tokenize ``text`` and apply every component that is loaded and enabled.

        Args:
            text: The string to tokenize.
            pos_tag: Run the tagger on the tokens, if one was loaded.
            parse: Run the parser on the tokens, if one was loaded.

        Returns:
            The object returned by the tokenizer, after the enabled
            components have been applied to it.
        """
        tokens = self.tokenizer.tokenize(text)
        tagger = self.pos_tagger
        if tagger and pos_tag:
            tagger(tokens)
        parser = self.parser
        if parser and parse:
            parser.parse(tokens)
        return tokens
--- a/spacy/en/attrs.pxd
+++ b/spacy/en/attrs.pxd
@ -1,13 +1,13 @@
-from ..lexeme cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
+from ..typedefs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
-from ..lexeme cimport FLAG8, FLAG9
+from ..typedefs cimport FLAG8, FLAG9
-from ..lexeme cimport ID as _ID
+from ..typedefs cimport ID as _ID
-from ..lexeme cimport SIC as _SIC
+from ..typedefs cimport SIC as _SIC
-from ..lexeme cimport SHAPE as _SHAPE
+from ..typedefs cimport SHAPE as _SHAPE
-from ..lexeme cimport DENSE as _DENSE
+from ..typedefs cimport DENSE as _DENSE
-from ..lexeme cimport SHAPE as _SHAPE
+from ..typedefs cimport SHAPE as _SHAPE
-from ..lexeme cimport PREFIX as _PREFIX
+from ..typedefs cimport PREFIX as _PREFIX
-from ..lexeme cimport SUFFIX as _SUFFIX
+from ..typedefs cimport SUFFIX as _SUFFIX
-from ..lexeme cimport LEMMA as _LEMMA
+from ..typedefs cimport LEMMA as _LEMMA
 # Work around the lack of global cpdef variables
--- a/spacy/en/pos.pxd
+++ b/spacy/en/pos.pxd
@ -3,4 +3,4 @@ from ..morphology cimport Morphologizer
 cdef class EnPosTagger(Tagger):
-    cdef Morphologizer morphologizer
+    cdef readonly Morphologizer morphologizer
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -1,88 +1,9 @@
-from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
+from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
 from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
 from .structs cimport Lexeme
 from .strings cimport StringStore
 # Reserve 64 values for flag features
 cpdef enum attr_id_t:
    FLAG0
    FLAG1
    FLAG2
    FLAG3
    FLAG4
    FLAG5
    FLAG6
    FLAG7
    FLAG8
    FLAG9
    FLAG10
    FLAG11
    FLAG12
    FLAG13
    FLAG14
    FLAG15
    FLAG16
    FLAG17
    FLAG18
    FLAG19
    FLAG20
    FLAG21
    FLAG22
    FLAG23
    FLAG24
    FLAG25
    FLAG26
    FLAG27
    FLAG28
    FLAG29
    FLAG30
    FLAG31
    FLAG32
    FLAG33
    FLAG34
    FLAG35
    FLAG36
    FLAG37
    FLAG38
    FLAG39
    FLAG40
    FLAG41
    FLAG42
    FLAG43
    FLAG44
    FLAG45
    FLAG46
    FLAG47
    FLAG48
    FLAG49
    FLAG50
    FLAG51
    FLAG52
    FLAG53
    FLAG54
    FLAG55
    FLAG56
    FLAG57
    FLAG58
    FLAG59
    FLAG60
    FLAG61
    FLAG62
    FLAG63
    ID
    SIC
    DENSE
    SHAPE
    PREFIX
    SUFFIX
    LENGTH
    CLUSTER
    POS_TYPE
    LEMMA
 cdef Lexeme EMPTY_LEXEME
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -24,7 +24,6 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
    lex.prefix = string_store[string[:1]]
    lex.suffix = string_store[string[-3:]]
    lex.shape = string_store[orth.word_shape(string)]
    lex.dense = string_store[props['dense']]
    lex.flags = props.get('flags', 0)
    return lex
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -59,9 +59,10 @@ cdef class Morphologizer:
    @classmethod
    def from_dir(cls, StringStore strings, object lemmatizer, data_dir):
-        tag_map = None
+        tagger_cfg = json.loads(open(path.join(data_dir, 'pos', 'config.json')).read())
-        irregulars = None
+        tag_map = tagger_cfg['tag_map']
-        tag_names = None
+        tag_names = tagger_cfg['tag_names']
        irregulars = json.loads(open(path.join(data_dir, 'morphs.json')).read())
        return cls(strings, lemmatizer, tag_map=tag_map, irregulars=irregulars,
                   tag_names=tag_names)
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@ -11,6 +11,11 @@ cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end)
    s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)
 cdef class _SymbolMap:
    cdef dict _string_to_id
    cdef list _id_to_string
 cdef class StringStore:
    cdef Pool mem
    cdef Utf8Str* strings
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -9,13 +9,42 @@ from .typedefs cimport hash_t
 SEPARATOR = '\n|-SEP-|\n'
 cdef class _SymbolMap:
    """Two-way table between symbol strings and small integer ids.

    Id 0 is reserved for the empty string. A string that has not been seen
    before is assigned the next free id the first time it is looked up.
    """
    def __init__(self):
        self._string_to_id = {'': 0}
        self._id_to_string = ['']
    def __iter__(self):
        """Yield (string, index) pairs for every entry except the reserved ''."""
        # NOTE(review): enumerate() restarts at 0 after the [1:] slice, so the
        # index yielded here is one less than the id __getitem__ returns for
        # the same string. Left unchanged; confirm what callers expect.
        for id_, string in enumerate(self._id_to_string[1:]):
            yield string, id_
    def __getitem__(self, object string_or_id):
        """Map an id to its string, or a string to its id.

        Args:
            string_or_id: An integer id, or a byte/unicode string. Unicode
                input is encoded as UTF-8 before it is stored or looked up.

        Returns:
            For an integer, the stored (encoded) string. For a string, its
            integer id; unseen strings are added to the table first.

        Raises:
            IndexError: If an integer id is below 1 or not yet assigned.
        """
        if isinstance(string_or_id, (int, long)):
            # The class only declares _string_to_id and _id_to_string, so the
            # bound and the lookup both go through the id list.
            if string_or_id < 1 or string_or_id >= len(self._id_to_string):
                raise IndexError(string_or_id)
            return self._id_to_string[string_or_id]
        string = string_or_id
        if isinstance(string, unicode):
            string = string.encode('utf8')
        if string in self._string_to_id:
            return self._string_to_id[string]
        id_ = len(self._string_to_id)
        self._string_to_id[string] = id_
        self._id_to_string.append(string)
        return id_
 cdef class StringStore:
    def __init__(self):
        self.mem = Pool()
        self._map = PreshMap()
        self._resize_at = 10000
        self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
-        self.size = 1
+        self.pos_tags = _SymbolMap()
        self.dep_tags = _SymbolMap()
    property size:
        def __get__(self):
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@ -1,11 +1,10 @@
 from libc.stdint cimport uint32_t, uint64_t
 from thinc.features cimport Extractor
 from thinc.learner cimport LinearModel
 from .arc_eager cimport TransitionSystem
-from ..tokens cimport Tokens, TokenC
+from ..structs cimport TokenC
-from ._state cimport State
+from ..tokens cimport Tokens
 cdef class GreedyParser:
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -3,6 +3,7 @@
 from __future__ import unicode_literals
 from os import path
 import re
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
@ -27,7 +28,7 @@ cdef class Tokenizer:
        self._prefix_re = prefix_re
        self._suffix_re = suffix_re
        self._infix_re = infix_re
-        self.vocab = Vocab(self.get_props)
+        self.vocab = vocab
        self._load_special_tokenization(rules)
    @classmethod
@ -39,11 +40,12 @@ cdef class Tokenizer:
        assert path.exists(data_dir) and path.isdir(data_dir)
        rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir)
-        return cls(vocab, rules, prefix_re, suffix_re, infix_re)
+        return cls(vocab, rules, re.compile(prefix_re), re.compile(suffix_re),
                   re.compile(infix_re))
    cpdef Tokens tokens_from_list(self, list strings):
        cdef int length = sum([len(s) for s in strings])
-        cdef Tokens tokens = Tokens(self.vocab.strings, length)
+        cdef Tokens tokens = Tokens(self.vocab, length)
        if length == 0:
            return tokens
        cdef UniStr string_struct
@ -76,7 +78,7 @@ cdef class Tokenizer:
            tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
        """
        cdef int length = len(string)
-        cdef Tokens tokens = Tokens(self.vocab.strings, length)
+        cdef Tokens tokens = Tokens(self.vocab, length)
        if length == 0:
            return tokens
        cdef int i = 0
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@ -4,11 +4,11 @@ import numpy as np
 cimport numpy as np
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t
-from .structs cimport Lexeme, TokenC, Morphology
+from .typedefs cimport flags_t
-
+from .structs cimport Morphology, TokenC, Lexeme
-from .typedefs cimport flags_t, attr_t, flags_t
+from .vocab cimport Vocab
 from .strings cimport StringStore
@ -22,7 +22,7 @@ ctypedef fused LexemeOrToken:
 cdef class Tokens:
    cdef Pool mem
-    cdef StringStore strings
+    cdef Vocab vocab
    cdef list tag_names
    cdef TokenC* data
@ -36,7 +36,7 @@ cdef class Tokens:
 cdef class Token:
-    cdef public StringStore strings
+    cdef readonly StringStore string_store
    cdef public int i
    cdef public int idx
    cdef int pos
@ -44,18 +44,18 @@ cdef class Token:
    cdef public int head
    cdef public int dep_tag
-    cdef public attr_t id
+    cdef public atom_t id
-    cdef public attr_t cluster
+    cdef public atom_t cluster
-    cdef public attr_t length
+    cdef public atom_t length
-    cdef public attr_t postype
+    cdef public atom_t postype
-    cdef public attr_t sensetype
+    cdef public atom_t sensetype
-    cdef public attr_t sic
+    cdef public atom_t sic
-    cdef public attr_t norm
+    cdef public atom_t norm
-    cdef public attr_t shape
+    cdef public atom_t shape
-    cdef public attr_t asciied
+    cdef public atom_t asciied
-    cdef public attr_t prefix
+    cdef public atom_t prefix
-    cdef public attr_t suffix
+    cdef public atom_t suffix
    cdef public float prob
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -2,7 +2,9 @@
 from preshed.maps cimport PreshMap
 from preshed.counter cimport PreshCounter
-from .lexeme cimport get_attr, EMPTY_LEXEME, LEMMA, attr_id_t
+from .lexeme cimport get_attr, EMPTY_LEXEME
 from .typedefs cimport attr_id_t, attr_t
 from .typedefs cimport LEMMA
 cimport cython
 import numpy as np
@ -30,8 +32,8 @@ cdef class Tokens:
    >>> from spacy.en import EN
    >>> tokens = EN.tokenize('An example sentence.')
    """
-    def __init__(self, StringStore string_store, string_length=0):
+    def __init__(self, Vocab vocab, string_length=0):
-        self.string_store = string_store
+        self.vocab = vocab
        if string_length >= 3:
            size = int(string_length / 3.0)
        else:
@ -50,7 +52,7 @@ cdef class Tokens:
    def __getitem__(self, i):
        bounds_check(i, self.length, PADDING)
-        return Token(self.string_store, i, self.data[i].idx, self.data[i].pos,
+        return Token(self.vocab.strings, i, self.data[i].idx, self.data[i].pos,
                     self.data[i].lemma, self.data[i].head, self.data[i].dep_tag,
                     self.data[i].lex[0])
@ -119,10 +121,10 @@ cdef class Token:
                 int pos, int lemma, int head, int dep_tag, dict lex):
        self.string_store = string_store
        self.idx = idx
-        self.pos = pos
+        self.pos_id = pos
        self.i = i
        self.head = head
-        self.dep_tag = dep_tag
+        self.dep_id = dep_tag
        self.id = lex['id']
        self.lemma = lemma
@ -154,6 +156,9 @@ cdef class Token:
            cdef bytes utf8string = self.string_store[self.lemma]
            return utf8string.decode('utf8')
    property dep:
        def __get__(self):
            # Look the label up by the integer id that __init__ stores as
            # dep_id. Reading self.dep here would re-enter this getter and
            # recurse without end.
            return self.string_store.dep_tags[self.dep_id]
    property pos:
        def __get__(self):
-            return self.lang.pos_tagger.tag_names[self.pos]
+            return self.string_store.pos_tags[self.pos]
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@ -21,6 +21,87 @@ cpdef enum univ_tag_t:
    N_UNIV_TAGS
 # Reserve 64 values for flag features: FLAG0..FLAG63 are the first 64
 # members of the enum, and the named attribute ids follow them.
 cpdef enum attr_id_t:
    FLAG0
    FLAG1
    FLAG2
    FLAG3
    FLAG4
    FLAG5
    FLAG6
    FLAG7
    FLAG8
    FLAG9
    FLAG10
    FLAG11
    FLAG12
    FLAG13
    FLAG14
    FLAG15
    FLAG16
    FLAG17
    FLAG18
    FLAG19
    FLAG20
    FLAG21
    FLAG22
    FLAG23
    FLAG24
    FLAG25
    FLAG26
    FLAG27
    FLAG28
    FLAG29
    FLAG30
    FLAG31
    FLAG32
    FLAG33
    FLAG34
    FLAG35
    FLAG36
    FLAG37
    FLAG38
    FLAG39
    FLAG40
    FLAG41
    FLAG42
    FLAG43
    FLAG44
    FLAG45
    FLAG46
    FLAG47
    FLAG48
    FLAG49
    FLAG50
    FLAG51
    FLAG52
    FLAG53
    FLAG54
    FLAG55
    FLAG56
    FLAG57
    FLAG58
    FLAG59
    FLAG60
    FLAG61
    FLAG62
    FLAG63
    # Named (non-flag) attribute ids start here, after the reserved block.
    ID
    SIC
    DENSE
    SHAPE
    PREFIX
    SUFFIX
    LENGTH
    CLUSTER
    POS_TYPE
    LEMMA
 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
 ctypedef uint32_t attr_t
@ -3,4 +3,4 @@ from ..morphology cimport Morphologizer
 cdef class EnPosTagger(Tagger):
-    cdef Morphologizer morphologizer
+    cdef readonly Morphologizer morphologizer