* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx

This commit is contained in:
Matthew Honnibal 2015-07-13 20:20:58 +02:00
parent 3ea8756c24
commit 6eef0bf9ab
13 changed files with 39 additions and 29 deletions

View File

@ -14,7 +14,7 @@ from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
from ..typedefs cimport id_t
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens cimport Doc
from ..tokens.doc cimport Doc
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max

View File

@ -1,14 +0,0 @@
from .tokens cimport Doc
from .typedefs cimport flags_t, attr_id_t, attr_t
from .parts_of_speech cimport univ_pos_t
from .structs cimport Morphology, TokenC, LexemeC
from .vocab cimport Vocab
from .strings cimport StringStore
cdef class Span:
cdef readonly Doc _seq
cdef public int i
cdef public int start
cdef public int end
cdef readonly int label

View File

@ -12,7 +12,7 @@ from libc.string cimport memset
from itertools import combinations
from ..tokens cimport TokenC
from ..structs cimport TokenC
from .stateclass cimport StateClass

View File

@ -4,7 +4,8 @@ from .._ml cimport Model
from .arc_eager cimport TransitionSystem
from ..tokens cimport Doc, TokenC
from ..tokens.doc cimport Doc
from ..structs cimport TokenC
cdef class Parser:

View File

@ -32,7 +32,9 @@ from thinc.learner cimport LinearModel
from thinc.search cimport Beam
from thinc.search cimport MaxViolation
from ..tokens cimport Doc, TokenC
from ..structs cimport TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore

View File

@ -8,7 +8,7 @@ from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .structs cimport LexemeC, TokenC, Morphology, UniStr
from .strings cimport StringStore
from .tokens cimport Doc
from .tokens.doc cimport Doc
from .vocab cimport Vocab, _Cached

View File

@ -16,7 +16,7 @@ from .morphology cimport set_morph_from_dict
from . import util
from .util import read_lang_data
from .tokens import Doc
from .tokens.doc cimport Doc
cdef class Tokenizer:

View File

@ -1,5 +0,0 @@
from .doc cimport Doc
from .token cimport Token
from .spans cimport Spans
__all__ = [Doc, Token, Spans]

View File

@ -1,5 +1,5 @@
from .doc import Doc
from .token import Token
from .spans import Spans
from .spans import Span
__all__ = [Doc, Token, Spans]
__all__ = [Doc, Token, Span]

View File

@ -31,5 +31,3 @@ cdef class Doc:
cpdef np.ndarray to_array(self, object features)
cdef int set_parse(self, const TokenC* parsed) except -1

spacy/tokens/spans.pxd (new file, +9 lines)
View File

@ -0,0 +1,9 @@
from .doc cimport Doc
cdef class Span:
cdef readonly Doc _seq
cdef public int i
cdef public int start
cdef public int end
cdef readonly int label

View File

@ -1,6 +1,10 @@
from __future__ import unicode_literals
from collections import defaultdict
from ..structs cimport Morphology, TokenC, LexemeC
from ..typedefs cimport flags_t, attr_id_t, attr_t
from ..parts_of_speech cimport univ_pos_t
cdef class Span:
"""A slice from a Doc object."""

View File

@ -3,6 +3,18 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
from ..lexeme cimport check_flag
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()
import numpy
from ..parts_of_speech import UNIV_POS_NAMES
from ..typedefs cimport LEMMA
from ..typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..typedefs cimport POS, LEMMA, TAG, DEP
from ..parts_of_speech cimport CONJ, PUNCT
cdef class Token:
@ -279,3 +291,6 @@ cdef class Token:
property dep_:
def __get__(self):
return self.vocab.strings[self.c.dep]
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}