From 6eef0bf9ab4a571991ed419e73e918e62d5224c2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 13 Jul 2015 20:20:58 +0200 Subject: [PATCH] * Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx --- spacy/en/pos.pyx | 2 +- spacy/spans.pxd | 14 -------------- spacy/syntax/_parse_features.pyx | 2 +- spacy/syntax/parser.pxd | 3 ++- spacy/syntax/parser.pyx | 4 +++- spacy/tokenizer.pxd | 2 +- spacy/tokenizer.pyx | 2 +- spacy/tokens/__init__.pxd | 5 ----- spacy/tokens/__init__.py | 4 ++-- spacy/tokens/doc.pxd | 2 -- spacy/tokens/spans.pxd | 9 +++++++++ spacy/{ => tokens}/spans.pyx | 4 ++++ spacy/tokens/token.pyx | 15 +++++++++++++++ 13 files changed, 39 insertions(+), 29 deletions(-) delete mode 100644 spacy/spans.pxd create mode 100644 spacy/tokens/spans.pxd rename spacy/{ => tokens}/spans.pyx (96%) diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index de795c1f3..a9a55d621 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -14,7 +14,7 @@ from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE from ..typedefs cimport id_t from ..structs cimport TokenC, Morphology, LexemeC -from ..tokens cimport Doc +from ..tokens.doc cimport Doc from ..morphology cimport set_morph_from_dict from .._ml cimport arg_max diff --git a/spacy/spans.pxd b/spacy/spans.pxd deleted file mode 100644 index 8afcdfa6a..000000000 --- a/spacy/spans.pxd +++ /dev/null @@ -1,14 +0,0 @@ -from .tokens cimport Doc -from .typedefs cimport flags_t, attr_id_t, attr_t -from .parts_of_speech cimport univ_pos_t -from .structs cimport Morphology, TokenC, LexemeC -from .vocab cimport Vocab -from .strings cimport StringStore - - -cdef class Span: - cdef readonly Doc _seq - cdef public int i - cdef public int start - cdef public int end - cdef readonly int label diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index 35a49f21c..3220fb7f5 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -12,7 +12,7 @@ from libc.string cimport memset from itertools import combinations -from ..tokens cimport TokenC +from ..structs cimport TokenC from .stateclass cimport StateClass diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index dfc5c74a2..2c17464e7 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -4,7 +4,8 @@ from .._ml cimport Model from .arc_eager cimport TransitionSystem -from ..tokens cimport Doc, TokenC +from ..tokens.doc cimport Doc +from ..structs cimport TokenC cdef class Parser: diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 89102a454..a3c2eb886 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -32,7 +32,9 @@ from thinc.learner cimport LinearModel from thinc.search cimport Beam from thinc.search cimport MaxViolation -from ..tokens cimport Doc, TokenC +from ..structs cimport TokenC + +from ..tokens.doc cimport Doc from ..strings cimport StringStore diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index d5c68f8e5..1d3c5b9c3 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -8,7 +8,7 @@ from cymem.cymem cimport Pool from .typedefs cimport hash_t from .structs cimport LexemeC, TokenC, Morphology, UniStr from .strings cimport StringStore -from .tokens cimport Doc +from .tokens.doc cimport Doc from .vocab cimport Vocab, _Cached diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index d287ec9ca..aa348abd0 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -16,7 +16,7 @@ from .morphology cimport set_morph_from_dict from . import util from .util import read_lang_data -from .tokens import Doc +from .tokens.doc cimport Doc cdef class Tokenizer: diff --git a/spacy/tokens/__init__.pxd b/spacy/tokens/__init__.pxd index 3148fc08e..e69de29bb 100644 --- a/spacy/tokens/__init__.pxd +++ b/spacy/tokens/__init__.pxd @@ -1,5 +0,0 @@ -from .doc cimport Doc -from .token cimport Token -from .spans cimport Spans - -__all__ = [Doc, Token, Spans] diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index aa55dbaeb..9950ee703 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -1,5 +1,5 @@ from .doc import Doc from .token import Token -from .spans import Spans +from .spans import Span -__all__ = [Doc, Token, Spans] +__all__ = [Doc, Token, Span] diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index dc9fa6064..63f5bd815 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -31,5 +31,3 @@ cdef class Doc: cpdef np.ndarray to_array(self, object features) cdef int set_parse(self, const TokenC* parsed) except -1 - - diff --git a/spacy/tokens/spans.pxd b/spacy/tokens/spans.pxd new file mode 100644 index 000000000..d9704ad1f --- /dev/null +++ b/spacy/tokens/spans.pxd @@ -0,0 +1,9 @@ +from .doc cimport Doc + + +cdef class Span: + cdef readonly Doc _seq + cdef public int i + cdef public int start + cdef public int end + cdef readonly int label diff --git a/spacy/spans.pyx b/spacy/tokens/spans.pyx similarity index 96% rename from spacy/spans.pyx rename to spacy/tokens/spans.pyx index d2acbedec..9385e9777 100644 --- a/spacy/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -1,6 +1,10 @@ from __future__ import unicode_literals from collections import defaultdict +from ..structs cimport Morphology, TokenC, LexemeC +from ..typedefs cimport flags_t, attr_id_t, attr_t +from ..parts_of_speech cimport univ_pos_t + cdef class Span: """A slice from a Doc object.""" diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index e34b8a806..b1bde6a13 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -3,6 +3,18 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free from ..lexeme cimport check_flag # Compiler crashes on memory view coercion without this. Should report bug. from cython.view cimport array as cvarray +cimport numpy as np +np.import_array() + +import numpy + + +from ..parts_of_speech import UNIV_POS_NAMES + +from ..typedefs cimport LEMMA +from ..typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER +from ..typedefs cimport POS, LEMMA, TAG, DEP +from ..parts_of_speech cimport CONJ, PUNCT cdef class Token: @@ -279,3 +291,6 @@ cdef class Token: property dep_: def __get__(self): return self.vocab.strings[self.c.dep] + + +_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}