From bb80937544c7579e95cc14a17b0e1b83ff4105a6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Dec 2014 18:45:16 +1100 Subject: [PATCH] * Upd docstrings --- spacy/en/__init__.py | 57 ++++++++++- spacy/en/pos.pyx | 6 ++ spacy/syntax/_parse_features.pyx | 162 ++++++++++++++++++++++++++++--- spacy/tokens.pyx | 73 +++++++++++--- spacy/vocab.pyx | 3 +- 5 files changed, 272 insertions(+), 29 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 3ccc0ceb6..396802ab3 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -16,16 +16,64 @@ def get_lex_props(string): class English(object): - def __init__(self, data_dir=None, tag=True, parse=False): + """The English NLP pipeline. + + Provides a tokenizer, lexicon, part-of-speech tagger and parser. + + Keyword args: + data_dir (unicode): A path to a directory, from which to load the pipeline. + If None, looks for a directory named "data/" in the same directory as + the present file, i.e. path.join(path.dirname(__file__, 'data')). + If path.join(data_dir, 'pos') exists, the tagger is loaded from it. + If path.join(data_dir, 'deps') exists, the parser is loaded from it. + See Pipeline Directory Structure for details. + + Attributes: + vocab (spacy.vocab.Vocab): The lexicon. + + strings (spacy.strings.StringStore): Encode/decode strings to/from integer IDs. + + tokenizer (spacy.tokenizer.Tokenizer): The start of the pipeline. + + tagger (spacy.en.pos.EnPosTagger): + The part-of-speech tagger, which also performs lemmatization and + morphological analysis. + + parser (spacy.syntax.parser.GreedyParser): + A greedy shift-reduce dependency parser. + + + """ + def __init__(self, data_dir=None): if data_dir is None: data_dir = path.join(path.dirname(__file__), 'data') self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props) self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir) - self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None - self.parser = GreedyParser(path.join(data_dir, 'deps')) if parse else None + if path.exists(path.join(data_dir, 'pos')): + self.tagger = EnPosTagger(self.vocab.strings, data_dir) + else: + self.tagger = None + if path.exists(path.join(data_dir, 'deps')): + self.parser = GreedyParser(path.join(data_dir, 'deps')) + else: + self.parser = None self.strings = self.vocab.strings def __call__(self, text, tag=True, parse=True): + """Apply the pipeline to some text. + + Args: + text (unicode): The text to be processed. + + Keyword args: + tag (bool): Whether to add part-of-speech tags to the text. This + will also set morphological analysis and lemmas. + + parse (bool): Whether to add dependency-heads and labels to the text. 
+ + Returns: + tokens (spacy.tokens.Tokens): + """ tokens = self.tokenizer.tokenize(text) if self.tagger and tag: self.tagger(tokens) @@ -35,7 +83,10 @@ class English(object): @property def tags(self): + """List of part-of-speech tag names.""" if self.tagger is None: return [] else: return self.tagger.tag_names + + diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index d0a5c50b8..d973490ee 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -204,6 +204,7 @@ cdef struct _CachedMorph: cdef class EnPosTagger(Tagger): + """A part-of-speech tagger for English""" def __init__(self, StringStore strings, data_dir): model_dir = path.join(data_dir, 'pos') Tagger.__init__(self, path.join(model_dir)) @@ -224,6 +225,11 @@ cdef class EnPosTagger(Tagger): self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) def __call__(self, Tokens tokens): + """Apply the tagger, setting the POS tags onto the Tokens object. + + Args: + tokens (Tokens): The tokens to be tagged. + """ cdef int i cdef atom_t[N_CONTEXT_FIELDS] context cdef TokenC* t = tokens.data diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index caaae8dce..5c7f39ff7 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -13,7 +13,8 @@ from itertools import combinations from ..tokens cimport TokenC from ._state cimport State from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2 -from ._state cimport get_left, get_right +from ._state cimport has_head, get_left, get_right +from ._state cimport count_left_kids, count_right_kids cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: @@ -24,10 +25,12 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: context[3] = 0 context[4] = 0 context[5] = 0 + context[6] = 0 else: context[0] = token.lex.sic - context[1] = token.pos - context[2] = token.lex.cluster + context[1] = token.lemma + context[2] = token.fine_pos + context[3] = token.lex.cluster # We've read in the string little-endian, so now we can take & (2**n)-1 # to get the first n bits of the cluster. # e.g. s = "1110010101" @@ -40,9 +43,9 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: # What we're doing here is picking a number where all bits are 1, e.g. # 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in # the source that are set to 1. 
- context[3] = token.lex.cluster & 63 - context[4] = token.lex.cluster & 15 - context[5] = token.dep_tag + context[4] = token.lex.cluster & 63 + context[5] = token.lex.cluster & 15 + context[6] = token.dep_tag if has_head(token) else 0 cdef int fill_context(atom_t* context, State* state) except -1: @@ -66,12 +69,148 @@ cdef int fill_context(atom_t* context, State* state) except -1: context[dist] = state.stack[0] - state.i else: context[dist] = 0 - context[N0lv] = 0 - context[S0lv] = 0 - context[S0rv] = 0 - context[S1lv] = 0 - context[S1rv] = 0 + context[N0lv] = max(count_left_kids(get_n0(state)), 5) + context[S0lv] = max(count_left_kids(get_s0(state)), 5) + context[S0rv] = max(count_right_kids(get_s0(state)), 5) + context[S1lv] = max(count_left_kids(get_s1(state)), 5) + context[S1rv] = max(count_right_kids(get_s1(state)), 5) + context[S0_has_head] = 0 + context[S1_has_head] = 0 + context[S2_has_head] = 0 + if state.stack_len >= 1: + context[S0_has_head] = has_head(get_s0(state)) + 1 + if state.stack_len >= 2: + context[S1_has_head] = has_head(get_s1(state)) + 1 + if state.stack_len >= 3: + context[S2_has_head] = has_head(get_s2(state)) + + +unigrams = ( + (S2W, S2p), + (S2c6, S2p), + + (S1W, S1p), + (S1c6, S1p), + + (S0W, S0p), + (S0c6, S0p), + + (N0W, N0p), + (N0p,), + (N0c,), + (N0c6, N0p), + (N0L,), + + (N1W, N1p), + (N1c6, N1p), + + (N2W, N2p), + (N2c6, N2p), + + (S0r2W, S0r2p), + (S0r2c6, S0r2p), + (S0r2L,), + + (S0rW, S0rp), + (S0rc6, S0rp), + (S0rL,), + + (S0l2W, S0l2p), + (S0l2c6, S0l2p), + (S0l2L,), + + (S0lW, S0lp), + (S0lc6, S0lp), + (S0lL,), + + (N0l2W, N0l2p), + (N0l2c6, N0l2p), + (N0l2L,), + + (N0lW, N0lp), + (N0lc6, N0lp), + (N0lL,), +) + + +s0_n0 = ( + (S0W, S0p, N0W, N0p), + (S0c, S0p, N0c, N0p), + (S0c6, S0p, N0c6, N0p), + (S0c4, S0p, N0c4, N0p), + (S0p, N0p), + (S0W, N0p), + (S0p, N0W), + (S0W, N0c), + (S0c, N0W), + (S0p, N0c), + (S0c, N0p), + (S0W, S0rp, N0p), + (S0p, S0rp, N0p), + (S0p, N0lp, N0W), + (S0p, N0lp, N0p), +) + + +s1_n0 = ( + (S1p, N0p), + (S1c, N0c), + (S1c, N0p), + (S1p, N0c), + (S1W, S1p, N0p), + (S1p, N0W, N0p), + (S1c6, S1p, N0c6, N0p), +) + + +s0_n1 = ( + (S0p, N1p), + (S0c, N1c), + (S0c, N1p), + (S0p, N1c), + (S0W, S0p, N1p), + (S0p, N1W, N1p), + (S0c6, S0p, N1c6, N1p), +) + +n0_n1 = ( + (N0W, N0p, N1W, N1p), + (N0W, N0p, N1p), + (N0p, N1W, N1p), + (N0c, N0p, N1c, N1p), + (N0c6, N0p, N1c6, N1p), + (N0c, N1c), + (N0p, N1c), +) + +tree_shape = ( + (dist,), + (S0p, S0_has_head, S1_has_head, S2_has_head), + (S0p, S0lv, S0rv), + (N0p, N0lv), +) + +trigrams = ( + (N0p, N1p, N2p), + (S0p, S0lp, S0l2p), + (S0p, S0rp, S0r2p), + (S0p, S1p, S2p), + (S1p, S0p, N0p), + (S0p, S0lp, N0p), + (S0p, N0p, N0lp), + (N0p, N0lp, N0l2p), + + (S0W, S0p, S0rL, S0r2L), + (S0p, S0rL, S0r2L), + + (S0W, S0p, S0lL, S0l2L), + (S0p, S0lL, S0l2L), + + (N0W, N0p, N0lL, N0l2L), + (N0p, N0lL, N0l2L), +) + arc_eager = ( (S0w, S0p), @@ -86,7 +225,6 @@ arc_eager = ( (N2w, N2p), (N2w,), (N2p,), - (S0w, S0p, N0w, N0p), (S0w, S0p, N0w), (S0w, N0w, N0p), diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index d4ffbb701..009a0ecb6 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,4 +1,5 @@ # cython: profile=True + from preshed.maps cimport PreshMap from preshed.counter cimport PreshCounter @@ -59,13 +60,7 @@ cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil: cdef class Tokens: - """A sequence of references to Lexeme objects. - - The Tokens class provides fast and memory-efficient access to lexical features, - and can efficiently export the data to a numpy array. 
- - >>> from spacy.en import EN - >>> tokens = EN.tokenize('An example sentence.') + """Access and set annotations onto some text. """ def __init__(self, Vocab vocab, string_length=0): self.vocab = vocab @@ -86,10 +81,20 @@ cdef class Tokens: self.length = 0 def __getitem__(self, i): + """Retrieve a token. + + Returns: + token (Token): + """ bounds_check(i, self.length, PADDING) return Token(self, i) def __iter__(self): + """Iterate over the tokens. + + Yields: + token (Token): + """ for i in range(self.length): yield self[i] @@ -148,6 +153,11 @@ cdef class Tokens: @cython.freelist(64) cdef class Token: + """An individual token. + + Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens + object. + """ def __init__(self, Tokens tokens, int i): self._seq = tokens self.i = i @@ -163,21 +173,44 @@ cdef class Token: return self.string + ' ' def __len__(self): + """The number of unicode code-points in the original string. + + Returns: + length (int): + """ return self._seq.data[self.i].lex.length property idx: + """The index into the original string at which the token starts. + + The following is supposed to always be true: + + >>> original_string[token.idx:token.idx len(token) == token.string + """ def __get__(self): return self._seq.data[self.i].idx - property length: - def __get__(self): - return self._seq.data[self.i].lex.length - property cluster: + """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering + + Similar words have better-than-chance likelihood of having similar cluster + IDs, although the clustering is quite noisy. Cluster IDs make good features, + and help to make models slightly more robust to domain variation. + + A common trick is to use only the first N bits of a cluster ID in a feature, + as the more general part of the hierarchical clustering is often more accurate + than the lower categories. + + To assist in this, I encode the cluster IDs little-endian, to allow a simple + bit-mask: + + >>> six_bits = cluster & (2**6 - 1) + """ def __get__(self): return self._seq.data[self.i].lex.cluster property string: + """The unicode string of the word, with no whitespace padding.""" def __get__(self): cdef const TokenC* t = &self._seq.data[self.i] if t.lex.sic == 0: @@ -186,6 +219,9 @@ cdef class Token: return utf8string.decode('utf8') property lemma: + """The unicode string of the word's lemma. If no part-of-speech tag is + assigned, the most common part-of-speech tag of the word is used. + """ def __get__(self): cdef const TokenC* t = &self._seq.data[self.i] if t.lemma == 0: @@ -193,15 +229,27 @@ cdef class Token: cdef bytes utf8string = self._seq.vocab.strings[t.lemma] return utf8string.decode('utf8') - property dep: + property dep_tag: + """The ID integer of the word's dependency label. If no parse has been + assigned, defaults to 0. + """ def __get__(self): return self._seq.data[self.i].dep_tag property pos: + """The ID integer of the word's part-of-speech tag, from the 13-tag + Google Universal Tag Set. Constants for this tag set are available in + spacy.typedefs. + """ def __get__(self): return self._seq.data[self.i].pos property fine_pos: + """The ID integer of the word's fine-grained part-of-speech tag, as assigned + by the tagger model. Fine-grained tags include morphological information, + and other distinctions, and allow a more accurate tagger to be trained. 
+ """ + def __get__(self): return self._seq.data[self.i].fine_pos @@ -210,6 +258,7 @@ cdef class Token: return self._seq.data[self.i].lex.sic property head: + """The token predicted by the parser to be the head of the current token.""" def __get__(self): cdef const TokenC* t = &self._seq.data[self.i] return Token(self._seq, self.i + t.head) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index b0722d312..1b5fb9443 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -33,8 +33,6 @@ cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed, cdef class Vocab: '''A map container for a language's Lexeme structs. - - Also interns UTF-8 strings, and maps them to consecutive integer IDs. ''' def __init__(self, data_dir=None, get_lex_props=None): self.mem = Pool() @@ -53,6 +51,7 @@ cdef class Vocab: self.load(path.join(data_dir, 'lexemes')) def __len__(self): + """The current number of lexemes stored.""" return self.lexemes.size() cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
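A minimal usage sketch of the pipeline API that the new docstrings describe. The constructor, the __call__ signature and the Token attributes are taken from this patch; the variable names, the example sentence and the print loop are illustrative only.

    from spacy.en import English

    # With data_dir=None the loader looks for a "data/" directory next to
    # spacy/en/__init__.py. The tagger is only constructed if data/pos
    # exists, and the parser only if data/deps exists; otherwise they stay None.
    nlp = English(data_dir=None)

    # tag=True sets fine-grained tags, lemmas and morphological analysis;
    # parse=True additionally sets dependency heads and labels. Either step
    # is skipped if the corresponding model was not loaded.
    tokens = nlp(u'An example sentence.', tag=True, parse=True)

    for token in tokens:
        # string and lemma are unicode; pos, fine_pos and dep_tag are integer IDs.
        print((token.string, token.lemma, token.pos, token.dep_tag))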
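The little-endian cluster encoding mentioned in the fill_token comments and in the Token.cluster docstring can be checked with a few lines of plain Python. The bit string below is the example from the comment; everything else is illustrative.

    # Brown cluster IDs are paths in a binary merge tree, read in little-endian,
    # so the first bit of the path is the lowest bit of the integer and a
    # simple mask recovers the first N bits (a coarser, more general cluster).
    path = "1110010101"                 # example path from the comment above
    cluster = int(path[::-1], 2)        # little-endian encoding of the path

    six_bits = cluster & (2 ** 6 - 1)   # i.e. cluster & 63
    four_bits = cluster & (2 ** 4 - 1)  # i.e. cluster & 15
    assert six_bits == int(path[:6][::-1], 2)
    assert four_bits == int(path[:4][::-1], 2)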
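The tuples added to _parse_features.pyx (unigrams, s0_n0, s1_n0, s0_n1, n0_n1, tree_shape, trigrams) are feature templates over the atom array that fill_context populates. The real extraction runs in compiled code; the pure-Python sketch below is only a hypothetical analogue showing how such templates are consumed, and extract_features is not part of spaCy.

    def extract_features(context, templates):
        # context: flat sequence of atoms written by fill_context, indexed by
        # the S0W/S0p/N0c6/... constants; templates: tuples of those indices.
        features = []
        for template in templates:
            values = tuple(context[index] for index in template)
            # Missing tokens leave zeros in their slots (see fill_token), so
            # an all-zero value tuple carries no information and is dropped.
            if any(values):
                features.append((template, values))
        return features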