From bb80937544c7579e95cc14a17b0e1b83ff4105a6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Dec 2014 18:45:16 +1100 Subject: [PATCH] * Upd docstrings --- spacy/en/__init__.py | 57 ++++++++++- spacy/en/pos.pyx | 6 ++ spacy/syntax/_parse_features.pyx | 162 ++++++++++++++++++++++++++++--- spacy/tokens.pyx | 73 +++++++++++--- spacy/vocab.pyx | 3 +- 5 files changed, 272 insertions(+), 29 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 3ccc0ceb6..396802ab3 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -16,16 +16,64 @@ def get_lex_props(string): class English(object): - def __init__(self, data_dir=None, tag=True, parse=False): + """The English NLP pipeline. + + Provides a tokenizer, lexicon, part-of-speech tagger and parser. + + Keyword args: + data_dir (unicode): A path to a directory, from which to load the pipeline. + If None, looks for a directory named "data/" in the same directory as + the present file, i.e. path.join(path.dirname(__file__, 'data')). + If path.join(data_dir, 'pos') exists, the tagger is loaded from it. + If path.join(data_dir, 'deps') exists, the parser is loaded from it. + See Pipeline Directory Structure for details. + + Attributes: + vocab (spacy.vocab.Vocab): The lexicon. + + strings (spacy.strings.StringStore): Encode/decode strings to/from integer IDs. + + tokenizer (spacy.tokenizer.Tokenizer): The start of the pipeline. + + tagger (spacy.en.pos.EnPosTagger): + The part-of-speech tagger, which also performs lemmatization and + morphological analysis. + + parser (spacy.syntax.parser.GreedyParser): + A greedy shift-reduce dependency parser. + + + """ + def __init__(self, data_dir=None): if data_dir is None: data_dir = path.join(path.dirname(__file__), 'data') self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props) self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir) - self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None - self.parser = GreedyParser(path.join(data_dir, 'deps')) if parse else None + if path.exists(path.join(data_dir, 'pos')): + self.tagger = EnPosTagger(self.vocab.strings, data_dir) + else: + self.tagger = None + if path.exists(path.join(data_dir, 'deps')): + self.parser = GreedyParser(path.join(data_dir, 'deps')) + else: + self.parser = None self.strings = self.vocab.strings def __call__(self, text, tag=True, parse=True): + """Apply the pipeline to some text. + + Args: + text (unicode): The text to be processed. + + Keyword args: + tag (bool): Whether to add part-of-speech tags to the text. This + will also set morphological analysis and lemmas. + + parse (bool): Whether to add dependency-heads and labels to the text. 
+ + Returns: + tokens (spacy.tokens.Tokens): + """ tokens = self.tokenizer.tokenize(text) if self.tagger and tag: self.tagger(tokens) @@ -35,7 +83,10 @@ class English(object): @property def tags(self): + """List of part-of-speech tag names.""" if self.tagger is None: return [] else: return self.tagger.tag_names + + diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index d0a5c50b8..d973490ee 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -204,6 +204,7 @@ cdef struct _CachedMorph: cdef class EnPosTagger(Tagger): + """A part-of-speech tagger for English""" def __init__(self, StringStore strings, data_dir): model_dir = path.join(data_dir, 'pos') Tagger.__init__(self, path.join(model_dir)) @@ -224,6 +225,11 @@ cdef class EnPosTagger(Tagger): self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) def __call__(self, Tokens tokens): + """Apply the tagger, setting the POS tags onto the Tokens object. + + Args: + tokens (Tokens): The tokens to be tagged. + """ cdef int i cdef atom_t[N_CONTEXT_FIELDS] context cdef TokenC* t = tokens.data diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index caaae8dce..5c7f39ff7 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -13,7 +13,8 @@ from itertools import combinations from ..tokens cimport TokenC from ._state cimport State from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2 -from ._state cimport get_left, get_right +from ._state cimport has_head, get_left, get_right +from ._state cimport count_left_kids, count_right_kids cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: @@ -24,10 +25,12 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: context[3] = 0 context[4] = 0 context[5] = 0 + context[6] = 0 else: context[0] = token.lex.sic - context[1] = token.pos - context[2] = token.lex.cluster + context[1] = token.lemma + context[2] = token.fine_pos + context[3] = token.lex.cluster # We've read in the string little-endian, so now we can take & (2**n)-1 # to get the first n bits of the cluster. # e.g. s = "1110010101" @@ -40,9 +43,9 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: # What we're doing here is picking a number where all bits are 1, e.g. # 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in # the source that are set to 1. 
- context[3] = token.lex.cluster & 63 - context[4] = token.lex.cluster & 15 - context[5] = token.dep_tag + context[4] = token.lex.cluster & 63 + context[5] = token.lex.cluster & 15 + context[6] = token.dep_tag if has_head(token) else 0 cdef int fill_context(atom_t* context, State* state) except -1: @@ -66,12 +69,148 @@ cdef int fill_context(atom_t* context, State* state) except -1: context[dist] = state.stack[0] - state.i else: context[dist] = 0 - context[N0lv] = 0 - context[S0lv] = 0 - context[S0rv] = 0 - context[S1lv] = 0 - context[S1rv] = 0 + context[N0lv] = max(count_left_kids(get_n0(state)), 5) + context[S0lv] = max(count_left_kids(get_s0(state)), 5) + context[S0rv] = max(count_right_kids(get_s0(state)), 5) + context[S1lv] = max(count_left_kids(get_s1(state)), 5) + context[S1rv] = max(count_right_kids(get_s1(state)), 5) + context[S0_has_head] = 0 + context[S1_has_head] = 0 + context[S2_has_head] = 0 + if state.stack_len >= 1: + context[S0_has_head] = has_head(get_s0(state)) + 1 + if state.stack_len >= 2: + context[S1_has_head] = has_head(get_s1(state)) + 1 + if state.stack_len >= 3: + context[S2_has_head] = has_head(get_s2(state)) + + +unigrams = ( + (S2W, S2p), + (S2c6, S2p), + + (S1W, S1p), + (S1c6, S1p), + + (S0W, S0p), + (S0c6, S0p), + + (N0W, N0p), + (N0p,), + (N0c,), + (N0c6, N0p), + (N0L,), + + (N1W, N1p), + (N1c6, N1p), + + (N2W, N2p), + (N2c6, N2p), + + (S0r2W, S0r2p), + (S0r2c6, S0r2p), + (S0r2L,), + + (S0rW, S0rp), + (S0rc6, S0rp), + (S0rL,), + + (S0l2W, S0l2p), + (S0l2c6, S0l2p), + (S0l2L,), + + (S0lW, S0lp), + (S0lc6, S0lp), + (S0lL,), + + (N0l2W, N0l2p), + (N0l2c6, N0l2p), + (N0l2L,), + + (N0lW, N0lp), + (N0lc6, N0lp), + (N0lL,), +) + + +s0_n0 = ( + (S0W, S0p, N0W, N0p), + (S0c, S0p, N0c, N0p), + (S0c6, S0p, N0c6, N0p), + (S0c4, S0p, N0c4, N0p), + (S0p, N0p), + (S0W, N0p), + (S0p, N0W), + (S0W, N0c), + (S0c, N0W), + (S0p, N0c), + (S0c, N0p), + (S0W, S0rp, N0p), + (S0p, S0rp, N0p), + (S0p, N0lp, N0W), + (S0p, N0lp, N0p), +) + + +s1_n0 = ( + (S1p, N0p), + (S1c, N0c), + (S1c, N0p), + (S1p, N0c), + (S1W, S1p, N0p), + (S1p, N0W, N0p), + (S1c6, S1p, N0c6, N0p), +) + + +s0_n1 = ( + (S0p, N1p), + (S0c, N1c), + (S0c, N1p), + (S0p, N1c), + (S0W, S0p, N1p), + (S0p, N1W, N1p), + (S0c6, S0p, N1c6, N1p), +) + +n0_n1 = ( + (N0W, N0p, N1W, N1p), + (N0W, N0p, N1p), + (N0p, N1W, N1p), + (N0c, N0p, N1c, N1p), + (N0c6, N0p, N1c6, N1p), + (N0c, N1c), + (N0p, N1c), +) + +tree_shape = ( + (dist,), + (S0p, S0_has_head, S1_has_head, S2_has_head), + (S0p, S0lv, S0rv), + (N0p, N0lv), +) + +trigrams = ( + (N0p, N1p, N2p), + (S0p, S0lp, S0l2p), + (S0p, S0rp, S0r2p), + (S0p, S1p, S2p), + (S1p, S0p, N0p), + (S0p, S0lp, N0p), + (S0p, N0p, N0lp), + (N0p, N0lp, N0l2p), + + (S0W, S0p, S0rL, S0r2L), + (S0p, S0rL, S0r2L), + + (S0W, S0p, S0lL, S0l2L), + (S0p, S0lL, S0l2L), + + (N0W, N0p, N0lL, N0l2L), + (N0p, N0lL, N0l2L), +) + arc_eager = ( (S0w, S0p), @@ -86,7 +225,6 @@ arc_eager = ( (N2w, N2p), (N2w,), (N2p,), - (S0w, S0p, N0w, N0p), (S0w, S0p, N0w), (S0w, N0w, N0p), diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index d4ffbb701..009a0ecb6 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,4 +1,5 @@ # cython: profile=True + from preshed.maps cimport PreshMap from preshed.counter cimport PreshCounter @@ -59,13 +60,7 @@ cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil: cdef class Tokens: - """A sequence of references to Lexeme objects. - - The Tokens class provides fast and memory-efficient access to lexical features, - and can efficiently export the data to a numpy array. 
- - >>> from spacy.en import EN - >>> tokens = EN.tokenize('An example sentence.') + """Access and set annotations onto some text. """ def __init__(self, Vocab vocab, string_length=0): self.vocab = vocab @@ -86,10 +81,20 @@ cdef class Tokens: self.length = 0 def __getitem__(self, i): + """Retrieve a token. + + Returns: + token (Token): + """ bounds_check(i, self.length, PADDING) return Token(self, i) def __iter__(self): + """Iterate over the tokens. + + Yields: + token (Token): + """ for i in range(self.length): yield self[i] @@ -148,6 +153,11 @@ cdef class Tokens: @cython.freelist(64) cdef class Token: + """An individual token. + + Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens + object. + """ def __init__(self, Tokens tokens, int i): self._seq = tokens self.i = i @@ -163,21 +173,44 @@ cdef class Token: return self.string + ' ' def __len__(self): + """The number of unicode code-points in the original string. + + Returns: + length (int): + """ return self._seq.data[self.i].lex.length property idx: + """The index into the original string at which the token starts. + + The following is supposed to always be true: + + >>> original_string[token.idx:token.idx len(token) == token.string + """ def __get__(self): return self._seq.data[self.i].idx - property length: - def __get__(self): - return self._seq.data[self.i].lex.length - property cluster: + """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering + + Similar words have better-than-chance likelihood of having similar cluster + IDs, although the clustering is quite noisy. Cluster IDs make good features, + and help to make models slightly more robust to domain variation. + + A common trick is to use only the first N bits of a cluster ID in a feature, + as the more general part of the hierarchical clustering is often more accurate + than the lower categories. + + To assist in this, I encode the cluster IDs little-endian, to allow a simple + bit-mask: + + >>> six_bits = cluster & (2**6 - 1) + """ def __get__(self): return self._seq.data[self.i].lex.cluster property string: + """The unicode string of the word, with no whitespace padding.""" def __get__(self): cdef const TokenC* t = &self._seq.data[self.i] if t.lex.sic == 0: @@ -186,6 +219,9 @@ cdef class Token: return utf8string.decode('utf8') property lemma: + """The unicode string of the word's lemma. If no part-of-speech tag is + assigned, the most common part-of-speech tag of the word is used. + """ def __get__(self): cdef const TokenC* t = &self._seq.data[self.i] if t.lemma == 0: @@ -193,15 +229,27 @@ cdef class Token: cdef bytes utf8string = self._seq.vocab.strings[t.lemma] return utf8string.decode('utf8') - property dep: + property dep_tag: + """The ID integer of the word's dependency label. If no parse has been + assigned, defaults to 0. + """ def __get__(self): return self._seq.data[self.i].dep_tag property pos: + """The ID integer of the word's part-of-speech tag, from the 13-tag + Google Universal Tag Set. Constants for this tag set are available in + spacy.typedefs. + """ def __get__(self): return self._seq.data[self.i].pos property fine_pos: + """The ID integer of the word's fine-grained part-of-speech tag, as assigned + by the tagger model. Fine-grained tags include morphological information, + and other distinctions, and allow a more accurate tagger to be trained. 
+ """ + def __get__(self): return self._seq.data[self.i].fine_pos @@ -210,6 +258,7 @@ cdef class Token: return self._seq.data[self.i].lex.sic property head: + """The token predicted by the parser to be the head of the current token.""" def __get__(self): cdef const TokenC* t = &self._seq.data[self.i] return Token(self._seq, self.i + t.head) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index b0722d312..1b5fb9443 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -33,8 +33,6 @@ cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed, cdef class Vocab: '''A map container for a language's Lexeme structs. - - Also interns UTF-8 strings, and maps them to consecutive integer IDs. ''' def __init__(self, data_dir=None, get_lex_props=None): self.mem = Pool() @@ -53,6 +51,7 @@ cdef class Vocab: self.load(path.join(data_dir, 'lexemes')) def __len__(self): + """The current number of lexemes stored.""" return self.lexemes.size() cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
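A minimal usage sketch of the pipeline API that the new docstrings describe. The constructor, the __call__ signature and the Token attributes are taken from this patch; the variable names, the example sentence and the print loop are illustrative only.

    from spacy.en import English

    # With data_dir=None the loader looks for a "data/" directory next to
    # spacy/en/__init__.py. The tagger is only constructed if data/pos
    # exists, and the parser only if data/deps exists; otherwise they stay None.
    nlp = English(data_dir=None)

    # tag=True sets fine-grained tags, lemmas and morphological analysis;
    # parse=True additionally sets dependency heads and labels. Either step
    # is skipped if the corresponding model was not loaded.
    tokens = nlp(u'An example sentence.', tag=True, parse=True)

    for token in tokens:
        # string and lemma are unicode; pos, fine_pos and dep_tag are integer IDs.
        print((token.string, token.lemma, token.pos, token.dep_tag))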
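The little-endian cluster encoding mentioned in the fill_token comments and in the Token.cluster docstring can be checked with a few lines of plain Python. The bit string below is the example from the comment; everything else is illustrative.

    # Brown cluster IDs are paths in a binary merge tree, read in little-endian,
    # so the first bit of the path is the lowest bit of the integer and a
    # simple mask recovers the first N bits (a coarser, more general cluster).
    path = "1110010101"                 # example path from the comment above
    cluster = int(path[::-1], 2)        # little-endian encoding of the path

    six_bits = cluster & (2 ** 6 - 1)   # i.e. cluster & 63
    four_bits = cluster & (2 ** 4 - 1)  # i.e. cluster & 15
    assert six_bits == int(path[:6][::-1], 2)
    assert four_bits == int(path[:4][::-1], 2)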
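The tuples added to _parse_features.pyx (unigrams, s0_n0, s1_n0, s0_n1, n0_n1, tree_shape, trigrams) are feature templates over the atom array that fill_context populates. The real extraction runs in compiled code; the pure-Python sketch below is only a hypothetical analogue showing how such templates are consumed, and extract_features is not part of spaCy.

    def extract_features(context, templates):
        # context: flat sequence of atoms written by fill_context, indexed by
        # the S0W/S0p/N0c6/... constants; templates: tuples of those indices.
        features = []
        for template in templates:
            values = tuple(context[index] for index in template)
            # Missing tokens leave zeros in their slots (see fill_token), so
            # an all-zero value tuple carries no information and is dropped.
            if any(values):
                features.append((template, values))
        return features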