mirror of https://github.com/explosion/spaCy.git

Fix doc strings

commit b86f8af0c1 (parent 18aab4f71e)
@@ -212,6 +212,7 @@ def _consume_ent(tags):


 cdef class GoldParse:
+    """Collection for training annotations."""
     @classmethod
     def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
         _, words, tags, heads, deps, entities = annot_tuples

@@ -220,6 +221,25 @@ cdef class GoldParse:
     def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                  deps=None, entities=None, make_projective=False):
+        """Create a GoldParse.
+
+        Arguments:
+            doc (Doc):
+                The document the annotations refer to.
+            words:
+                A sequence of unicode word strings.
+            tags:
+                A sequence of strings, representing tag annotations.
+            heads:
+                A sequence of integers, representing syntactic head offsets.
+            deps:
+                A sequence of strings, representing the syntactic relation types.
+            entities:
+                A sequence of named entity annotations, either as BILUO tag strings,
+                or as (start_char, end_char, label) tuples, representing the entity
+                positions.
+        Returns (GoldParse): The newly constructed object.
+        """
         if words is None:
             words = [token.text for token in doc]
         if tags is None:

@@ -280,10 +300,16 @@
         self.heads = proj_heads

     def __len__(self):
+        """Get the number of gold-standard tokens.
+
+        Returns (int): The number of gold-standard tokens.
+        """
         return self.length

     @property
     def is_projective(self):
+        """Whether the provided syntactic annotations form a projective dependency
+        tree."""
         return not nonproj.is_nonproj_tree(self.heads)
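A minimal usage sketch of the GoldParse constructor documented above (spaCy 1.x-era API; the sentence, tags, and entity annotation are illustrative):

    from spacy.en import English
    from spacy.gold import GoldParse

    nlp = English()
    doc = nlp(u'London is big.')
    # Entities may be given as (start_char, end_char, label) tuples, per the docstring.
    gold = GoldParse(doc, tags=[u'NNP', u'VBZ', u'JJ', u'.'],
                     entities=[(0, 6, u'GPE')])
    print(len(gold))  # the number of gold-standard tokens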
@@ -293,13 +293,14 @@ class Language(object):
         text (unicode): The text to be processed.

         Returns:
-            tokens (spacy.tokens.Doc):
+            doc (Doc): A container for accessing the annotations.

-        >>> from spacy.en import English
-        >>> nlp = English()
-        >>> tokens = nlp('An example sentence. Another example sentence.')
-        >>> tokens[0].orth_, tokens[0].head.tag_
-        ('An', 'NN')
+        Example:
+            >>> from spacy.en import English
+            >>> nlp = English()
+            >>> tokens = nlp('An example sentence. Another example sentence.')
+            >>> tokens[0].orth_, tokens[0].head.tag_
+            ('An', 'NN')
         """
         doc = self.make_doc(text)
         if self.entity and entity:

@@ -314,6 +315,16 @@ class Language(object):
         return doc

     def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
+        '''Process texts as a stream, and yield Doc objects in order.
+
+        Supports GIL-free multi-threading.
+
+        Arguments:
+            texts (iterator)
+            tag (bool)
+            parse (bool)
+            entity (bool)
+        '''
         skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
         stream = (self.make_doc(text) for text in texts)
         for proc in self.pipeline:
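A brief sketch of the streaming API described by the new pipe() docstring (assumes the spaCy 1.x English model is installed):

    from spacy.en import English

    nlp = English()
    texts = [u'One document.', u'Another document.']
    # Docs come back in input order; pipeline stages can be toggled per stream.
    for doc in nlp.pipe(texts, batch_size=50, n_threads=4, entity=False):
        print(doc[0].orth_)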
@@ -36,6 +36,13 @@ cdef class Lexeme:
     tag).
     """
     def __init__(self, Vocab vocab, int orth):
+        """Create a Lexeme object.
+
+        Arguments:
+            vocab (Vocab): The parent vocabulary.
+            orth (int): The orth id of the lexeme.
+        Returns (Lexeme): The newly constructed object.
+        """
         self.vocab = vocab
         self.orth = orth
         self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)

@@ -73,12 +80,33 @@ cdef class Lexeme:
         return self.c.orth

     def set_flag(self, attr_id_t flag_id, bint value):
+        """Change the value of a boolean flag.
+
+        Arguments:
+            flag_id (int): The attribute ID of the flag to set.
+            value (bool): The new value of the flag.
+        """
         Lexeme.c_set_flag(self.c, flag_id, value)

     def check_flag(self, attr_id_t flag_id):
+        """Check the value of a boolean flag.
+
+        Arguments:
+            flag_id (int): The attribute ID of the flag to query.
+        Returns (bool): The value of the flag.
+        """
         return True if Lexeme.c_check_flag(self.c, flag_id) else False

     def similarity(self, other):
+        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
+        Arguments:
+            other:
+                The object to compare with. By default, accepts Doc, Span,
+                Token and Lexeme objects.
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
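A short sketch of the flag and similarity methods documented above (spaCy 1.x; assumes word vectors ship with the English model):

    from spacy.en import English
    from spacy.attrs import IS_STOP

    nlp = English()
    apple = nlp.vocab[u'apple']
    orange = nlp.vocab[u'orange']
    # Cosine over the lexemes' word vectors, as the docstring describes.
    print(apple.similarity(orange))
    # Boolean flags are read with check_flag(), using an attribute ID.
    print(apple.check_flag(IS_STOP))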
@@ -165,6 +165,7 @@ def _convert_strings(token_specs, string_store):


 cdef class Matcher:
+    '''Match sequences of tokens, based on pattern rules.'''
     cdef Pool mem
     cdef vector[TokenPatternC*] patterns
     cdef readonly Vocab vocab

@@ -175,6 +176,16 @@ cdef class Matcher:

     @classmethod
     def load(cls, path, vocab):
+        '''Load the matcher and patterns from a file path.
+
+        Arguments:
+            path (Path):
+                Path to a JSON-formatted patterns file.
+            vocab (Vocab):
+                The vocabulary that the documents to match over will refer to.
+        Returns:
+            Matcher: The newly constructed object.
+        '''
         if (path / 'gazetteer.json').exists():
             with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
                 patterns = json.load(file_)

@@ -183,6 +194,16 @@ cdef class Matcher:
         return cls(vocab, patterns)

     def __init__(self, vocab, patterns={}):
+        """Create the Matcher.
+
+        Arguments:
+            vocab (Vocab):
+                The vocabulary object, which must be shared with the documents
+                the matcher will operate on.
+            patterns (dict): Patterns to add to the matcher.
+        Returns:
+            The newly constructed object.
+        """
         self._patterns = {}
         self._entities = {}
         self._acceptors = {}

@@ -203,6 +224,22 @@ cdef class Matcher:

     def add_entity(self, entity_key, attrs=None, if_exists='raise',
                    acceptor=None, on_match=None):
+        """Add an entity to the matcher.
+
+        Arguments:
+            entity_key (unicode or int):
+                An ID for the entity.
+            attrs:
+                Attributes to associate with the Matcher.
+            if_exists ('raise', 'ignore' or 'update'):
+                Controls what happens if the entity ID already exists. Defaults to 'raise'.
+            acceptor:
+                Callback function to filter matches of the entity.
+            on_match:
+                Callback function to act on matches of the entity.
+        Returns:
+            None
+        """
         if if_exists not in ('raise', 'ignore', 'update'):
             raise ValueError(
                 "Unexpected value for if_exists: %s.\n"

@@ -224,6 +261,18 @@ cdef class Matcher:
         self._callbacks[entity_key] = on_match

     def add_pattern(self, entity_key, token_specs, label=""):
+        """Add a pattern to the matcher.
+
+        Arguments:
+            entity_key (unicode or int):
+                An ID for the entity.
+            token_specs:
+                Description of the pattern to be matched.
+            label:
+                Label to assign to the matched pattern. Defaults to "".
+        Returns:
+            None
+        """
         entity_key = self.normalize_entity_key(entity_key)
         if not self.has_entity(entity_key):
             self.add_entity(entity_key)

@@ -249,10 +298,24 @@ cdef class Matcher:
         return entity_key

     def has_entity(self, entity_key):
+        """Check whether the matcher has an entity.
+
+        Arguments:
+            entity_key (string or int): The entity key to check.
+        Returns:
+            bool: Whether the matcher has the entity.
+        """
         entity_key = self.normalize_entity_key(entity_key)
         return entity_key in self._entities

     def get_entity(self, entity_key):
+        """Retrieve the attributes stored for an entity.
+
+        Arguments:
+            entity_key (unicode or int): The entity to retrieve.
+        Returns:
+            The entity attributes if present, otherwise None.
+        """
         entity_key = self.normalize_entity_key(entity_key)
         if entity_key in self._entities:
             return self._entities[entity_key]

@@ -260,6 +323,17 @@ cdef class Matcher:
         return None

     def __call__(self, Doc doc, acceptor=None):
+        """Find all token sequences matching the supplied patterns on the Doc.
+
+        Arguments:
+            doc (Doc):
+                The document to match over.
+        Returns:
+            list: A list of (entity_key, label_id, start, end) tuples,
+                describing the matches. A match tuple describes a span
+                doc[start:end]. The label_id and entity_key are both integers.
+        """
         if acceptor is not None:
             raise ValueError(
                 "acceptor keyword argument to Matcher deprecated. Specify acceptor "

@@ -340,6 +414,18 @@ cdef class Matcher:
         return matches

     def pipe(self, docs, batch_size=1000, n_threads=2):
+        """Match a stream of documents, yielding them in turn.
+
+        Arguments:
+            docs: A stream of documents.
+            batch_size (int):
+                The number of documents to accumulate into a working set.
+            n_threads (int):
+                The number of threads with which to work on the buffer in parallel,
+                if the Matcher implementation supports multi-threading.
+        Yields:
+            Doc: Documents, in order.
+        """
         for doc in docs:
             self(doc)
             yield doc
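A usage sketch of the add_entity/add_pattern API documented above (spaCy 1.x pattern format; the entity key and pattern are illustrative):

    from spacy.en import English
    from spacy.matcher import Matcher
    from spacy.attrs import ORTH

    nlp = English()
    matcher = Matcher(nlp.vocab)
    matcher.add_entity(u'GoogleNow')
    # One attribute dict per token to match.
    matcher.add_pattern(u'GoogleNow', [{ORTH: u'Google'}, {ORTH: u'Now'}])
    doc = nlp(u'I like Google Now best.')
    for entity_key, label_id, start, end in matcher(doc):
        print(doc[start:end].text)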
@@ -11,6 +11,7 @@ from .attrs import DEP, ENT_TYPE


 cdef class EntityRecognizer(Parser):
+    """Annotate named entities on Doc objects."""
     TransitionSystem = BiluoPushDown

     feature_templates = get_feature_templates('ner')
@@ -73,6 +73,11 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except
 cdef class StringStore:
+    '''Map strings to and from integer IDs.'''
     def __init__(self, strings=None, freeze=False):
+        '''Create the StringStore.
+
+        Arguments:
+            strings: A sequence of unicode strings to add to the store.
+        '''
         self.mem = Pool()
         self._map = PreshMap()
         self._oov = PreshMap()

@@ -89,9 +94,22 @@ cdef class StringStore:
         return self.size - 1

     def __len__(self):
+        """The number of strings in the store.
+
+        Returns:
+            int: The number of strings in the store.
+        """
         return self.size - 1

     def __getitem__(self, object string_or_id):
+        """Retrieve a string from a given integer ID, or vice versa.
+
+        Arguments:
+            string_or_id (bytes or unicode or int):
+                The value to encode.
+        Returns:
+            unicode or int: The value retrieved.
+        """
         if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
             return 0
         elif string_or_id == 0:

@@ -127,12 +145,23 @@ cdef class StringStore:
         return utf8str - self.c

     def __contains__(self, unicode string not None):
+        """Check whether a string is in the store.
+
+        Arguments:
+            string (unicode): The string to check.
+        Returns (bool):
+            Whether the store contains the string.
+        """
         if len(string) == 0:
             return True
         cdef hash_t key = hash_string(string)
         return self._map.get(key) is not NULL

     def __iter__(self):
+        """Iterate over the strings in the store, in order.
+
+        Yields: unicode: A string in the store.
+        """
         cdef int i
         for i in range(self.size):
             yield _decode(&self.c[i]) if i > 0 else u''

@@ -185,6 +214,13 @@ cdef class StringStore:
         return &self.c[self.size-1]

     def dump(self, file_):
+        """Save the strings to a JSON file.
+
+        Arguments:
+            file_ (buffer): The file to save the strings to.
+        Returns:
+            None
+        """
         string_data = json.dumps(list(self))
         if not isinstance(string_data, unicode):
             string_data = string_data.decode('utf8')

@@ -192,6 +228,13 @@ cdef class StringStore:
         file_.write(string_data)

     def load(self, file_):
+        """Load the strings from a JSON file.
+
+        Arguments:
+            file_ (buffer): The file from which to load the strings.
+        Returns:
+            None
+        """
         strings = json.load(file_)
         if strings == ['']:
             return None
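A small sketch of the two-way mapping described above:

    from spacy.strings import StringStore

    strings = StringStore([u'apple', u'orange'])
    apple_id = strings[u'apple']          # unicode -> integer ID
    assert strings[apple_id] == u'apple'  # integer ID -> unicode
    print(len(strings), u'apple' in strings)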
@@ -74,8 +74,21 @@ cdef class ParserModel(AveragedPerceptron):


 cdef class Parser:
+    """Base class of the DependencyParser and EntityRecognizer."""
     @classmethod
     def load(cls, path, Vocab vocab, TransitionSystem=None, require=False):
+        """Load the statistical model from the supplied path.
+
+        Arguments:
+            path (Path):
+                The path to load from.
+            vocab (Vocab):
+                The vocabulary. Must be shared by the documents to be processed.
+            require (bool):
+                Whether to raise an error if the files are not found.
+        Returns (Parser):
+            The newly constructed object.
+        """
         with (path / 'config.json').open() as file_:
             cfg = json.load(file_)
         # TODO: remove this shim when we don't have to support older data

@@ -90,6 +103,16 @@ cdef class Parser:
         return self

     def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
+        """Create a Parser.
+
+        Arguments:
+            vocab (Vocab):
+                The vocabulary object. Must be shared with documents to be processed.
+            model (thinc.linear.AveragedPerceptron):
+                The statistical model.
+        Returns (Parser):
+            The newly constructed object.
+        """
         if TransitionSystem is None:
             TransitionSystem = self.TransitionSystem
         self.vocab = vocab

@@ -107,6 +130,13 @@ cdef class Parser:
         return (Parser, (self.vocab, self.moves, self.model), None, None)

     def __call__(self, Doc tokens):
+        """Apply the parser, setting the annotations onto the Doc object.
+
+        Arguments:
+            doc (Doc): The document to be processed.
+        Returns:
+            None
+        """
         cdef int nr_feat = self.model.nr_feat
         with nogil:
             status = self.parseC(tokens.c, tokens.length, nr_feat)

@@ -117,6 +147,16 @@ cdef class Parser:
         self.moves.finalize_doc(tokens)

     def pipe(self, stream, int batch_size=1000, int n_threads=2):
+        """Process a stream of documents.
+
+        Arguments:
+            stream: The sequence of documents to process.
+            batch_size (int):
+                The number of documents to accumulate into a working set.
+            n_threads (int):
+                The number of threads with which to work on the buffer in parallel.
+        Yields (Doc): Documents, in order.
+        """
         cdef Pool mem = Pool()
         cdef TokenC** doc_ptr = <TokenC**>mem.alloc(batch_size, sizeof(TokenC*))
         cdef int* lengths = <int*>mem.alloc(batch_size, sizeof(int))

@@ -194,6 +234,16 @@ cdef class Parser:
         return 0

     def update(self, Doc tokens, GoldParse gold):
+        """Update the statistical model.
+
+        Arguments:
+            doc (Doc):
+                The example document for the update.
+            gold (GoldParse):
+                The gold-standard annotations, to calculate the loss.
+        Returns (float):
+            The loss on this example.
+        """
         self.moves.preprocess_gold(gold)
         cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
         self.moves.initialize_state(stcls.c)

@@ -220,9 +270,24 @@ cdef class Parser:
         return loss

     def step_through(self, Doc doc):
+        """Set up a stepwise state, to introspect and control the transition sequence.
+
+        Arguments:
+            doc (Doc): The document to step through.
+        Returns (StepwiseState):
+            A state object, to step through the annotation process.
+        """
         return StepwiseState(self, doc)

     def from_transition_sequence(self, Doc doc, sequence):
+        """Control the annotations on a document by specifying a transition sequence
+        to follow.
+
+        Arguments:
+            doc (Doc): The document to annotate.
+            sequence: A sequence of action names, as unicode strings.
+        Returns: None
+        """
         with self.step_through(doc) as stepwise:
             for transition in sequence:
                 stepwise.transition(transition)

@@ -233,7 +298,6 @@ cdef class Parser:
         self.moves.add_action(action, label)


 cdef class StepwiseState:
     cdef readonly StateClass stcls
     cdef readonly Example eg
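A hedged sketch of a single training step against the update() API documented above (spaCy 1.x; the gold heads and labels are illustrative):

    from spacy.en import English
    from spacy.gold import GoldParse

    nlp = English()
    doc = nlp.make_doc(u'Eat blue ham')
    # Head offsets index into the sentence; deps name the arcs.
    gold = GoldParse(doc, heads=[0, 2, 0], deps=[u'ROOT', u'amod', u'dobj'])
    nlp.tagger(doc)  # parser features assume tags are set
    loss = nlp.parser.update(doc, gold)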
@@ -102,9 +102,21 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:


 cdef class Tagger:
-    """A part-of-speech tagger for English"""
+    """Annotate part-of-speech tags on Doc objects."""
     @classmethod
     def load(cls, path, vocab, require=False):
+        """Load the statistical model from the supplied path.
+
+        Arguments:
+            path (Path):
+                The path to load from.
+            vocab (Vocab):
+                The vocabulary. Must be shared by the documents to be processed.
+            require (bool):
+                Whether to raise an error if the files are not found.
+        Returns (Tagger):
+            The newly created object.
+        """
         # TODO: Change this to expect config.json when we don't have to
         # support old data.
         path = path if not isinstance(path, basestring) else pathlib.Path(path)

@@ -126,6 +138,16 @@ cdef class Tagger:
         return self

     def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
+        """Create a Tagger.
+
+        Arguments:
+            vocab (Vocab):
+                The vocabulary object. Must be shared with documents to be processed.
+            model (thinc.linear.AveragedPerceptron):
+                The statistical model.
+        Returns (Tagger):
+            The newly constructed object.
+        """
         if model is None:
             model = TaggerModel(cfg.get('features', self.feature_templates))
         self.vocab = vocab

@@ -154,8 +176,10 @@ cdef class Tagger:
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.

-        Args:
-            tokens (Doc): The tokens to be tagged.
+        Arguments:
+            doc (Doc): The tokens to be tagged.
+        Returns:
+            None
         """
         if tokens.length == 0:
             return 0

@@ -178,11 +202,33 @@ cdef class Tagger:
         tokens._py_tokens = [None] * tokens.length

     def pipe(self, stream, batch_size=1000, n_threads=2):
+        """Tag a stream of documents.
+
+        Arguments:
+            stream: The sequence of documents to tag.
+            batch_size (int):
+                The number of documents to accumulate into a working set.
+            n_threads (int):
+                The number of threads with which to work on the buffer in parallel,
+                if the Tagger implementation supports multi-threading.
+        Yields:
+            Doc: Documents, in order.
+        """
         for doc in stream:
             self(doc)
             yield doc

     def update(self, Doc tokens, GoldParse gold):
+        """Update the statistical model, with tags supplied for the given document.
+
+        Arguments:
+            doc (Doc):
+                The document to update on.
+            gold (GoldParse):
+                Manager for the gold-standard tags.
+        Returns (int):
+            Number of tags correct.
+        """
         gold_tag_strs = gold.tags
         assert len(tokens) == len(gold_tag_strs)
         for tag in gold_tag_strs:
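Similarly, a sketch of a tagger update with gold tags, continuing the previous example (tag values are illustrative):

    doc = nlp.make_doc(u'Eat blue ham')
    gold = GoldParse(doc, tags=[u'VB', u'JJ', u'NN'])
    # Returns the number of correctly predicted tags for this example.
    n_correct = nlp.tagger.update(doc, gold)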
@@ -219,6 +219,16 @@ cdef class Doc:
         return self.__str__()

     def similarity(self, other):
+        '''Make a semantic similarity estimate. The default estimate is cosine
+        similarity using an average of word vectors.
+
+        Arguments:
+            other (object): The object to compare with. By default, accepts Doc,
+                Span, Token and Lexeme objects.
+
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if 'similarity' in self.user_hooks:
             return self.user_hooks['similarity'](self, other)
         if self.vector_norm == 0 or other.vector_norm == 0:

@@ -226,6 +236,9 @@ cdef class Doc:
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

     property has_vector:
+        '''
+        A boolean value indicating whether a word vector is associated with the object.
+        '''
         def __get__(self):
             if 'has_vector' in self.user_hooks:
                 return self.user_hooks['has_vector'](self)

@@ -233,6 +246,11 @@ cdef class Doc:
             return any(token.has_vector for token in self)

     property vector:
+        '''
+        A real-valued meaning representation. Defaults to an average of the token vectors.
+
+        Type: numpy.ndarray[ndim=1, dtype='float32']
+        '''
         def __get__(self):
             if 'vector' in self.user_hooks:
                 return self.user_hooks['vector'](self)

@@ -266,13 +284,15 @@ cdef class Doc:
     def string(self):
         return self.text

-    @property
-    def text_with_ws(self):
-        return self.text
+    property text:
+        '''A unicode representation of the document text.'''
+        def __get__(self):
+            return u''.join(t.text_with_ws for t in self)

-    @property
-    def text(self):
-        return u''.join(t.text_with_ws for t in self)
+    property text_with_ws:
+        '''An alias of Doc.text, provided for duck-type compatibility with Span and Token.'''
+        def __get__(self):
+            return self.text

     property ents:
         '''

@@ -567,7 +587,6 @@ cdef class Doc:
         set_children_from_heads(self.c, self.length)
         self.is_parsed = bool(HEAD in attrs or DEP in attrs)
         self.is_tagged = bool(TAG in attrs or POS in attrs)
-
         return self

     def to_bytes(self):

@@ -612,7 +631,22 @@ cdef class Doc:
         yield n_bytes_str + data

     def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Merge a multi-word expression into a single token."""
+        """Retokenize the document, such that the span at doc.text[start_idx : end_idx]
+        is merged into a single token. If start_idx and end_idx do not mark start
+        and end token boundaries, the document remains unchanged.
+
+        Arguments:
+            start_idx (int): The character index of the start of the slice to merge.
+            end_idx (int): The character index after the end of the slice to merge.
+            **attributes:
+                Attributes to assign to the merged token. By default, attributes
+                are inherited from the syntactic root token of the span.
+        Returns:
+            token (Token):
+                The newly merged token, or None if the start and end indices did
+                not fall at token boundaries.
+        """
         cdef unicode tag, lemma, ent_type
         if len(args) == 3:
             # TODO: Warn deprecation
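A sketch of the character-offset merge API documented above (the attribute keywords are illustrative):

    doc = nlp(u'I flew to Los Angeles.')
    start = doc.text.index(u'Los')
    end = start + len(u'Los Angeles')
    # Merge the slice "Los Angeles" into a single token, overriding some attributes.
    token = doc.merge(start, end, tag=u'NNP', lemma=u'Los Angeles', ent_type=u'GPE')
    print([t.text for t in doc])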
@@ -18,12 +18,23 @@ from ..lexeme cimport Lexeme

 cdef class Span:
     """A slice from a Doc object."""
-    def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None,
+    def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
                   vector_norm=None):
+        '''Create a Span object from the slice doc[start : end].
+
+        Arguments:
+            doc (Doc): The parent document.
+            start (int): The index of the first token of the span.
+            end (int): The index of the first token after the span.
+            label (int): A label to attach to the Span, e.g. for named entities.
+            vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        Returns:
+            Span: The newly constructed object.
+        '''
-        if not (0 <= start <= end <= len(tokens)):
+        if not (0 <= start <= end <= len(doc)):
             raise IndexError

-        self.doc = tokens
+        self.doc = doc
         self.start = start
         self.start_char = self.doc[start].idx if start < self.doc.length else 0
         self.end = end

@@ -78,9 +89,29 @@ cdef class Span:
         yield self.doc[i]

     def merge(self, *args, **attributes):
+        """Retokenize the document, such that the span is merged into a single token.
+
+        Arguments:
+            **attributes:
+                Attributes to assign to the merged token. By default, attributes
+                are inherited from the syntactic root token of the span.
+        Returns:
+            token (Token):
+                The newly merged token.
+        """
         self.doc.merge(self.start_char, self.end_char, *args, **attributes)

     def similarity(self, other):
+        '''Make a semantic similarity estimate. The default estimate is cosine
+        similarity using an average of word vectors.
+
+        Arguments:
+            other (object): The object to compare with. By default, accepts Doc,
+                Span, Token and Lexeme objects.
+
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if 'similarity' in self.doc.user_span_hooks:
             return self.doc.user_span_hooks['similarity'](self, other)
         if self.vector_norm == 0.0 or other.vector_norm == 0.0:

@@ -102,7 +133,11 @@ cdef class Span:
         self.end = end + 1

     property sent:
-        '''Get the sentence span that this span is a part of.'''
+        '''The sentence span that this span is a part of.
+
+        Returns:
+            Span: The sentence this is part of.
+        '''
         def __get__(self):
             if 'sent' in self.doc.user_span_hooks:
                 return self.doc.user_span_hooks['sent'](self)

@@ -156,7 +191,12 @@ cdef class Span:
         return u''.join([t.text_with_ws for t in self])

     property root:
-        """The word of the span that is highest in the parse tree, i.e. has the
+        """The token within the span that's highest in the parse tree. If there's
+        a tie, the earliest is preferred.
+
+        Returns:
+            Token: The root token.
+
+        i.e. has the
         shortest path to the root of the sentence (or is the root itself).

         If multiple words are equally high in the tree, the first word is taken.

@@ -231,7 +271,10 @@ cdef class Span:
             return self.doc[root]

     property lefts:
-        """Tokens that are to the left of the Span, whose head is within the Span."""
+        """Tokens that are to the left of the span, whose head is within the span.
+
+        Yields: Token A left-child of a token of the span.
+        """
         def __get__(self):
             for token in reversed(self):  # Reverse, so we get the tokens in order
                 for left in token.lefts:

@@ -239,7 +282,10 @@ cdef class Span:
                     yield left

     property rights:
-        """Tokens that are to the right of the Span, whose head is within the Span."""
+        """Tokens that are to the right of the span, whose head is within the span.
+
+        Yields: Token A right-child of a token of the span.
+        """
         def __get__(self):
             for token in self:
                 for right in token.rights:

@@ -247,6 +293,10 @@ cdef class Span:
                     yield right

     property subtree:
+        """Tokens that descend from tokens in the span, but fall outside it.
+
+        Yields: Token A descendant of a token within the span.
+        """
         def __get__(self):
             for word in self.lefts:
                 yield from word.subtree
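A sketch of span navigation and merging per the properties documented above:

    doc = nlp(u'The quick brown fox jumped over the lazy dog.')
    span = doc[0:4]                       # "The quick brown fox"
    print(span.root.text)                 # highest token in the parse tree
    print([t.text for t in span.lefts])   # left children attaching into the span
    span.merge()                          # collapse the span into one token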
@@ -30,8 +30,7 @@ from ..lexeme cimport Lexeme


 cdef class Token:
-    """An individual token --- i.e. a word, a punctuation symbol, etc. Created
-    via Doc.__getitem__ and Doc.__iter__.
+    """An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
     """
     def __cinit__(self, Vocab vocab, Doc doc, int offset):
         self.vocab = vocab

@@ -40,6 +39,7 @@ cdef class Token:
         self.i = offset

     def __len__(self):
+        '''Number of unicode characters in token.text.'''
         return self.c.lex.length

     def __unicode__(self):

@@ -57,12 +57,35 @@ cdef class Token:
         return self.__str__()

     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
+        '''Check the value of a boolean flag.
+
+        Arguments:
+            flag_id (int): The ID of the flag attribute.
+        Returns:
+            is_set (bool): Whether the flag is set.
+        '''
         return Lexeme.c_check_flag(self.c.lex, flag_id)

     def nbor(self, int i=1):
+        '''Get a neighboring token.
+
+        Arguments:
+            i (int): The relative position of the token to get. Defaults to 1.
+        Returns:
+            neighbor (Token): The token at position self.doc[self.i+i]
+        '''
         return self.doc[self.i+i]

     def similarity(self, other):
+        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
+        Arguments:
+            other:
+                The object to compare with. By default, accepts Doc, Span,
+                Token and Lexeme objects.
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if 'similarity' in self.doc.user_token_hooks:
             return self.doc.user_token_hooks['similarity'](self, other)
         if self.vector_norm == 0 or other.vector_norm == 0:

@@ -158,6 +181,9 @@ cdef class Token:
         self.c.dep = label

     property has_vector:
+        '''
+        A boolean value indicating whether a word vector is associated with the object.
+        '''
         def __get__(self):
             if 'has_vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['has_vector'](self)

@@ -169,6 +195,11 @@ cdef class Token:
             return False

     property vector:
+        '''
+        A real-valued meaning representation.
+
+        Type: numpy.ndarray[ndim=1, dtype='float32']
+        '''
         def __get__(self):
             if 'vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['vector'](self)

@@ -241,11 +272,19 @@ cdef class Token:
             yield t

     property children:
+        '''A sequence of the token's immediate syntactic children.
+
+        Yields: Token A child token such that child.head==self
+        '''
         def __get__(self):
             yield from self.lefts
             yield from self.rights

     property subtree:
+        '''A sequence of all the token's syntactic descendants.
+
+        Yields: Token A descendant token such that self.is_ancestor(descendant)
+        '''
         def __get__(self):
             for word in self.lefts:
                 yield from word.subtree

@@ -254,14 +293,26 @@ cdef class Token:
             yield from word.subtree

     property left_edge:
+        '''The leftmost token of this token's syntactic descendants.
+
+        Returns: Token The first token such that self.is_ancestor(token)
+        '''
         def __get__(self):
             return self.doc[self.c.l_edge]

     property right_edge:
+        '''The rightmost token of this token's syntactic descendants.
+
+        Returns: Token The last token such that self.is_ancestor(token)
+        '''
         def __get__(self):
             return self.doc[self.c.r_edge]

     property ancestors:
+        '''A sequence of this token's syntactic ancestors.
+
+        Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
+        '''
         def __get__(self):
             cdef const TokenC* head_ptr = self.c
             # guard against infinite loop, no token can have

@@ -273,9 +324,27 @@ cdef class Token:
             i += 1

     def is_ancestor_of(self, descendant):
         # TODO: Remove after backward compatibility check.
         return self.is_ancestor(descendant)

     def is_ancestor(self, descendant):
+        '''Check whether this token is a parent, grandparent, etc. of another
+        in the dependency tree.
+
+        Arguments:
+            descendant (Token): Another token.
+        Returns:
+            is_ancestor (bool): Whether this token is the ancestor of the descendant.
+        '''
         if self.doc is not descendant.doc:
             return False
         return any(ancestor.i == self.i for ancestor in descendant.ancestors)

     property head:
+        '''The syntactic parent, or "governor", of this token.
+
+        Returns: Token
+        '''
         def __get__(self):
-            """The token predicted by the parser to be the head of the current token."""
             return self.doc[self.i + self.c.head]

@@ -370,6 +439,10 @@ cdef class Token:
         self.c.head = rel_newhead_i

     property conjuncts:
+        '''A sequence of coordinated tokens, including the token itself.
+
+        Yields: Token A coordinated token
+        '''
         def __get__(self):
             """Get a list of conjoined words."""
             cdef Token word
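A sketch of the token navigation helpers documented above:

    doc = nlp(u'Give it back, he pleaded.')
    give = doc[0]
    print(give.nbor(1).text)                       # 'it'
    print([child.text for child in give.children]) # immediate syntactic children
    print(give.is_ancestor(doc[1]))                # True if 'it' attaches under 'Give'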
spacy/vocab.pyx (115 changes)
@@ -52,6 +52,25 @@ cdef class Vocab:
     @classmethod
     def load(cls, path, lex_attr_getters=None, lemmatizer=True,
              tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
+        """
+        Load the vocabulary from a path.
+
+        Arguments:
+            path (Path):
+                The path to load from.
+            lex_attr_getters (dict):
+                A dictionary mapping attribute IDs to functions to compute them.
+                Defaults to None.
+            lemmatizer (object):
+                A lemmatizer. Defaults to None.
+            tag_map (dict):
+                A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
+                and optionally morphological attributes.
+            oov_prob (float):
+                The default probability for out-of-vocabulary words.
+        Returns:
+            Vocab: The newly constructed vocab object.
+        """
         util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
         if 'vectors' in deprecated_kwargs:
             raise AttributeError(

@@ -82,6 +101,22 @@ cdef class Vocab:

     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
                  serializer_freqs=None, **deprecated_kwargs):
+        '''Create the vocabulary.
+
+        lex_attr_getters (dict):
+            A dictionary mapping attribute IDs to functions to compute them.
+            Defaults to None.
+        lemmatizer (object):
+            A lemmatizer. Defaults to None.
+        tag_map (dict):
+            A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
+            and optionally morphological attributes.
+        oov_prob (float):
+            The default probability for out-of-vocabulary words.
+
+        Returns:
+            Vocab: The newly constructed vocab object.
+        '''
         util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)

         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}

@@ -134,6 +169,9 @@ cdef class Vocab:
         '''
         Set vectors_length to a new size, and allocate more memory for the Lexeme
         vectors if necessary. The memory will be zeroed.
+
+        Arguments:
+            new_size (int): The new size of the vectors.
         '''
         cdef hash_t key
         cdef size_t addr

@@ -145,11 +183,14 @@ cdef class Vocab:
         self.vectors_length = new_size

     def add_flag(self, flag_getter, int flag_id=-1):
-        '''Set a new boolean flag to words in the vocabulary. The flag_setter
-        function will be called over the words currently in the vocab, and then
-        applied to new words as they occur. You'll then be able to access the
-        flag value on each token, using token.check_flag(flag_id). See also:
-        Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.
+        '''Set a new boolean flag to words in the vocabulary.
+
+        The flag_getter function will be called over the words currently in the
+        vocab, and then applied to new words as they occur. You'll then be able
+        to access the flag value on each token, using token.check_flag(flag_id).
+
+        See also:
+            Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.

         Arguments:
             flag_getter:

@@ -246,11 +287,23 @@ cdef class Vocab:
         self.length += 1

     def __contains__(self, unicode string):
+        '''Check whether the string has an entry in the vocabulary.
+
+        Arguments:
+            string (unicode): The ID string.
+
+        Returns:
+            bool: Whether the string has an entry in the vocabulary.
+        '''
         key = hash_string(string)
         lex = self._by_hash.get(key)
         return True if lex is not NULL else False

     def __iter__(self):
+        '''Iterate over the lexemes in the vocabulary.
+
+        Yields: Lexeme An entry in the vocabulary.
+        '''
         cdef attr_t orth
         cdef size_t addr
         for orth, addr in self._by_orth.items():

@@ -260,16 +313,15 @@ cdef class Vocab:
         '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
         unseen unicode string is given, a new lexeme is created and stored.

-        Args:
+        Arguments:
             id_or_string (int or unicode):
-                The integer ID of a word, or its unicode string. If an int >= Lexicon.size,
-                IndexError is raised. If id_or_string is neither an int nor a unicode string,
-                ValueError is raised.
+                The integer ID of a word, or its unicode string.
+
+                If an int >= Lexicon.size, IndexError is raised. If id_or_string
+                is neither an int nor a unicode string, ValueError is raised.

         Returns:
-            lexeme (Lexeme):
-                An instance of the Lexeme Python class, with data copied on
-                instantiation.
+            lexeme (Lexeme): The lexeme indicated by the given ID.
         '''
         cdef attr_t orth
         if type(id_or_string) == unicode:

@@ -295,6 +347,11 @@ cdef class Vocab:
         return tokens

     def dump(self, loc):
+        """Save the lexemes' binary data to the given location.
+
+        Arguments:
+            loc (Path): The path to save to.
+        """
         if hasattr(loc, 'as_posix'):
             loc = loc.as_posix()
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc

@@ -323,6 +380,14 @@ cdef class Vocab:
         fp.close()

     def load_lexemes(self, loc):
+        '''Load the binary vocabulary data from the given location.
+
+        Arguments:
+            loc (Path): The path to load from.
+
+        Returns:
+            None
+        '''
         fp = CFile(loc, 'rb',
             on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
         cdef LexemeC* lexeme

@@ -363,6 +428,13 @@ cdef class Vocab:
         fp.close()

     def dump_vectors(self, out_loc):
+        '''Save the word vectors to a binary file.
+
+        Arguments:
+            out_loc (Path): The path to save to.
+        Returns:
+            None
+        '''
         cdef int32_t vec_len = self.vectors_length
         cdef int32_t word_len
         cdef bytes word_str

@@ -384,6 +456,17 @@ cdef class Vocab:
         out_file.close()

     def load_vectors(self, file_):
+        """Load vectors from a text-based file.
+
+        Arguments:
+            file_ (buffer): The file to read from. Entries should be separated by newlines,
+                and each entry should be whitespace delimited. The first value of the entry
+                should be the word string, and subsequent entries should be the values of the
+                vector.
+
+        Returns:
+            vec_len (int): The length of the vectors loaded.
+        """
         cdef LexemeC* lexeme
         cdef attr_t orth
         cdef int32_t vec_len = -1

@@ -409,6 +492,14 @@ cdef class Vocab:
         return vec_len

     def load_vectors_from_bin_loc(self, loc):
+        """Load vectors from the location of a binary file.
+
+        Arguments:
+            loc (unicode): The path of the binary file to load from.
+
+        Returns:
+            vec_len (int): The length of the vectors loaded.
+        """
         cdef CFile file_ = CFile(loc, b'rb')
         cdef int32_t word_len
         cdef int32_t vec_len = 0
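A sketch of the add_flag workflow documented above (the MONTHS set and the flag's semantics are illustrative):

    from spacy.en import English

    nlp = English()
    MONTHS = set(u'january february march april may june july'.split())
    # add_flag returns the flag ID it assigned; the getter runs on each word string.
    IS_MONTH = nlp.vocab.add_flag(lambda text: text.lower() in MONTHS)
    doc = nlp(u'I was born in June.')
    print(doc[4].check_flag(IS_MONTH))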