diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 985887630..635cc431a 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -212,6 +212,7 @@ def _consume_ent(tags):
 
 
 cdef class GoldParse:
+    """Collection for training annotations."""
     @classmethod
     def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
         _, words, tags, heads, deps, entities = annot_tuples
@@ -220,6 +221,25 @@ cdef class GoldParse:
 
     def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                  deps=None, entities=None, make_projective=False):
+        """Create a GoldParse.
+
+        Arguments:
+            doc (Doc):
+                The document the annotations refer to.
+            words:
+                A sequence of unicode word strings.
+            tags:
+                A sequence of strings, representing tag annotations.
+            heads:
+                A sequence of integers, representing syntactic head offsets.
+            deps:
+                A sequence of strings, representing the syntactic relation types.
+            entities:
+                A sequence of named entity annotations, either as BILUO tag strings,
+                or as (start_char, end_char, label) tuples, representing the entity
+                positions.
+        Returns (GoldParse): The newly constructed object.
+        """
         if words is None:
             words = [token.text for token in doc]
         if tags is None:
@@ -280,10 +300,16 @@ cdef class GoldParse:
             self.heads = proj_heads
 
     def __len__(self):
+        """Get the number of gold-standard tokens.
+
+        Returns (int): The number of gold-standard tokens.
+        """
         return self.length
 
     @property
     def is_projective(self):
+        """Whether the provided syntactic annotations form a projective dependency
+        tree."""
         return not nonproj.is_nonproj_tree(self.heads)
 
 
diff --git a/spacy/language.py b/spacy/language.py
index e62431bf1..60f569a64 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -293,13 +293,14 @@ class Language(object):
             text (unicode): The text to be processed.
 
         Returns:
-            tokens (spacy.tokens.Doc):
+            doc (Doc): A container for accessing the annotations.
 
-        >>> from spacy.en import English
-        >>> nlp = English()
-        >>> tokens = nlp('An example sentence. Another example sentence.')
-        >>> tokens[0].orth_, tokens[0].head.tag_
-        ('An', 'NN')
+        Example:
+            >>> from spacy.en import English
+            >>> nlp = English()
+            >>> tokens = nlp('An example sentence. Another example sentence.')
+            >>> tokens[0].orth_, tokens[0].head.tag_
+            ('An', 'NN')
         """
         doc = self.make_doc(text)
         if self.entity and entity:
@@ -314,6 +315,16 @@ class Language(object):
         return doc
 
     def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
+        '''Process texts as a stream, and yield Doc objects in order.
+
+        Supports GIL-free multi-threading.
+
+        Arguments:
+            texts (iterator): A sequence of unicode texts to process.
+            tag (bool): Whether to apply the part-of-speech tagger.
+            parse (bool): Whether to apply the syntactic dependency parser.
+            entity (bool): Whether to apply the named entity recognizer.
+        '''
         skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
         stream = (self.make_doc(text) for text in texts)
         for proc in self.pipeline:
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 638cd6365..1d5421d74 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -36,6 +36,13 @@ cdef class Lexeme:
     tag).
     """
     def __init__(self, Vocab vocab, int orth):
+        """Create a Lexeme object.
+
+        Arguments:
+            vocab (Vocab): The parent vocabulary.
+            orth (int): The orth id of the lexeme.
+        Returns (Lexeme): The newly constructed object.
+        """
         self.vocab = vocab
         self.orth = orth
         self.c = vocab.get_by_orth(vocab.mem, orth)
@@ -73,12 +80,33 @@ cdef class Lexeme:
         return self.c.orth
 
     def set_flag(self, attr_id_t flag_id, bint value):
+        """Change the value of a boolean flag.
+
+        Arguments:
+            flag_id (int): The attribute ID of the flag to set.
+            value (bool): The new value of the flag.
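+
+        Example (illustrative; assumes an `nlp` pipeline is loaded, and uses
+        Vocab.add_flag to allocate the flag ID):
+            >>> MY_FLAG = nlp.vocab.add_flag(lambda text: False)
+            >>> apple = nlp.vocab[u'apple']
+            >>> apple.set_flag(MY_FLAG, True)
+            >>> apple.check_flag(MY_FLAG)
+            True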
+        """
         Lexeme.c_set_flag(self.c, flag_id, value)
 
     def check_flag(self, attr_id_t flag_id):
+        """Check the value of a boolean flag.
+
+        Arguments:
+            flag_id (int): The attribute ID of the flag to query.
+        Returns (bool): The value of the flag.
+        """
         return True if Lexeme.c_check_flag(self.c, flag_id) else False
 
     def similarity(self, other):
+        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
+        Arguments:
+            other:
+                The object to compare with. By default, accepts Doc, Span,
+                Token and Lexeme objects.
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 7279b003c..2cf8f03b8 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -165,6 +165,7 @@ def _convert_strings(token_specs, string_store):
 
 
 cdef class Matcher:
+    '''Match sequences of tokens, based on pattern rules.'''
     cdef Pool mem
     cdef vector[TokenPatternC*] patterns
     cdef readonly Vocab vocab
@@ -175,6 +176,16 @@ cdef class Matcher:
 
     @classmethod
     def load(cls, path, vocab):
+        '''Load the matcher and patterns from a file path.
+
+        Arguments:
+            path (Path):
+                Path to a JSON-formatted patterns file.
+            vocab (Vocab):
+                The vocabulary that the documents to match over will refer to.
+        Returns:
+            Matcher: The newly constructed object.
+        '''
         if (path / 'gazetteer.json').exists():
             with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
                 patterns = json.load(file_)
@@ -183,6 +194,16 @@ cdef class Matcher:
         return cls(vocab, patterns)
 
     def __init__(self, vocab, patterns={}):
+        """Create the Matcher.
+
+        Arguments:
+            vocab (Vocab):
+                The vocabulary object, which must be shared with the documents
+                the matcher will operate on.
+            patterns (dict): Patterns to add to the matcher.
+        Returns:
+            The newly constructed object.
+        """
         self._patterns = {}
         self._entities = {}
         self._acceptors = {}
@@ -203,6 +224,22 @@ cdef class Matcher:
 
     def add_entity(self, entity_key, attrs=None, if_exists='raise',
                    acceptor=None, on_match=None):
+        """Add an entity to the matcher.
+
+        Arguments:
+            entity_key (unicode or int):
+                An ID for the entity.
+            attrs:
+                Attributes to associate with the Matcher.
+            if_exists ('raise', 'ignore' or 'update'):
+                Controls what happens if the entity ID already exists. Defaults to 'raise'.
+            acceptor:
+                Callback function to filter matches of the entity.
+            on_match:
+                Callback function to act on matches of the entity.
+        Returns:
+            None
+        """
         if if_exists not in ('raise', 'ignore', 'update'):
             raise ValueError(
                 "Unexpected value for if_exists: %s.\n"
@@ -224,6 +261,18 @@ cdef class Matcher:
         self._callbacks[entity_key] = on_match
 
     def add_pattern(self, entity_key, token_specs, label=""):
+        """Add a pattern to the matcher.
+
+        Arguments:
+            entity_key (unicode or int):
+                An ID for the entity.
+            token_specs:
+                Description of the pattern to be matched.
+            label:
+                Label to assign to the matched pattern. Defaults to "".
+        Returns:
+            None
+        """
         entity_key = self.normalize_entity_key(entity_key)
         if not self.has_entity(entity_key):
             self.add_entity(entity_key)
@@ -249,10 +298,24 @@ cdef class Matcher:
         return entity_key
 
     def has_entity(self, entity_key):
+        """Check whether the matcher has an entity.
+
+        Arguments:
+            entity_key (string or int): The entity key to check.
+        Returns:
+            bool: Whether the matcher has the entity.
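+
+        Example (illustrative; assumes an `nlp` pipeline is loaded to provide the vocab):
+            >>> matcher = Matcher(nlp.vocab)
+            >>> matcher.add_entity(u'GoogleNow')
+            >>> matcher.has_entity(u'GoogleNow')
+            True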
+        """
         entity_key = self.normalize_entity_key(entity_key)
         return entity_key in self._entities
 
     def get_entity(self, entity_key):
+        """Retrieve the attributes stored for an entity.
+
+        Arguments:
+            entity_key (unicode or int): The entity to retrieve.
+        Returns:
+            The entity attributes if present, otherwise None.
+        """
         entity_key = self.normalize_entity_key(entity_key)
         if entity_key in self._entities:
             return self._entities[entity_key]
@@ -260,6 +323,17 @@ cdef class Matcher:
             return None
 
     def __call__(self, Doc doc, acceptor=None):
+        """Find all token sequences matching the supplied patterns on the Doc.
+
+        Arguments:
+            doc (Doc):
+                The document to match over.
+        Returns:
+            list
+                A list of (entity_key, label_id, start, end) tuples,
+                describing the matches. A match tuple describes a span doc[start:end].
+                The label_id and entity_key are both integers.
+        """
         if acceptor is not None:
             raise ValueError(
                 "acceptor keyword argument to Matcher deprecated. Specify acceptor "
@@ -340,6 +414,18 @@ cdef class Matcher:
         return matches
 
     def pipe(self, docs, batch_size=1000, n_threads=2):
+        """Match a stream of documents, yielding them in turn.
+
+        Arguments:
+            docs: A stream of documents.
+            batch_size (int):
+                The number of documents to accumulate into a working set.
+            n_threads (int):
+                The number of threads with which to work on the buffer in parallel,
+                if the Matcher implementation supports multi-threading.
+        Yields:
+            Doc: Documents, in order.
+        """
         for doc in docs:
             self(doc)
             yield doc
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 02b6ecbee..f3e51ade7 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -11,6 +11,7 @@ from .attrs import DEP, ENT_TYPE
 
 
 cdef class EntityRecognizer(Parser):
+    """Annotate named entities on Doc objects."""
     TransitionSystem = BiluoPushDown
 
     feature_templates = get_feature_templates('ner')
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 170fb2796..9ca6fd74c 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -73,6 +73,11 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except
 cdef class StringStore:
     '''Map strings to and from integer IDs.'''
     def __init__(self, strings=None, freeze=False):
+        '''Create the StringStore.
+
+        Arguments:
+            strings: A sequence of unicode strings to add to the store.
+        '''
         self.mem = Pool()
         self._map = PreshMap()
         self._oov = PreshMap()
@@ -89,9 +94,22 @@ cdef class StringStore:
         return self.size -1
 
     def __len__(self):
+        """The number of strings in the store.
+
+        Returns:
+            int: The number of strings in the store.
+        """
         return self.size-1
 
     def __getitem__(self, object string_or_id):
+        """Retrieve a string from a given integer ID, or vice versa.
+
+        Arguments:
+            string_or_id (bytes or unicode or int):
+                The value to encode.
+        Returns:
+            unicode or int: The retrieved value.
+        """
         if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
             return 0
         elif string_or_id == 0:
@@ -127,12 +145,23 @@ cdef class StringStore:
         return utf8str - self.c
 
     def __contains__(self, unicode string not None):
+        """Check whether a string is in the store.
+
+        Arguments:
+            string (unicode): The string to check.
+        Returns:
+            bool: Whether the store contains the string.
+        """
         if len(string) == 0:
             return True
         cdef hash_t key = hash_string(string)
         return self._map.get(key) is not NULL
 
     def __iter__(self):
+        """Iterate over the strings in the store, in order.
+
+        Yields: unicode A string in the store.
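+
+        Example (illustrative; a small standalone store):
+            >>> from spacy.strings import StringStore
+            >>> store = StringStore([u'apple', u'orange'])
+            >>> u'apple' in list(store)
+            True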
+        """
         cdef int i
         for i in range(self.size):
             yield _decode(&self.c[i]) if i > 0 else u''
@@ -185,6 +214,13 @@ cdef class StringStore:
         return &self.c[self.size-1]
 
     def dump(self, file_):
+        """Save the strings to a JSON file.
+
+        Arguments:
+            file_ (buffer): The file to save the strings to.
+        Returns:
+            None
+        """
         string_data = json.dumps(list(self))
         if not isinstance(string_data, unicode):
             string_data = string_data.decode('utf8')
@@ -192,6 +228,13 @@ cdef class StringStore:
         file_.write(string_data)
 
     def load(self, file_):
+        """Load the strings from a JSON file.
+
+        Arguments:
+            file_ (buffer): The file from which to load the strings.
+        Returns:
+            None
+        """
         strings = json.load(file_)
         if strings == ['']:
             return None
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index d7fce5b3d..918db1790 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -74,8 +74,21 @@ cdef class ParserModel(AveragedPerceptron):
 
 
 cdef class Parser:
+    """Base class of the DependencyParser and EntityRecognizer."""
     @classmethod
     def load(cls, path, Vocab vocab, TransitionSystem=None, require=False):
+        """Load the statistical model from the supplied path.
+
+        Arguments:
+            path (Path):
+                The path to load from.
+            vocab (Vocab):
+                The vocabulary. Must be shared by the documents to be processed.
+            require (bool):
+                Whether to raise an error if the files are not found.
+        Returns (Parser):
+            The newly constructed object.
+        """
         with (path / 'config.json').open() as file_:
             cfg = json.load(file_)
         # TODO: remove this shim when we don't have to support older data
@@ -90,6 +103,16 @@ cdef class Parser:
         return self
 
     def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
+        """Create a Parser.
+
+        Arguments:
+            vocab (Vocab):
+                The vocabulary object. Must be shared with documents to be processed.
+            model (thinc.linear.AveragedPerceptron):
+                The statistical model.
+        Returns (Parser):
+            The newly constructed object.
+        """
         if TransitionSystem is None:
             TransitionSystem = self.TransitionSystem
         self.vocab = vocab
@@ -107,6 +130,13 @@ cdef class Parser:
         return (Parser, (self.vocab, self.moves, self.model), None, None)
 
     def __call__(self, Doc tokens):
+        """Apply the parser or entity recognizer, setting the annotations onto the Doc object.
+
+        Arguments:
+            doc (Doc): The document to be processed.
+        Returns:
+            None
+        """
         cdef int nr_feat = self.model.nr_feat
         with nogil:
             status = self.parseC(tokens.c, tokens.length, nr_feat)
@@ -117,6 +147,16 @@ cdef class Parser:
         self.moves.finalize_doc(tokens)
 
     def pipe(self, stream, int batch_size=1000, int n_threads=2):
+        """Process a stream of documents.
+
+        Arguments:
+            stream: The sequence of documents to process.
+            batch_size (int):
+                The number of documents to accumulate into a working set.
+            n_threads (int):
+                The number of threads with which to work on the buffer in parallel.
+        Yields (Doc): Documents, in order.
+        """
         cdef Pool mem = Pool()
         cdef TokenC** doc_ptr = <TokenC**>mem.alloc(batch_size, sizeof(TokenC*))
         cdef int* lengths = <int*>mem.alloc(batch_size, sizeof(int))
@@ -194,6 +234,16 @@ cdef class Parser:
         return 0
 
     def update(self, Doc tokens, GoldParse gold):
+        """Update the statistical model.
+
+        Arguments:
+            doc (Doc):
+                The example document for the update.
+            gold (GoldParse):
+                The gold-standard annotations, to calculate the loss.
+        Returns (float):
+            The loss on this example.
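+
+        Example (illustrative sketch; `train_data` is a hypothetical sequence of
+        (Doc, GoldParse) pairs):
+            >>> for doc, gold in train_data:
+            ...     loss = parser.update(doc, gold)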
+        """
         self.moves.preprocess_gold(gold)
         cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
         self.moves.initialize_state(stcls.c)
@@ -220,9 +270,24 @@ cdef class Parser:
         return loss
 
     def step_through(self, Doc doc):
+        """Set up a stepwise state, to introspect and control the transition sequence.
+
+        Arguments:
+            doc (Doc): The document to step through.
+        Returns (StepwiseState):
+            A state object, to step through the annotation process.
+        """
         return StepwiseState(self, doc)
 
     def from_transition_sequence(self, Doc doc, sequence):
+        """Control the annotations on a document by specifying a transition sequence
+        to follow.
+
+        Arguments:
+            doc (Doc): The document to annotate.
+            sequence: A sequence of action names, as unicode strings.
+        Returns: None
+        """
         with self.step_through(doc) as stepwise:
             for transition in sequence:
                 stepwise.transition(transition)
@@ -233,7 +298,6 @@ cdef class Parser:
             self.moves.add_action(action, label)
 
 
-
 cdef class StepwiseState:
     cdef readonly StateClass stcls
     cdef readonly Example eg
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index c98b06d3a..a387ccb12 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -102,9 +102,21 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
 
 
 cdef class Tagger:
-    """A part-of-speech tagger for English"""
+    """Annotate part-of-speech tags on Doc objects."""
     @classmethod
     def load(cls, path, vocab, require=False):
+        """Load the statistical model from the supplied path.
+
+        Arguments:
+            path (Path):
+                The path to load from.
+            vocab (Vocab):
+                The vocabulary. Must be shared by the documents to be processed.
+            require (bool):
+                Whether to raise an error if the files are not found.
+        Returns (Tagger):
+            The newly created object.
+        """
         # TODO: Change this to expect config.json when we don't have to
         # support old data.
         path = path if not isinstance(path, basestring) else pathlib.Path(path)
@@ -126,6 +138,16 @@ cdef class Tagger:
         return self
 
     def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
+        """Create a Tagger.
+
+        Arguments:
+            vocab (Vocab):
+                The vocabulary object. Must be shared with documents to be processed.
+            model (thinc.linear.AveragedPerceptron):
+                The statistical model.
+        Returns (Tagger):
+            The newly constructed object.
+        """
         if model is None:
             model = TaggerModel(cfg.get('features', self.feature_templates))
         self.vocab = vocab
@@ -154,8 +176,10 @@ cdef class Tagger:
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
 
-        Args:
-            tokens (Doc): The tokens to be tagged.
+        Arguments:
+            doc (Doc): The tokens to be tagged.
+        Returns:
+            None
         """
         if tokens.length == 0:
             return 0
@@ -178,11 +202,33 @@ cdef class Tagger:
             tokens._py_tokens = [None] * tokens.length
 
     def pipe(self, stream, batch_size=1000, n_threads=2):
+        """Tag a stream of documents.
+
+        Arguments:
+            stream: The sequence of documents to tag.
+            batch_size (int):
+                The number of documents to accumulate into a working set.
+            n_threads (int):
+                The number of threads with which to work on the buffer in parallel,
+                if the implementation supports multi-threading.
+        Yields:
+            Doc: Documents, in order.
+        """
         for doc in stream:
             self(doc)
             yield doc
 
     def update(self, Doc tokens, GoldParse gold):
+        """Update the statistical model, with tags supplied for the given document.
+
+        Arguments:
+            doc (Doc):
+                The document to update on.
+            gold (GoldParse):
+                Manager for the gold-standard tags.
+        Returns (int):
+            Number of tags correct.
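+
+        Example (illustrative sketch; `train_data` is a hypothetical sequence of
+        (Doc, GoldParse) pairs with gold tags set):
+            >>> for doc, gold in train_data:
+            ...     n_correct = tagger.update(doc, gold)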
+        """
         gold_tag_strs = gold.tags
         assert len(tokens) == len(gold_tag_strs)
         for tag in gold_tag_strs:
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 66759d271..5f668a424 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -219,6 +219,16 @@ cdef class Doc:
         return self.__str__()
 
     def similarity(self, other):
+        '''Make a semantic similarity estimate. The default estimate is cosine
+        similarity using an average of word vectors.
+
+        Arguments:
+            other (object): The object to compare with. By default, accepts Doc,
+                Span, Token and Lexeme objects.
+
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if 'similarity' in self.user_hooks:
             return self.user_hooks['similarity'](self, other)
         if self.vector_norm == 0 or other.vector_norm == 0:
@@ -226,6 +236,9 @@ cdef class Doc:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
     property has_vector:
+        '''
+        A boolean value indicating whether a word vector is associated with the object.
+        '''
         def __get__(self):
             if 'has_vector' in self.user_hooks:
                 return self.user_hooks['has_vector'](self)
@@ -233,6 +246,11 @@ cdef class Doc:
             return any(token.has_vector for token in self)
 
     property vector:
+        '''
+        A real-valued meaning representation. Defaults to an average of the token vectors.
+
+        Type: numpy.ndarray[ndim=1, dtype='float32']
+        '''
         def __get__(self):
             if 'vector' in self.user_hooks:
                 return self.user_hooks['vector'](self)
@@ -265,14 +283,16 @@ cdef class Doc:
     @property
     def string(self):
         return self.text
 
+
+    property text:
+        '''A unicode representation of the document text.'''
+        def __get__(self):
+            return u''.join(t.text_with_ws for t in self)
-    @property
-    def text_with_ws(self):
-        return self.text
-
-    @property
-    def text(self):
-        return u''.join(t.text_with_ws for t in self)
+    property text_with_ws:
+        '''An alias of Doc.text, provided for duck-type compatibility with Span and Token.'''
+        def __get__(self):
+            return self.text
 
     property ents:
         '''
@@ -567,7 +587,6 @@ cdef class Doc:
         set_children_from_heads(self.c, self.length)
         self.is_parsed = bool(HEAD in attrs or DEP in attrs)
         self.is_tagged = bool(TAG in attrs or POS in attrs)
-
         return self
 
     def to_bytes(self):
@@ -612,7 +631,22 @@ cdef class Doc:
             yield n_bytes_str + data
 
     def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Merge a multi-word expression into a single token."""
+        """Retokenize the document, such that the span at doc.text[start_idx : end_idx]
+        is merged into a single token. If start_idx and end_idx do not mark start
+        and end token boundaries, the document remains unchanged.
+
+        Arguments:
+            start_idx (int): The character index of the start of the slice to merge.
+            end_idx (int): The character index after the end of the slice to merge.
+            **attributes:
+                Attributes to assign to the merged token. By default, attributes
+                are inherited from the syntactic root token of the span.
+        Returns:
+            token (Token):
+                The newly merged token, or None if the start and end indices did
+                not fall at token boundaries.
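+
+        Example (illustrative; assumes an `nlp` pipeline is loaded):
+            >>> doc = nlp(u'Los Angeles start.')
+            >>> token = doc.merge(0, len(u'Los Angeles'), tag=u'NNP',
+            ...                   lemma=u'Los Angeles', ent_type=u'GPE')
+            >>> doc[0].text
+            u'Los Angeles'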
+
+        """
         cdef unicode tag, lemma, ent_type
         if len(args) == 3:
             # TODO: Warn deprecation
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 033c74c56..d3958242e 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -18,12 +18,23 @@ from ..lexeme cimport Lexeme
 
 
 cdef class Span:
     """A slice from a Doc object."""
-    def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None,
+    def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
                   vector_norm=None):
+        '''Create a Span object from the slice doc[start : end].
+
+        Arguments:
+            doc (Doc): The parent document.
+            start (int): The index of the first token of the span.
+            end (int): The index of the first token after the span.
+            label (int): A label to attach to the Span, e.g. for named entities.
+            vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        Returns:
+            Span: The newly constructed object.
+        '''
         if not (0 <= start <= end <= len(tokens)):
             raise IndexError
-        self.doc = tokens
+        self.doc = doc
         self.start = start
         self.start_char = self.doc[start].idx if start < self.doc.length else 0
         self.end = end
@@ -78,9 +89,29 @@ cdef class Span:
             yield self.doc[i]
 
     def merge(self, *args, **attributes):
+        """Retokenize the document, such that the span is merged into a single token.
+
+        Arguments:
+            **attributes:
+                Attributes to assign to the merged token. By default, attributes
+                are inherited from the syntactic root token of the span.
+        Returns:
+            token (Token):
+                The newly merged token.
+        """
         self.doc.merge(self.start_char, self.end_char, *args, **attributes)
 
     def similarity(self, other):
+        '''Make a semantic similarity estimate. The default estimate is cosine
+        similarity using an average of word vectors.
+
+        Arguments:
+            other (object): The object to compare with. By default, accepts Doc,
+                Span, Token and Lexeme objects.
+
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if 'similarity' in self.doc.user_span_hooks:
             self.doc.user_span_hooks['similarity'](self, other)
         if self.vector_norm == 0.0 or other.vector_norm == 0.0:
@@ -102,7 +133,11 @@ cdef class Span:
             self.end = end + 1
 
     property sent:
-        '''Get the sentence span that this span is a part of.'''
+        '''The sentence span that this span is a part of.
+
+        Returns:
+            Span: The sentence this span is a part of.
+        '''
         def __get__(self):
             if 'sent' in self.doc.user_span_hooks:
                 return self.doc.user_span_hooks['sent'](self)
@@ -156,7 +191,12 @@ cdef class Span:
         return u''.join([t.text_with_ws for t in self])
 
     property root:
-        """The word of the span that is highest in the parse tree, i.e. has the
+        """The token within the span that's highest in the parse tree. If there's a tie, the earliest is preferred.
+
+        Returns:
+            Token: The root token.
+
+        i.e. has the
         shortest path to the root of the sentence (or is the root itself). If
         multiple words are equally high in the tree, the first word is taken.
@@ -231,7 +271,10 @@ cdef class Span:
             return self.doc[root]
 
     property lefts:
-        """Tokens that are to the left of the Span, whose head is within the Span."""
+        """Tokens that are to the left of the span, whose head is within the Span.
+
+        Yields: Token A left-child of a token of the span.
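+
+        Example (illustrative; assumes an `nlp` pipeline with a parser is loaded):
+            >>> doc = nlp(u'I like New York in Autumn.')
+            >>> [t.text for t in doc[1:4].lefts]
+            [u'I']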
+        """
         def __get__(self):
             for token in reversed(self):     # Reverse, so we get the tokens in order
                 for left in token.lefts:
@@ -239,7 +282,10 @@ cdef class Span:
                         yield left
 
     property rights:
-        """Tokens that are to the right of the Span, whose head is within the Span."""
+        """Tokens that are to the right of the Span, whose head is within the Span.
+
+        Yields: Token A right-child of a token of the span.
+        """
         def __get__(self):
             for token in self:
                 for right in token.rights:
@@ -247,6 +293,10 @@ cdef class Span:
                         yield right
 
     property subtree:
+        """Tokens that descend from tokens in the span, but fall outside it.
+
+        Yields: Token A descendant of a token within the span.
+        """
         def __get__(self):
             for word in self.lefts:
                 yield from word.subtree
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index f15869d59..5f18866e9 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -30,8 +30,7 @@ from ..lexeme cimport Lexeme
 
 
 cdef class Token:
-    """An individual token --- i.e. a word, a punctuation symbol, etc. Created
-    via Doc.__getitem__ and Doc.__iter__.
+    """An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
     """
     def __cinit__(self, Vocab vocab, Doc doc, int offset):
         self.vocab = vocab
@@ -40,6 +39,7 @@ cdef class Token:
         self.i = offset
 
     def __len__(self):
+        '''The number of unicode characters in token.text.'''
         return self.c.lex.length
 
     def __unicode__(self):
@@ -57,12 +57,35 @@ cdef class Token:
         return self.__str__()
 
     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
+        '''Check the value of a boolean flag.
+
+        Arguments:
+            flag_id (int): The ID of the flag attribute.
+        Returns:
+            is_set (bool): Whether the flag is set.
+        '''
         return Lexeme.c_check_flag(self.c.lex, flag_id)
 
     def nbor(self, int i=1):
+        '''Get a neighboring token.
+
+        Arguments:
+            i (int): The relative position of the token to get. Defaults to 1.
+        Returns:
+            neighbor (Token): The token at position self.doc[self.i+i]
+        '''
         return self.doc[self.i+i]
 
    def similarity(self, other):
+        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
+        Arguments:
+            other:
+                The object to compare with. By default, accepts Doc, Span,
+                Token and Lexeme objects.
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if 'similarity' in self.doc.user_token_hooks:
             return self.doc.user_token_hooks['similarity'](self)
         if self.vector_norm == 0 or other.vector_norm == 0:
@@ -158,6 +181,9 @@ cdef class Token:
         self.c.dep = label
 
     property has_vector:
+        '''
+        A boolean value indicating whether a word vector is associated with the object.
+        '''
         def __get__(self):
             if 'has_vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['has_vector'](self)
@@ -169,6 +195,11 @@ cdef class Token:
             return False
 
     property vector:
+        '''
+        A real-valued meaning representation.
+
+        Type: numpy.ndarray[ndim=1, dtype='float32']
+        '''
         def __get__(self):
             if 'vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['vector'](self)
@@ -241,11 +272,19 @@ cdef class Token:
             yield t
 
     property children:
+        '''A sequence of the token's immediate syntactic children.
+
+        Yields: Token A child token such that child.head==self
+        '''
         def __get__(self):
             yield from self.lefts
             yield from self.rights
 
     property subtree:
+        '''A sequence of all the token's syntactic descendants.
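+
+        Example (illustrative; assumes an `nlp` pipeline with a parser is loaded):
+            >>> doc = nlp(u'Give it back to me.')
+            >>> [t.text for t in doc[0].subtree]
+            [u'Give', u'it', u'back', u'to', u'me', u'.']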
+
+        Yields: Token A descendant token such that self.is_ancestor(descendant)
+        '''
         def __get__(self):
             for word in self.lefts:
                 yield from word.subtree
@@ -254,14 +293,26 @@ cdef class Token:
                 yield from word.subtree
 
     property left_edge:
+        '''The leftmost token of this token's syntactic descendants.
+
+        Returns: Token The first token such that self.is_ancestor(token)
+        '''
         def __get__(self):
             return self.doc[self.c.l_edge]
 
     property right_edge:
+        '''The rightmost token of this token's syntactic descendants.
+
+        Returns: Token The last token such that self.is_ancestor(token)
+        '''
         def __get__(self):
             return self.doc[self.c.r_edge]
 
     property ancestors:
+        '''A sequence of this token's syntactic ancestors.
+
+        Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
+        '''
         def __get__(self):
             cdef const TokenC* head_ptr = self.c
             # guard against infinite loop, no token can have
@@ -273,9 +324,27 @@ cdef class Token:
                 i += 1
 
     def is_ancestor_of(self, descendant):
+        # TODO: Remove after backward compatibility check.
+        return self.is_ancestor(descendant)
+
+    def is_ancestor(self, descendant):
+        '''Check whether this token is a parent, grandparent, etc. of another
+        in the dependency tree.
+
+        Arguments:
+            descendant (Token): Another token.
+        Returns:
+            is_ancestor (bool): Whether this token is the ancestor of the descendant.
+        '''
+        if self.doc is not descendant.doc:
+            return False
         return any( ancestor.i == self.i for ancestor in descendant.ancestors )
 
     property head:
+        '''The syntactic parent, or "governor", of this token.
+
+        Returns: Token
+        '''
         def __get__(self):
             """The token predicted by the parser to be the head of the current token."""
             return self.doc[self.i + self.c.head]
@@ -370,6 +439,10 @@ cdef class Token:
         self.c.head = rel_newhead_i
 
     property conjuncts:
+        '''A sequence of coordinated tokens, including the token itself.
+
+        Yields: Token A coordinated token
+        '''
         def __get__(self):
             """Get a list of conjoined words."""
             cdef Token word
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index b82a61906..feef43ae5 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -52,6 +52,25 @@ cdef class Vocab:
 
     @classmethod
    def load(cls, path, lex_attr_getters=None, lemmatizer=True, tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
+        """
+        Load the vocabulary from a path.
+
+        Arguments:
+            path (Path):
+                The path to load from.
+            lex_attr_getters (dict):
+                A dictionary mapping attribute IDs to functions to compute them.
+                Defaults to None.
+            lemmatizer (object):
+                A lemmatizer. Defaults to None.
+            tag_map (dict):
+                A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
+                and optionally morphological attributes.
+            oov_prob (float):
+                The default probability for out-of-vocabulary words.
+        Returns:
+            Vocab: The newly constructed vocab object.
+        """
         util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
         if 'vectors' in deprecated_kwargs:
             raise AttributeError(
@@ -82,6 +101,22 @@ cdef class Vocab:
 
     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
                  serializer_freqs=None, **deprecated_kwargs):
+        '''Create the vocabulary.
+
+        Arguments:
+            lex_attr_getters (dict):
+                A dictionary mapping attribute IDs to functions to compute them.
+                Defaults to None.
+            lemmatizer (object):
+                A lemmatizer. Defaults to None.
+            tag_map (dict):
+                A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
+                and optionally morphological attributes.
+            oov_prob (float):
+                The default probability for out-of-vocabulary words.
+
+        Returns:
+            Vocab: The newly constructed vocab object.
+        '''
         util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
 
         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@@ -134,6 +169,9 @@ cdef class Vocab:
         '''
         Set vectors_length to a new size, and allocate more memory for the
         Lexeme vectors if necessary. The memory will be zeroed.
+
+        Arguments:
+            new_size (int): The new size of the vectors.
         '''
         cdef hash_t key
         cdef size_t addr
@@ -145,11 +183,14 @@ cdef class Vocab:
             self.vectors_length = new_size
 
     def add_flag(self, flag_getter, int flag_id=-1):
-        '''Set a new boolean flag to words in the vocabulary. The flag_setter
-        function will be called over the words currently in the vocab, and then
-        applied to new words as they occur. You'll then be able to access the
-        flag value on each token, using token.check_flag(flag_id). See also:
-        Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.
+        '''Set a new boolean flag to words in the vocabulary.
+
+        The flag_getter function will be called over the words currently in the
+        vocab, and then applied to new words as they occur. You'll then be able
+        to access the flag value on each token, using token.check_flag(flag_id).
+
+        See also:
+            Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.
 
         Arguments:
             flag_getter:
@@ -246,11 +287,23 @@ cdef class Vocab:
         self.length += 1
 
     def __contains__(self, unicode string):
+        '''Check whether the string has an entry in the vocabulary.
+
+        Arguments:
+            string (unicode): The ID string.
+
+        Returns:
+            bool Whether the string has an entry in the vocabulary.
+        '''
         key = hash_string(string)
         lex = self._by_hash.get(key)
         return True if lex is not NULL else False
 
     def __iter__(self):
+        '''Iterate over the lexemes in the vocabulary.
+
+        Yields: Lexeme An entry in the vocabulary.
+        '''
         cdef attr_t orth
         cdef size_t addr
         for orth, addr in self._by_orth.items():
@@ -260,16 +313,15 @@ cdef class Vocab:
         '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously
         unseen unicode string is given, a new lexeme is created and stored.
 
-        Args:
+        Arguments:
             id_or_string (int or unicode):
-                The integer ID of a word, or its unicode string. If an int >= Lexicon.size,
-                IndexError is raised. If id_or_string is neither an int nor a unicode string,
-                ValueError is raised.
+                The integer ID of a word, or its unicode string.
+
+                If an int >= Lexicon.size, IndexError is raised. If id_or_string
+                is neither an int nor a unicode string, ValueError is raised.
 
         Returns:
-            lexeme (Lexeme):
-                An instance of the Lexeme Python class, with data copied on
-                instantiation.
+            lexeme (Lexeme): The lexeme indicated by the given ID.
         '''
         cdef attr_t orth
         if type(id_or_string) == unicode:
@@ -295,6 +347,11 @@ cdef class Vocab:
         return tokens
 
     def dump(self, loc):
+        """Save the lexemes binary data to the given location.
+
+        Arguments:
+            loc (Path): The path to save to.
+        """
         if hasattr(loc, 'as_posix'):
             loc = loc.as_posix()
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
@@ -323,6 +380,14 @@ cdef class Vocab:
         fp.close()
 
     def load_lexemes(self, loc):
+        '''Load the binary vocabulary data from the given location.
+
+        Arguments:
+            loc (Path): The path to load from.
+
+        Returns:
+            None
+        '''
         fp = CFile(loc, 'rb',
                    on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
         cdef LexemeC* lexeme
@@ -363,6 +428,13 @@ cdef class Vocab:
         fp.close()
 
     def dump_vectors(self, out_loc):
+        '''Save the word vectors to a binary file.
+
+        Arguments:
+            out_loc (Path): The path to save to.
+        Returns:
+            None
+        '''
         cdef int32_t vec_len = self.vectors_length
         cdef int32_t word_len
         cdef bytes word_str
@@ -384,6 +456,17 @@ cdef class Vocab:
         out_file.close()
 
     def load_vectors(self, file_):
+        """Load vectors from a text-based file.
+
+        Arguments:
+            file_ (buffer): The file to read from. Entries should be separated by newlines,
+                and each entry should be whitespace delimited. The first value of the entry
+                should be the word string, and subsequent entries should be the values of the
+                vector.
+
+        Returns:
+            vec_len (int): The length of the vectors loaded.
+        """
         cdef LexemeC* lexeme
         cdef attr_t orth
         cdef int32_t vec_len = -1
@@ -409,6 +492,14 @@ cdef class Vocab:
         return vec_len
 
     def load_vectors_from_bin_loc(self, loc):
+        """Load vectors from the location of a binary file.
+
+        Arguments:
+            loc (unicode): The path of the binary file to load from.
+
+        Returns:
+            vec_len (int): The length of the vectors loaded.
+        """
         cdef CFile file_ = CFile(loc, b'rb')
         cdef int32_t word_len
         cdef int32_t vec_len = 0