Use consistent formatting for docstrings

ines 2017-04-15 11:59:21 +02:00
parent d13f0a7017
commit 561f2a3eb4
17 changed files with 192 additions and 113 deletions

View File

@@ -14,8 +14,9 @@ from spacy.cli import convert as cli_convert
 class CLI(object):
-    """Command-line interface for spaCy"""
+    """
+    Command-line interface for spaCy
+    """
     commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')
     @plac.annotations(
@@ -29,7 +30,6 @@ class CLI(object):
         can be shortcut, model name or, if --direct flag is set, full model name
         with version.
         """
         cli_download(model, direct)
@@ -44,7 +44,6 @@ class CLI(object):
         either the name of a pip package, or the local path to the model data
         directory. Linking models allows loading them via spacy.load(link_name).
         """
         cli_link(origin, link_name, force)
@@ -58,7 +57,6 @@ class CLI(object):
         speficied as an argument, print model information. Flag --markdown
         prints details in Markdown for easy copy-pasting to GitHub issues.
         """
         cli_info(model, markdown)
@@ -73,7 +71,6 @@ class CLI(object):
         installation files. A new directory will be created in the specified
         output directory, and model data will be copied over.
         """
         cli_package(input_dir, output_dir, force)
@@ -93,7 +90,6 @@ class CLI(object):
         """
         Train a model. Expects data in spaCy's JSON format.
         """
         cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
                   not no_parser, not no_ner, parser_L1)
@@ -108,7 +104,6 @@ class CLI(object):
         """
         Initialize a new model and its data directory.
         """
         cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
     @plac.annotations(
@@ -122,7 +117,6 @@ class CLI(object):
         Convert files into JSON format for use with train command and other
         experiment management functions.
         """
         cli_convert(input_file, output_dir, n_sents, morphology)
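For context, the CLI methods above simply delegate to the functions imported from spacy.cli. A rough, untested sketch of calling two of them programmatically, using the positional argument order shown in this diff (any keyword names beyond that are assumptions):

    from spacy.cli import download as cli_download, info as cli_info

    cli_download('en', False)   # (model, direct) -- fetch the model for the 'en' shortcut
    cli_info('en', False)       # (model, markdown) -- print model details, optionally as Markdown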

View File

@@ -92,7 +92,8 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
 def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
-    '''Normalize a dictionary of attributes, converting them to ints.
+    """
+    Normalize a dictionary of attributes, converting them to ints.
     Arguments:
         stringy_attrs (dict):
@@ -105,7 +106,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         inty_attrs (dict):
             Attributes dictionary with keys and optionally values converted to
             ints.
-    '''
+    """
     inty_attrs = {}
     if _do_deprecated:
         if 'F' in stringy_attrs:
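A minimal sketch of the round trip this docstring describes, assuming a fresh StringStore is acceptable as the strings_map (the function is also imported as spacy.attrs.intify_attrs elsewhere in this diff):

    from spacy.attrs import intify_attrs
    from spacy.strings import StringStore

    strings = StringStore()
    attrs = intify_attrs({'LEMMA': 'hello'}, strings_map=strings)
    # string keys such as 'LEMMA' become integer attribute IDs; the string
    # value is interned in the StringStore and replaced by its integer ID
    print(attrs)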

View File

@@ -7,7 +7,8 @@ from ... import util
 def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
-    """Convert conllu files into JSON format for use with train cli.
+    """
+    Convert conllu files into JSON format for use with train cli.
     use_morphology parameter enables appending morphology to tags, which is
     useful for languages such as Spanish, where UD tags are not so rich.
     """

View File

@@ -36,7 +36,8 @@ def align_tokens(ref, indices): # Deprecated, surely?
 def detokenize(token_rules, words): # Deprecated?
-    """To align with treebanks, return a list of "chunks", where a chunk is a
+    """
+    To align with treebanks, return a list of "chunks", where a chunk is a
     sequence of tokens that are separated by whitespace in actual strings. Each
     chunk should be a tuple of token indices, e.g.
@@ -57,10 +58,13 @@ def detokenize(token_rules, words): # Deprecated?
     return positions
 def fix_glove_vectors_loading(overrides):
-    """Special-case hack for loading the GloVe vectors, to support deprecated
-    <1.0 stuff. Phase this out once the data is fixed."""
+    """
+    Special-case hack for loading the GloVe vectors, to support deprecated
+    <1.0 stuff. Phase this out once the data is fixed.
+    """
     if 'data_dir' in overrides and 'path' not in overrides:
         raise ValueError("The argument 'data_dir' has been renamed to 'path'")
     if overrides.get('path') is False:
@@ -88,13 +92,13 @@ def fix_glove_vectors_loading(overrides):
 def resolve_model_name(name):
-    """If spaCy is loaded with 'de', check if symlink already exists. If
+    """
+    If spaCy is loaded with 'de', check if symlink already exists. If
     not, user have upgraded from older version and have old models installed.
     Check if old model directory exists and if so, return that instead and create
     shortcut link. If English model is found and no shortcut exists, raise error
     and tell user to install new model.
     """
     if name == 'en' or name == 'de':
         versions = ['1.0.0', '1.1.0']
         data_path = Path(util.get_data_path())
@@ -117,9 +121,11 @@ def resolve_model_name(name):
 class ModelDownload():
-    """Replace download modules within en and de with deprecation warning and
+    """
+    Replace download modules within en and de with deprecation warning and
     download default language model (using shortcut). Use classmethods to allow
-    importing ModelDownload as download and calling download.en() etc."""
+    importing ModelDownload as download and calling download.en() etc.
+    """
     @classmethod
     def load(self, lang):

View File

@@ -220,7 +220,8 @@ cdef class GoldParse:
     def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                  deps=None, entities=None, make_projective=False):
-        """Create a GoldParse.
+        """
+        Create a GoldParse.
         Arguments:
             doc (Doc):
@@ -310,13 +311,16 @@ cdef class GoldParse:
     @property
     def is_projective(self):
-        """Whether the provided syntactic annotations form a projective dependency
-        tree."""
+        """
+        Whether the provided syntactic annotations form a projective dependency
+        tree.
+        """
         return not nonproj.is_nonproj_tree(self.heads)
 def biluo_tags_from_offsets(doc, entities):
-    '''Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
+    """
+    Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
     scheme (biluo).
     Arguments:
@@ -347,7 +351,7 @@ def biluo_tags_from_offsets(doc, entities):
         tags = biluo_tags_from_offsets(doc, entities)
         assert tags == ['O', 'O', 'U-LOC', 'O']
-    '''
+    """
     starts = {token.idx: token.i for token in doc}
     ends = {token.idx+len(token): token.i for token in doc}
     biluo = ['-' for _ in doc]
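The docstring's example can be run end to end. A sketch, assuming an English model is installed and linked as 'en':

    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.load('en')
    doc = nlp(u'I like London.')
    entities = [(7, 13, 'LOC')]                    # (start_char, end_char, label)
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ['O', 'O', 'U-LOC', 'O']        # 'London' is a single-token (U)nit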

View File

@@ -202,9 +202,10 @@ class BaseDefaults(object):
 class Language(object):
-    '''A text-processing pipeline. Usually you'll load this once per process, and
+    """
+    A text-processing pipeline. Usually you'll load this once per process, and
     pass the instance around your program.
-    '''
+    """
     Defaults = BaseDefaults
     lang = None
@@ -342,7 +343,8 @@ class Language(object):
         return doc
     def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
-        '''Process texts as a stream, and yield Doc objects in order.
+        """
+        Process texts as a stream, and yield Doc objects in order.
         Supports GIL-free multi-threading.
@@ -351,7 +353,7 @@ class Language(object):
         tag (bool)
         parse (bool)
         entity (bool)
-        '''
+        """
         skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
         stream = (self.make_doc(text) for text in texts)
         for proc in self.pipeline:
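A sketch of the streaming usage that pipe() documents, again assuming an 'en' model is available:

    import spacy

    nlp = spacy.load('en')
    texts = [u'First document.', u'Second document.']
    # Docs are yielded in the same order as the input texts
    for doc in nlp.pipe(texts, batch_size=1000, n_threads=2):
        print([(w.text, w.pos_) for w in doc])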

View File

@@ -38,8 +38,10 @@ class Lemmatizer(object):
         return lemmas
     def is_base_form(self, univ_pos, morphology=None):
-        '''Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.'''
+        """
+        Check whether we're dealing with an uninflected paradigm, so we can
+        avoid lemmatization entirely.
+        """
         morphology = {} if morphology is None else morphology
         others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
         true_morph_key = morphology.get('morph', 0)

View File

@@ -30,13 +30,15 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 cdef class Lexeme:
-    """An entry in the vocabulary. A Lexeme has no string context --- it's a
+    """
+    An entry in the vocabulary. A Lexeme has no string context --- it's a
     word-type, as opposed to a word token. It therefore has no part-of-speech
     tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
     tag).
     """
     def __init__(self, Vocab vocab, int orth):
-        """Create a Lexeme object.
+        """
+        Create a Lexeme object.
         Arguments:
             vocab (Vocab): The parent vocabulary
@@ -80,7 +82,8 @@ cdef class Lexeme:
         return self.c.orth
     def set_flag(self, attr_id_t flag_id, bint value):
-        """Change the value of a boolean flag.
+        """
+        Change the value of a boolean flag.
         Arguments:
             flag_id (int): The attribute ID of the flag to set.
@@ -89,7 +92,8 @@ cdef class Lexeme:
         Lexeme.c_set_flag(self.c, flag_id, value)
     def check_flag(self, attr_id_t flag_id):
-        """Check the value of a boolean flag.
+        """
+        Check the value of a boolean flag.
         Arguments:
             flag_id (int): The attribute ID of the flag to query.
@@ -98,7 +102,8 @@ cdef class Lexeme:
         return True if Lexeme.c_check_flag(self.c, flag_id) else False
     def similarity(self, other):
-        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """
+        Compute a semantic similarity estimate. Defaults to cosine over vectors.
         Arguments:
             other:
@@ -106,7 +111,7 @@ cdef class Lexeme:
                 Token and Lexeme objects.
         Returns:
             score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
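A sketch of Lexeme.similarity as documented above; it needs a model with word vectors (the 'en' shortcut is an assumption):

    import spacy

    nlp = spacy.load('en')
    apple = nlp.vocab[u'apple']
    orange = nlp.vocab[u'orange']
    # cosine of the two lexeme vectors; 0.0 if either vector is missing
    print(apple.similarity(orange))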

View File

@@ -180,7 +180,8 @@ cdef class Matcher:
     @classmethod
     def load(cls, path, vocab):
-        '''Load the matcher and patterns from a file path.
+        """
+        Load the matcher and patterns from a file path.
         Arguments:
             path (Path):
@@ -189,7 +190,7 @@ cdef class Matcher:
                 The vocabulary that the documents to match over will refer to.
         Returns:
             Matcher: The newly constructed object.
-        '''
+        """
         if (path / 'gazetteer.json').exists():
             with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
                 patterns = json.load(file_)
@@ -198,7 +199,8 @@ cdef class Matcher:
         return cls(vocab, patterns)
     def __init__(self, vocab, patterns={}):
-        """Create the Matcher.
+        """
+        Create the Matcher.
         Arguments:
             vocab (Vocab):
@@ -227,7 +229,8 @@ cdef class Matcher:
     def add_entity(self, entity_key, attrs=None, if_exists='raise',
                    acceptor=None, on_match=None):
-        """Add an entity to the matcher.
+        """
+        Add an entity to the matcher.
         Arguments:
             entity_key (unicode or int):
@@ -264,7 +267,8 @@ cdef class Matcher:
         self._callbacks[entity_key] = on_match
     def add_pattern(self, entity_key, token_specs, label=""):
-        """Add a pattern to the matcher.
+        """
+        Add a pattern to the matcher.
         Arguments:
             entity_key (unicode or int):
@@ -307,7 +311,8 @@ cdef class Matcher:
         return entity_key
     def has_entity(self, entity_key):
-        """Check whether the matcher has an entity.
+        """
+        Check whether the matcher has an entity.
         Arguments:
             entity_key (string or int): The entity key to check.
@@ -318,7 +323,8 @@ cdef class Matcher:
         return entity_key in self._entities
     def get_entity(self, entity_key):
-        """Retrieve the attributes stored for an entity.
+        """
+        Retrieve the attributes stored for an entity.
         Arguments:
             entity_key (unicode or int): The entity to retrieve.
@@ -332,7 +338,8 @@ cdef class Matcher:
         return None
     def __call__(self, Doc doc, acceptor=None):
-        """Find all token sequences matching the supplied patterns on the Doc.
+        """
+        Find all token sequences matching the supplied patterns on the Doc.
         Arguments:
             doc (Doc):
@@ -445,7 +452,8 @@ cdef class Matcher:
         return matches
     def pipe(self, docs, batch_size=1000, n_threads=2):
-        """Match a stream of documents, yielding them in turn.
+        """
+        Match a stream of documents, yielding them in turn.
         Arguments:
             docs: A stream of documents.
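A sketch of the add_entity / add_pattern / __call__ workflow these docstrings describe, using 1.x-style token specs keyed by attribute IDs (the entity key and example sentence are arbitrary):

    import spacy
    from spacy.matcher import Matcher
    from spacy.attrs import ORTH

    nlp = spacy.load('en')
    matcher = Matcher(nlp.vocab)
    matcher.add_entity(u'GoogleNow')
    matcher.add_pattern(u'GoogleNow', [{ORTH: u'Google'}, {ORTH: u'Now'}])
    doc = nlp(u'I like Google Now better than the alternatives.')
    matches = matcher(doc)   # match tuples over token indices
    print(matches)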

View File

@@ -16,7 +16,9 @@ from .attrs import LEMMA, intify_attrs
 def _normalize_props(props):
-    '''Transform deprecated string keys to correct names.'''
+    """
+    Transform deprecated string keys to correct names.
+    """
     out = {}
     for key, value in props.items():
         if key == POS:
@@ -98,13 +100,14 @@ cdef class Morphology:
         flags[0] &= ~(one << flag_id)
     def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
-        '''Add a special-case rule to the morphological analyser. Tokens whose
+        """
+        Add a special-case rule to the morphological analyser. Tokens whose
         tag and orth match the rule will receive the specified properties.
         Arguments:
             tag (unicode): The part-of-speech tag to key the exception.
            orth (unicode): The word-form to key the exception.
-        '''
+        """
         tag = self.strings[tag_str]
         tag_id = self.reverse_index[tag]
         orth = self.strings[orth_str]

View File

@@ -11,7 +11,9 @@ from .attrs import DEP, ENT_TYPE
 cdef class EntityRecognizer(Parser):
-    """Annotate named entities on Doc objects."""
+    """
+    Annotate named entities on Doc objects.
+    """
     TransitionSystem = BiluoPushDown
     feature_templates = get_feature_templates('ner')
@@ -28,7 +30,9 @@ cdef class EntityRecognizer(Parser):
 cdef class BeamEntityRecognizer(BeamParser):
-    """Annotate named entities on Doc objects."""
+    """
+    Annotate named entities on Doc objects.
+    """
     TransitionSystem = BiluoPushDown
     feature_templates = get_feature_templates('ner')
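In normal use the recognizer runs as part of the pipeline and fills Doc.ents; a small sketch (assuming the 'en' model):

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Apple is looking at buying a U.K. startup.')
    print([(ent.text, ent.label_) for ent in doc.ents])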

View File

@@ -6,7 +6,9 @@ from .gold import tags_to_entities
 class PRFScore(object):
-    """A precision / recall / F score"""
+    """
+    A precision / recall / F score
+    """
     def __init__(self):
         self.tp = 0
         self.fp = 0
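For reference, the precision / recall / F computation a PRFScore tracks from its tp / fp / fn counts — a plain-Python sketch, not the class's actual code:

    tp, fp, fn = 8.0, 2.0, 4.0
    precision = tp / (tp + fp)                               # 0.8
    recall = tp / (tp + fn)                                  # ~0.667
    fscore = 2 * precision * recall / (precision + recall)   # ~0.727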

View File

@@ -73,13 +73,16 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
 cdef class StringStore:
-    '''Map strings to and from integer IDs.'''
+    """
+    Map strings to and from integer IDs.
+    """
     def __init__(self, strings=None, freeze=False):
-        '''Create the StringStore.
+        """
+        Create the StringStore.
         Arguments:
             strings: A sequence of unicode strings to add to the store.
-        '''
+        """
         self.mem = Pool()
         self._map = PreshMap()
         self._oov = PreshMap()
@@ -104,7 +107,8 @@ cdef class StringStore:
         return (StringStore, (list(self),))
     def __len__(self):
-        """The number of strings in the store.
+        """
+        The number of strings in the store.
         Returns:
             int The number of strings in the store.
@@ -112,7 +116,8 @@ cdef class StringStore:
         return self.size-1
     def __getitem__(self, object string_or_id):
-        """Retrieve a string from a given integer ID, or vice versa.
+        """
+        Retrieve a string from a given integer ID, or vice versa.
         Arguments:
             string_or_id (bytes or unicode or int):
@@ -159,7 +164,8 @@ cdef class StringStore:
         return utf8str - self.c
     def __contains__(self, unicode string not None):
-        """Check whether a string is in the store.
+        """
+        Check whether a string is in the store.
         Arguments:
             string (unicode): The string to check.
@@ -172,7 +178,8 @@ cdef class StringStore:
         return self._map.get(key) is not NULL
     def __iter__(self):
-        """Iterate over the strings in the store, in order.
+        """
+        Iterate over the strings in the store, in order.
         Yields: unicode A string in the store.
         """
@@ -230,7 +237,8 @@ cdef class StringStore:
         return &self.c[self.size-1]
     def dump(self, file_):
-        """Save the strings to a JSON file.
+        """
+        Save the strings to a JSON file.
         Arguments:
             file_ (buffer): The file to save the strings.
@@ -244,7 +252,8 @@ cdef class StringStore:
         file_.write(string_data)
     def load(self, file_):
-        """Load the strings from a JSON file.
+        """
+        Load the strings from a JSON file.
         Arguments:
             file_ (buffer): The file from which to load the strings.
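A sketch of the string-to-ID round trip that __getitem__ and __contains__ describe:

    from spacy.strings import StringStore

    strings = StringStore()
    apple_id = strings[u'apple']          # a new string is interned and gets an ID
    assert strings[apple_id] == u'apple'  # the ID maps back to the string
    assert u'apple' in strings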

View File

@@ -106,10 +106,13 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
 cdef class Tagger:
-    """Annotate part-of-speech tags on Doc objects."""
+    """
+    Annotate part-of-speech tags on Doc objects.
+    """
     @classmethod
     def load(cls, path, vocab, require=False):
-        """Load the statistical model from the supplied path.
+        """
+        Load the statistical model from the supplied path.
         Arguments:
             path (Path):
@@ -142,7 +145,8 @@ cdef class Tagger:
         return self
     def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
-        """Create a Tagger.
+        """
+        Create a Tagger.
         Arguments:
             vocab (Vocab):
@@ -180,7 +184,8 @@ cdef class Tagger:
         tokens._py_tokens = [None] * tokens.length
     def __call__(self, Doc tokens):
-        """Apply the tagger, setting the POS tags onto the Doc object.
+        """
+        Apply the tagger, setting the POS tags onto the Doc object.
         Arguments:
             doc (Doc): The tokens to be tagged.
@@ -208,7 +213,8 @@ cdef class Tagger:
         tokens._py_tokens = [None] * tokens.length
     def pipe(self, stream, batch_size=1000, n_threads=2):
-        """Tag a stream of documents.
+        """
+        Tag a stream of documents.
         Arguments:
             stream: The sequence of documents to tag.
@@ -225,7 +231,8 @@ cdef class Tagger:
             yield doc
     def update(self, Doc tokens, GoldParse gold, itn=0):
-        """Update the statistical model, with tags supplied for the given document.
+        """
+        Update the statistical model, with tags supplied for the given document.
         Arguments:
             doc (Doc):

View File

@@ -23,11 +23,14 @@ from .tokens.doc cimport Doc
 cdef class Tokenizer:
-    """Segment text, and create Doc objects with the discovered segment boundaries."""
+    """
+    Segment text, and create Doc objects with the discovered segment boundaries.
+    """
     @classmethod
     def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
              infix_finditer=None, token_match=None):
-        '''Load a Tokenizer, reading unsupplied components from the path.
+        """
+        Load a Tokenizer, reading unsupplied components from the path.
         Arguments:
             path (Path):
@@ -45,10 +48,10 @@ cdef class Tokenizer:
             infix_finditer:
                 Signature of re.compile(string).finditer
         Returns Tokenizer
-        '''
         if isinstance(path, basestring):
             path = pathlib.Path(path)
+        """
         if rules is None:
             with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
                 rules = json.load(file_)
@@ -67,7 +70,8 @@ cdef class Tokenizer:
         return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
     def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
-        '''Create a Tokenizer, to create Doc objects given unicode text.
+        """
+        Create a Tokenizer, to create Doc objects given unicode text.
         Arguments:
             vocab (Vocab):
@@ -85,7 +89,7 @@ cdef class Tokenizer:
                 to find infixes.
             token_match:
                 A boolean function matching strings that becomes tokens.
-        '''
+        """
         self.mem = Pool()
         self._cache = PreshMap()
         self._specials = PreshMap()
@@ -117,7 +121,8 @@ cdef class Tokenizer:
     @cython.boundscheck(False)
     def __call__(self, unicode string):
-        """Tokenize a string.
+        """
+        Tokenize a string.
         Arguments:
             string (unicode): The string to tokenize.
@@ -170,7 +175,8 @@ cdef class Tokenizer:
         return tokens
     def pipe(self, texts, batch_size=1000, n_threads=2):
-        """Tokenize a stream of texts.
+        """
+        Tokenize a stream of texts.
         Arguments:
             texts: A sequence of unicode texts.
@@ -324,7 +330,8 @@ cdef class Tokenizer:
         self._cache.set(key, cached)
     def find_infix(self, unicode string):
-        """Find internal split points of the string, such as hyphens.
+        """
+        Find internal split points of the string, such as hyphens.
         string (unicode): The string to segment.
@@ -337,7 +344,8 @@ cdef class Tokenizer:
         return list(self.infix_finditer(string))
     def find_prefix(self, unicode string):
-        """Find the length of a prefix that should be segmented from the string,
+        """
+        Find the length of a prefix that should be segmented from the string,
         or None if no prefix rules match.
         Arguments:
@@ -350,7 +358,8 @@ cdef class Tokenizer:
         return (match.end() - match.start()) if match is not None else 0
     def find_suffix(self, unicode string):
-        """Find the length of a suffix that should be segmented from the string,
+        """
+        Find the length of a suffix that should be segmented from the string,
         or None if no suffix rules match.
         Arguments:
@@ -363,13 +372,15 @@ cdef class Tokenizer:
         return (match.end() - match.start()) if match is not None else 0
     def _load_special_tokenization(self, special_cases):
-        '''Add special-case tokenization rules.
-        '''
+        """
+        Add special-case tokenization rules.
+        """
         for chunk, substrings in sorted(special_cases.items()):
             self.add_special_case(chunk, substrings)
     def add_special_case(self, unicode string, substrings):
-        '''Add a special-case tokenization rule.
+        """
+        Add a special-case tokenization rule.
         Arguments:
             string (unicode): The string to specially tokenize.
@@ -378,7 +389,7 @@ cdef class Tokenizer:
                 attributes. The ORTH fields of the attributes must exactly match
                 the string when they are concatenated.
         Returns None
-        '''
+        """
         substrings = list(substrings)
         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
         cached.length = len(substrings)
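A sketch of add_special_case as documented above; the ORTH pieces must concatenate back to the original string (the 'gimme' rule is just an illustration, and the 'en' model is assumed):

    import spacy
    from spacy.attrs import ORTH, LEMMA

    nlp = spacy.load('en')
    nlp.tokenizer.add_special_case(u'gimme',
                                   [{ORTH: u'gim', LEMMA: u'give'}, {ORTH: u'me'}])
    print([w.text for w in nlp(u'gimme that')])   # [u'gim', u'me', u'that']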

View File

@@ -9,7 +9,9 @@ from .gold import merge_sents
 class Trainer(object):
-    '''Manage training of an NLP pipeline.'''
+    """
+    Manage training of an NLP pipeline.
+    """
     def __init__(self, nlp, gold_tuples):
         self.nlp = nlp
         self.gold_tuples = gold_tuples

View File

@@ -48,8 +48,9 @@ EMPTY_LEXEME.vector = EMPTY_VEC
 cdef class Vocab:
-    '''A map container for a language's LexemeC structs.
-    '''
+    """
+    A map container for a language's LexemeC structs.
+    """
     @classmethod
     def load(cls, path, lex_attr_getters=None, lemmatizer=True,
              tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
@@ -108,7 +109,8 @@ cdef class Vocab:
     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
                  serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
-        '''Create the vocabulary.
+        """
+        Create the vocabulary.
         lex_attr_getters (dict):
             A dictionary mapping attribute IDs to functions to compute them.
@@ -123,7 +125,7 @@ cdef class Vocab:
         Returns:
             Vocab: The newly constructed vocab object.
-        '''
+        """
         util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@@ -172,17 +174,19 @@ cdef class Vocab:
         return langfunc('_') if langfunc else ''
     def __len__(self):
-        """The current number of lexemes stored."""
+        """
+        The current number of lexemes stored.
+        """
        return self.length
     def resize_vectors(self, int new_size):
-        '''
+        """
         Set vectors_length to a new size, and allocate more memory for the Lexeme
         vectors if necessary. The memory will be zeroed.
         Arguments:
             new_size (int): The new size of the vectors.
-        '''
+        """
         cdef hash_t key
         cdef size_t addr
         if new_size > self.vectors_length:
@@ -193,7 +197,8 @@ cdef class Vocab:
         self.vectors_length = new_size
     def add_flag(self, flag_getter, int flag_id=-1):
-        '''Set a new boolean flag to words in the vocabulary.
+        """
+        Set a new boolean flag to words in the vocabulary.
         The flag_setter function will be called over the words currently in the
         vocab, and then applied to new words as they occur. You'll then be able
@@ -213,7 +218,7 @@ cdef class Vocab:
         Returns:
             flag_id (int): The integer ID by which the flag value can be checked.
-        '''
+        """
         if flag_id == -1:
             for bit in range(1, 64):
                 if bit not in self.lex_attr_getters:
@@ -234,9 +239,11 @@ cdef class Vocab:
         return flag_id
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
-        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
+        """
+        Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool. If the pool
-        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
+        is the lexicon's own memory, the lexeme is saved in the lexicon.
+        """
         if string == u'':
             return &EMPTY_LEXEME
         cdef LexemeC* lex
@@ -252,9 +259,11 @@ cdef class Vocab:
         return self._new_lexeme(mem, string)
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
-        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
+        """
+        Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool. If the pool
-        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
+        is the lexicon's own memory, the lexeme is saved in the lexicon.
+        """
         if orth == 0:
             return &EMPTY_LEXEME
         cdef LexemeC* lex
@@ -297,30 +306,33 @@ cdef class Vocab:
         self.length += 1
     def __contains__(self, unicode string):
-        '''Check whether the string has an entry in the vocabulary.
+        """
+        Check whether the string has an entry in the vocabulary.
         Arguments:
             string (unicode): The ID string.
         Returns:
             bool Whether the string has an entry in the vocabulary.
-        '''
+        """
         key = hash_string(string)
         lex = self._by_hash.get(key)
         return lex is not NULL
     def __iter__(self):
-        '''Iterate over the lexemes in the vocabulary.
+        """
+        Iterate over the lexemes in the vocabulary.
         Yields: Lexeme An entry in the vocabulary.
-        '''
+        """
         cdef attr_t orth
         cdef size_t addr
         for orth, addr in self._by_orth.items():
             yield Lexeme(self, orth)
     def __getitem__(self, id_or_string):
-        '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
+        """
+        Retrieve a lexeme, given an int ID or a unicode string. If a previously
         unseen unicode string is given, a new lexeme is created and stored.
         Arguments:
@@ -332,7 +344,7 @@ cdef class Vocab:
         Returns:
             lexeme (Lexeme): The lexeme indicated by the given ID.
-        '''
+        """
         cdef attr_t orth
         if type(id_or_string) == unicode:
             orth = self.strings[id_or_string]
@@ -355,7 +367,8 @@ cdef class Vocab:
         return tokens
     def dump(self, loc=None):
-        """Save the lexemes binary data to the given location, or
+        """
+        Save the lexemes binary data to the given location, or
         return a byte-string with the data if loc is None.
         Arguments:
@@ -392,14 +405,15 @@ cdef class Vocab:
         return fp.string_data()
     def load_lexemes(self, loc):
-        '''Load the binary vocabulary data from the given location.
+        """
+        Load the binary vocabulary data from the given location.
         Arguments:
             loc (Path): The path to load from.
         Returns:
             None
-        '''
+        """
         fp = CFile(loc, 'rb',
                    on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
         cdef LexemeC* lexeme = NULL
@@ -440,8 +454,9 @@ cdef class Vocab:
         fp.close()
     def _deserialize_lexemes(self, CFile fp):
-        '''Load the binary vocabulary data from the given CFile.
-        '''
+        """
+        Load the binary vocabulary data from the given CFile.
+        """
         cdef LexemeC* lexeme = NULL
         cdef hash_t key
         cdef unicode py_str
@@ -494,13 +509,14 @@ cdef class Vocab:
         fp.close()
     def dump_vectors(self, out_loc):
-        '''Save the word vectors to a binary file.
+        """
+        Save the word vectors to a binary file.
         Arguments:
             loc (Path): The path to save to.
         Returns:
             None
-        '''
+        """
         cdef int32_t vec_len = self.vectors_length
         cdef int32_t word_len
         cdef bytes word_str
@@ -522,7 +538,8 @@ cdef class Vocab:
         out_file.close()
     def load_vectors(self, file_):
-        """Load vectors from a text-based file.
+        """
+        Load vectors from a text-based file.
         Arguments:
             file_ (buffer): The file to read from. Entries should be separated by newlines,
@@ -561,7 +578,8 @@ cdef class Vocab:
         return vec_len
     def load_vectors_from_bin_loc(self, loc):
-        """Load vectors from the location of a binary file.
+        """
+        Load vectors from the location of a binary file.
         Arguments:
             loc (unicode): The path of the binary file to load from.
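The add_flag docstring above can be exercised with a custom flag; a sketch, with the length-based getter purely illustrative and the 'en' model assumed:

    import spacy

    nlp = spacy.load('en')
    IS_LONG = nlp.vocab.add_flag(lambda text: len(text) >= 10)   # returns the flag ID
    lexeme = nlp.vocab[u'extraordinarily']
    print(lexeme.check_flag(IS_LONG))   # True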