Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)

	Use consistent formatting for docstrings
This commit is contained in:

parent d13f0a7017
commit 561f2a3eb4
@@ -14,8 +14,9 @@ from spacy.cli import convert as cli_convert


class CLI(object):
-    """Command-line interface for spaCy"""
-
+    """
+    Command-line interface for spaCy
+    """
    commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')

    @plac.annotations(
@@ -29,7 +30,6 @@ class CLI(object):
        can be shortcut, model name or, if --direct flag is set, full model name
        with version.
        """
-
        cli_download(model, direct)


@@ -44,7 +44,6 @@ class CLI(object):
        either the name of a pip package, or the local path to the model data
        directory. Linking models allows loading them via spacy.load(link_name).
        """
-
        cli_link(origin, link_name, force)


@@ -58,7 +57,6 @@ class CLI(object):
        specified as an argument, print model information. Flag --markdown
        prints details in Markdown for easy copy-pasting to GitHub issues.
        """
-
        cli_info(model, markdown)


@@ -73,7 +71,6 @@ class CLI(object):
        installation files. A new directory will be created in the specified
        output directory, and model data will be copied over.
        """
-
        cli_package(input_dir, output_dir, force)


@@ -93,7 +90,6 @@ class CLI(object):
        """
        Train a model. Expects data in spaCy's JSON format.
        """
-
        cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
                  not no_parser, not no_ner, parser_L1)

@@ -108,7 +104,6 @@ class CLI(object):
        """
        Initialize a new model and its data directory.
        """
-
        cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)

    @plac.annotations(
@@ -122,7 +117,6 @@ class CLI(object):
        Convert files into JSON format for use with train command and other
        experiment management functions.
        """
-
        cli_convert(input_file, output_dir, n_sents, morphology)

@@ -92,7 +92,8 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]


def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
-    '''Normalize a dictionary of attributes, converting them to ints.
+    """
+    Normalize a dictionary of attributes, converting them to ints.

    Arguments:
        stringy_attrs (dict):
@@ -105,7 +106,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
        inty_attrs (dict):
            Attributes dictionary with keys and optionally values converted to
            ints.
-    '''
+    """
    inty_attrs = {}
    if _do_deprecated:
        if 'F' in stringy_attrs:
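For context, a rough sketch of how intify_attrs is called; the attribute dict and store contents here are invented for illustration:

    from spacy.attrs import intify_attrs, LEMMA
    from spacy.strings import StringStore

    strings = StringStore()
    # String keys such as 'LEMMA' are mapped to attribute IDs; with a
    # strings_map supplied, string values are interned to integer IDs too.
    attrs = intify_attrs({'LEMMA': 'dog'}, strings_map=strings)
    # attrs should now look like {LEMMA: strings[u'dog']}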
@@ -7,7 +7,8 @@ from ... import util


def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
-    """Convert conllu files into JSON format for use with train cli.
+    """
+    Convert conllu files into JSON format for use with train cli.
    use_morphology parameter enables appending morphology to tags, which is
    useful for languages such as Spanish, where UD tags are not so rich.
    """
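A minimal usage sketch based only on the signature above; the module path and file names are assumptions, not taken from the diff:

    from spacy.cli.converters.conllu2json import conllu2json

    # Append UD morphology to the tags, e.g. for Spanish treebanks
    conllu2json('train.conllu', 'train.json', n_sents=10, use_morphology=True)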
@@ -36,7 +36,8 @@ def align_tokens(ref, indices): # Deprecated, surely?


def detokenize(token_rules, words): # Deprecated?
-    """To align with treebanks, return a list of "chunks", where a chunk is a
+    """
+    To align with treebanks, return a list of "chunks", where a chunk is a
    sequence of tokens that are separated by whitespace in actual strings. Each
    chunk should be a tuple of token indices, e.g.

@@ -57,10 +58,13 @@ def detokenize(token_rules, words): # Deprecated?
    return positions


-def fix_glove_vectors_loading(overrides):
-    """Special-case hack for loading the GloVe vectors, to support deprecated
-    <1.0 stuff. Phase this out once the data is fixed."""

+
+def fix_glove_vectors_loading(overrides):
+    """
+    Special-case hack for loading the GloVe vectors, to support deprecated
+    <1.0 stuff. Phase this out once the data is fixed.
+    """
    if 'data_dir' in overrides and 'path' not in overrides:
        raise ValueError("The argument 'data_dir' has been renamed to 'path'")
    if overrides.get('path') is False:
@@ -88,13 +92,13 @@ def fix_glove_vectors_loading(overrides):


def resolve_model_name(name):
-    """If spaCy is loaded with 'de', check if symlink already exists. If
+    """
+    If spaCy is loaded with 'de', check if symlink already exists. If
    not, the user has upgraded from an older version and has old models installed.
    Check if old model directory exists and if so, return that instead and create
    shortcut link. If English model is found and no shortcut exists, raise error
    and tell user to install new model.
    """
-
    if name == 'en' or name == 'de':
        versions = ['1.0.0', '1.1.0']
        data_path = Path(util.get_data_path())
@@ -117,9 +121,11 @@ def resolve_model_name(name):


class ModelDownload():
-    """Replace download modules within en and de with deprecation warning and
+    """
+    Replace download modules within en and de with deprecation warning and
    download default language model (using shortcut). Use classmethods to allow
-    importing ModelDownload as download and calling download.en() etc."""
+    importing ModelDownload as download and calling download.en() etc.
+    """

    @classmethod
    def load(self, lang):
@@ -220,7 +220,8 @@ cdef class GoldParse:

    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                 deps=None, entities=None, make_projective=False):
-        """Create a GoldParse.
+        """
+        Create a GoldParse.

        Arguments:
            doc (Doc):
@@ -310,13 +311,16 @@ cdef class GoldParse:

    @property
    def is_projective(self):
-        """Whether the provided syntactic annotations form a projective dependency
-        tree."""
+        """
+        Whether the provided syntactic annotations form a projective dependency
+        tree.
+        """
        return not nonproj.is_nonproj_tree(self.heads)


def biluo_tags_from_offsets(doc, entities):
-    '''Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
+    """
+    Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
    scheme (biluo).

    Arguments:
@@ -347,7 +351,7 @@ def biluo_tags_from_offsets(doc, entities):
        tags = biluo_tags_from_offsets(doc, entities)

        assert tags == ['O', 'O', 'U-LOC', 'O']
-    '''
+    """
    starts = {token.idx: token.i for token in doc}
    ends = {token.idx+len(token): token.i for token in doc}
    biluo = ['-' for _ in doc]
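The docstring's own example, expanded into a runnable sketch (assumes a spaCy 1.x English model installed under the 'en' shortcut):

    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.load('en')
    doc = nlp(u'I like London.')              # tokens: I, like, London, .
    entities = [(7, 13, 'LOC')]               # character offsets of "London"
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ['O', 'O', 'U-LOC', 'O']   # "London" is a single-token (U)nit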
@@ -202,9 +202,10 @@ class BaseDefaults(object):


class Language(object):
-    '''A text-processing pipeline. Usually you'll load this once per process, and
+    """
+    A text-processing pipeline. Usually you'll load this once per process, and
    pass the instance around your program.
-    '''
+    """
    Defaults = BaseDefaults
    lang = None

@@ -342,7 +343,8 @@ class Language(object):
        return doc

    def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
-        '''Process texts as a stream, and yield Doc objects in order.
+        """
+        Process texts as a stream, and yield Doc objects in order.

        Supports GIL-free multi-threading.

@@ -351,7 +353,7 @@ class Language(object):
            tag (bool)
            parse (bool)
            entity (bool)
-        '''
+        """
        skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
        stream = (self.make_doc(text) for text in texts)
        for proc in self.pipeline:
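A sketch of the streaming API the docstring describes (model shortcut and texts are placeholders):

    import spacy

    nlp = spacy.load('en')
    texts = [u'First document.', u'Second document.']
    # Docs are yielded in order; tagger/parser/entity can each be toggled off
    for doc in nlp.pipe(texts, n_threads=2, batch_size=1000):
        print(len(doc))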
@@ -38,8 +38,10 @@ class Lemmatizer(object):
        return lemmas

    def is_base_form(self, univ_pos, morphology=None):
-        '''Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.'''
+        """
+        Check whether we're dealing with an uninflected paradigm, so we can
+        avoid lemmatization entirely.
+        """
        morphology = {} if morphology is None else morphology
        others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
        true_morph_key = morphology.get('morph', 0)
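Roughly how the check reads in use; this sketch constructs a Lemmatizer with empty stand-in rule tables, and the morphology dict is illustrative:

    from spacy.lemmatizer import Lemmatizer

    lemmatizer = Lemmatizer({}, {}, {})   # index, exceptions, rules
    # An infinitive is already the base form, so lemmatization can be skipped
    print(lemmatizer.is_base_form('verb', {'verbform': 'inf'}))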
@@ -30,13 +30,15 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))


cdef class Lexeme:
-    """An entry in the vocabulary.  A Lexeme has no string context --- it's a
+    """
+    An entry in the vocabulary.  A Lexeme has no string context --- it's a
    word-type, as opposed to a word token.  It therefore has no part-of-speech
    tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
    tag).
    """
    def __init__(self, Vocab vocab, int orth):
-        """Create a Lexeme object.
+        """
+        Create a Lexeme object.

        Arguments:
            vocab (Vocab): The parent vocabulary
@@ -80,7 +82,8 @@ cdef class Lexeme:
        return self.c.orth

    def set_flag(self, attr_id_t flag_id, bint value):
-        """Change the value of a boolean flag.
+        """
+        Change the value of a boolean flag.

        Arguments:
            flag_id (int): The attribute ID of the flag to set.
@@ -89,7 +92,8 @@ cdef class Lexeme:
        Lexeme.c_set_flag(self.c, flag_id, value)

    def check_flag(self, attr_id_t flag_id):
-        """Check the value of a boolean flag.
+        """
+        Check the value of a boolean flag.

        Arguments:
            flag_id (int): The attribute ID of the flag to query.
@@ -98,7 +102,8 @@ cdef class Lexeme:
        return True if Lexeme.c_check_flag(self.c, flag_id) else False

    def similarity(self, other):
-        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """
+        Compute a semantic similarity estimate. Defaults to cosine over vectors.

        Arguments:
            other:
@@ -106,7 +111,7 @@ cdef class Lexeme:
                Token and Lexeme objects.
        Returns:
            score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
        if self.vector_norm == 0 or other.vector_norm == 0:
            return 0.0
        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
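A sketch of the similarity estimate between two vocabulary entries (assumes a 1.x model with word vectors):

    import spacy

    nlp = spacy.load('en')
    apple = nlp.vocab[u'apple']
    orange = nlp.vocab[u'orange']
    # Cosine over the lexemes' vectors; returns 0.0 if either vector is missing
    print(apple.similarity(orange))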
@@ -180,7 +180,8 @@ cdef class Matcher:

    @classmethod
    def load(cls, path, vocab):
-        '''Load the matcher and patterns from a file path.
+        """
+        Load the matcher and patterns from a file path.

        Arguments:
            path (Path):
@@ -189,7 +190,7 @@ cdef class Matcher:
                The vocabulary that the documents to match over will refer to.
        Returns:
            Matcher: The newly constructed object.
-        '''
+        """
        if (path / 'gazetteer.json').exists():
            with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
                patterns = json.load(file_)
@@ -198,7 +199,8 @@ cdef class Matcher:
        return cls(vocab, patterns)

    def __init__(self, vocab, patterns={}):
-        """Create the Matcher.
+        """
+        Create the Matcher.

        Arguments:
            vocab (Vocab):
@@ -227,7 +229,8 @@ cdef class Matcher:

    def add_entity(self, entity_key, attrs=None, if_exists='raise',
                   acceptor=None, on_match=None):
-        """Add an entity to the matcher.
+        """
+        Add an entity to the matcher.

        Arguments:
            entity_key (unicode or int):
@@ -264,7 +267,8 @@ cdef class Matcher:
        self._callbacks[entity_key] = on_match

    def add_pattern(self, entity_key, token_specs, label=""):
-        """Add a pattern to the matcher.
+        """
+        Add a pattern to the matcher.

        Arguments:
            entity_key (unicode or int):
@@ -307,7 +311,8 @@ cdef class Matcher:
            return entity_key

    def has_entity(self, entity_key):
-        """Check whether the matcher has an entity.
+        """
+        Check whether the matcher has an entity.

        Arguments:
            entity_key (string or int): The entity key to check.
@@ -318,7 +323,8 @@ cdef class Matcher:
        return entity_key in self._entities

    def get_entity(self, entity_key):
-        """Retrieve the attributes stored for an entity.
+        """
+        Retrieve the attributes stored for an entity.

        Arguments:
            entity_key (unicode or int): The entity to retrieve.
@@ -332,7 +338,8 @@ cdef class Matcher:
            return None

    def __call__(self, Doc doc, acceptor=None):
-        """Find all token sequences matching the supplied patterns on the Doc.
+        """
+        Find all token sequences matching the supplied patterns on the Doc.

        Arguments:
            doc (Doc):
@@ -445,7 +452,8 @@ cdef class Matcher:
        return matches

    def pipe(self, docs, batch_size=1000, n_threads=2):
-        """Match a stream of documents, yielding them in turn.
+        """
+        Match a stream of documents, yielding them in turn.

        Arguments:
            docs: A stream of documents.
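Pulling the methods above together, a sketch of 1.x-era usage (entity key, pattern and label are invented):

    from spacy.matcher import Matcher
    from spacy.attrs import LOWER

    matcher = Matcher(nlp.vocab)          # nlp as in the earlier sketches
    matcher.add_entity('GoogleNow')
    matcher.add_pattern('GoogleNow', [{LOWER: 'google'}, {LOWER: 'now'}],
                        label='PRODUCT')
    doc = nlp(u'I like Google Now.')
    matches = matcher(doc)                # (entity key, label, start, end) tuples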
@@ -16,7 +16,9 @@ from .attrs import LEMMA, intify_attrs


def _normalize_props(props):
-    '''Transform deprecated string keys to correct names.'''
+    """
+    Transform deprecated string keys to correct names.
+    """
    out = {}
    for key, value in props.items():
        if key == POS:
@@ -98,13 +100,14 @@ cdef class Morphology:
            flags[0] &= ~(one << flag_id)

    def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
-        '''Add a special-case rule to the morphological analyser. Tokens whose
+        """
+        Add a special-case rule to the morphological analyser. Tokens whose
        tag and orth match the rule will receive the specified properties.

        Arguments:
            tag (unicode): The part-of-speech tag to key the exception.
            orth (unicode): The word-form to key the exception.
-        '''
+        """
        tag = self.strings[tag_str]
        tag_id = self.reverse_index[tag]
        orth = self.strings[orth_str]
@@ -11,7 +11,9 @@ from .attrs import DEP, ENT_TYPE


cdef class EntityRecognizer(Parser):
-    """Annotate named entities on Doc objects."""
+    """
+    Annotate named entities on Doc objects.
+    """
    TransitionSystem = BiluoPushDown

    feature_templates = get_feature_templates('ner')
@@ -28,7 +30,9 @@ cdef class EntityRecognizer(Parser):


cdef class BeamEntityRecognizer(BeamParser):
-    """Annotate named entities on Doc objects."""
+    """
+    Annotate named entities on Doc objects.
+    """
    TransitionSystem = BiluoPushDown

    feature_templates = get_feature_templates('ner')
@@ -6,7 +6,9 @@ from .gold import tags_to_entities


class PRFScore(object):
-    """A precision / recall / F score"""
+    """
+    A precision / recall / F score
+    """
    def __init__(self):
        self.tp = 0
        self.fp = 0
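For reference, the quantities a precision/recall/F object derives from its counts; these are the standard definitions, not code from the diff:

    def prf(tp, fp, fn):
        p = tp / (tp + fp) if tp + fp else 0.0     # precision
        r = tp / (tp + fn) if tp + fn else 0.0     # recall
        f = 2 * p * r / (p + r) if p + r else 0.0  # harmonic mean of p and r
        return p, r, f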
@@ -73,13 +73,16 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex


cdef class StringStore:
-    '''Map strings to and from integer IDs.'''
+    """
+    Map strings to and from integer IDs.
+    """
    def __init__(self, strings=None, freeze=False):
-        '''Create the StringStore.
+        """
+        Create the StringStore.

        Arguments:
            strings: A sequence of unicode strings to add to the store.
-        '''
+        """
        self.mem = Pool()
        self._map = PreshMap()
        self._oov = PreshMap()
@@ -104,7 +107,8 @@ cdef class StringStore:
        return (StringStore, (list(self),))

    def __len__(self):
-        """The number of strings in the store.
+        """
+        The number of strings in the store.

        Returns:
            int The number of strings in the store.
@@ -112,7 +116,8 @@ cdef class StringStore:
        return self.size-1

    def __getitem__(self, object string_or_id):
-        """Retrieve a string from a given integer ID, or vice versa.
+        """
+        Retrieve a string from a given integer ID, or vice versa.

        Arguments:
            string_or_id (bytes or unicode or int):
@@ -159,7 +164,8 @@ cdef class StringStore:
                return utf8str - self.c

    def __contains__(self, unicode string not None):
-        """Check whether a string is in the store.
+        """
+        Check whether a string is in the store.

        Arguments:
            string (unicode): The string to check.
@@ -172,7 +178,8 @@ cdef class StringStore:
        return self._map.get(key) is not NULL

    def __iter__(self):
-        """Iterate over the strings in the store, in order.
+        """
+        Iterate over the strings in the store, in order.

        Yields: unicode A string in the store.
        """
@@ -230,7 +237,8 @@ cdef class StringStore:
        return &self.c[self.size-1]

    def dump(self, file_):
-        """Save the strings to a JSON file.
+        """
+        Save the strings to a JSON file.

        Arguments:
            file_ (buffer): The file to save the strings.
@@ -244,7 +252,8 @@ cdef class StringStore:
        file_.write(string_data)

    def load(self, file_):
-        """Load the strings from a JSON file.
+        """
+        Load the strings from a JSON file.

        Arguments:
            file_ (buffer): The file from which to load the strings.
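The two-way mapping in practice, a minimal sketch:

    from spacy.strings import StringStore

    strings = StringStore([u'apple', u'orange'])
    apple_id = strings[u'apple']            # unicode -> integer ID
    assert strings[apple_id] == u'apple'    # integer ID -> unicode
    assert u'orange' in strings             # membership per __contains__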
@@ -106,10 +106,13 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:


cdef class Tagger:
-    """Annotate part-of-speech tags on Doc objects."""
+    """
+    Annotate part-of-speech tags on Doc objects.
+    """
    @classmethod
    def load(cls, path, vocab, require=False):
-        """Load the statistical model from the supplied path.
+        """
+        Load the statistical model from the supplied path.

        Arguments:
            path (Path):
@@ -142,7 +145,8 @@ cdef class Tagger:
        return self

    def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
-        """Create a Tagger.
+        """
+        Create a Tagger.

        Arguments:
            vocab (Vocab):
@@ -180,7 +184,8 @@ cdef class Tagger:
        tokens._py_tokens = [None] * tokens.length

    def __call__(self, Doc tokens):
-        """Apply the tagger, setting the POS tags onto the Doc object.
+        """
+        Apply the tagger, setting the POS tags onto the Doc object.

        Arguments:
            doc (Doc): The tokens to be tagged.
@@ -208,7 +213,8 @@ cdef class Tagger:
        tokens._py_tokens = [None] * tokens.length

    def pipe(self, stream, batch_size=1000, n_threads=2):
-        """Tag a stream of documents.
+        """
+        Tag a stream of documents.

        Arguments:
            stream: The sequence of documents to tag.
@@ -225,7 +231,8 @@ cdef class Tagger:
            yield doc

    def update(self, Doc tokens, GoldParse gold, itn=0):
-        """Update the statistical model, with tags supplied for the given document.
+        """
+        Update the statistical model, with tags supplied for the given document.

        Arguments:
            doc (Doc):
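How the tagger slots into a pipeline (a sketch; the model shortcut is a placeholder):

    import spacy

    nlp = spacy.load('en')
    doc = nlp.make_doc(u'The cat sat on the mat.')   # tokenize only
    nlp.tagger(doc)                                  # set POS tags on the Doc
    print([(w.text, w.tag_) for w in doc])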
@@ -23,11 +23,14 @@ from .tokens.doc cimport Doc


cdef class Tokenizer:
-    """Segment text, and create Doc objects with the discovered segment boundaries."""
+    """
+    Segment text, and create Doc objects with the discovered segment boundaries.
+    """
    @classmethod
    def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
             infix_finditer=None, token_match=None):
-        '''Load a Tokenizer, reading unsupplied components from the path.
+        """
+        Load a Tokenizer, reading unsupplied components from the path.

        Arguments:
            path (Path):
@@ -45,10 +48,10 @@ cdef class Tokenizer:
            infix_finditer:
                Signature of re.compile(string).finditer
        Returns Tokenizer
-        '''
        if isinstance(path, basestring):
            path = pathlib.Path(path)

+        """
        if rules is None:
            with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
                rules = json.load(file_)
@@ -67,7 +70,8 @@ cdef class Tokenizer:
        return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)

    def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
-        '''Create a Tokenizer, to create Doc objects given unicode text.
+        """
+        Create a Tokenizer, to create Doc objects given unicode text.

        Arguments:
            vocab (Vocab):
@@ -85,7 +89,7 @@ cdef class Tokenizer:
                to find infixes.
            token_match:
                A boolean function matching strings that become tokens.
-        '''
+        """
        self.mem = Pool()
        self._cache = PreshMap()
        self._specials = PreshMap()
@@ -117,7 +121,8 @@ cdef class Tokenizer:

    @cython.boundscheck(False)
    def __call__(self, unicode string):
-        """Tokenize a string.
+        """
+        Tokenize a string.

        Arguments:
            string (unicode): The string to tokenize.
@@ -170,7 +175,8 @@ cdef class Tokenizer:
        return tokens

    def pipe(self, texts, batch_size=1000, n_threads=2):
-        """Tokenize a stream of texts.
+        """
+        Tokenize a stream of texts.

        Arguments:
            texts: A sequence of unicode texts.
@@ -324,7 +330,8 @@ cdef class Tokenizer:
        self._cache.set(key, cached)

    def find_infix(self, unicode string):
-        """Find internal split points of the string, such as hyphens.
+        """
+        Find internal split points of the string, such as hyphens.

        string (unicode): The string to segment.

@@ -337,7 +344,8 @@ cdef class Tokenizer:
        return list(self.infix_finditer(string))

    def find_prefix(self, unicode string):
-        """Find the length of a prefix that should be segmented from the string,
+        """
+        Find the length of a prefix that should be segmented from the string,
        or None if no prefix rules match.

        Arguments:
@@ -350,7 +358,8 @@ cdef class Tokenizer:
        return (match.end() - match.start()) if match is not None else 0

    def find_suffix(self, unicode string):
-        """Find the length of a suffix that should be segmented from the string,
+        """
+        Find the length of a suffix that should be segmented from the string,
        or None if no suffix rules match.

        Arguments:
@@ -363,13 +372,15 @@ cdef class Tokenizer:
        return (match.end() - match.start()) if match is not None else 0

    def _load_special_tokenization(self, special_cases):
-        '''Add special-case tokenization rules.
-        '''
+        """
+        Add special-case tokenization rules.
+        """
        for chunk, substrings in sorted(special_cases.items()):
            self.add_special_case(chunk, substrings)

    def add_special_case(self, unicode string, substrings):
-        '''Add a special-case tokenization rule.
+        """
+        Add a special-case tokenization rule.

        Arguments:
            string (unicode): The string to specially tokenize.
@@ -378,7 +389,7 @@ cdef class Tokenizer:
                attributes. The ORTH fields of the attributes must exactly match
                the string when they are concatenated.
        Returns None
-        '''
+        """
        substrings = list(substrings)
        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
        cached.length = len(substrings)
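A sketch of the special-case rule described above; the ORTH values must concatenate back to the original string (the LEMMA here is illustrative):

    from spacy.attrs import ORTH, LEMMA

    nlp.tokenizer.add_special_case(u'gimme',
        [{ORTH: u'gim', LEMMA: u'give'}, {ORTH: u'me'}])
    doc = nlp.tokenizer(u'gimme that')
    print([w.text for w in doc])   # expected: [u'gim', u'me', u'that']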
@@ -9,7 +9,9 @@ from .gold import merge_sents


class Trainer(object):
-    '''Manage training of an NLP pipeline.'''
+    """
+    Manage training of an NLP pipeline.
+    """
    def __init__(self, nlp, gold_tuples):
        self.nlp = nlp
        self.gold_tuples = gold_tuples
@@ -48,8 +48,9 @@ EMPTY_LEXEME.vector = EMPTY_VEC


cdef class Vocab:
-    '''A map container for a language's LexemeC structs.
-    '''
+    """
+    A map container for a language's LexemeC structs.
+    """
    @classmethod
    def load(cls, path, lex_attr_getters=None, lemmatizer=True,
             tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
@@ -108,7 +109,8 @@ cdef class Vocab:

    def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
            serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
-        '''Create the vocabulary.
+        """
+        Create the vocabulary.

        lex_attr_getters (dict):
            A dictionary mapping attribute IDs to functions to compute them.
@@ -123,7 +125,7 @@ cdef class Vocab:

        Returns:
            Vocab: The newly constructed vocab object.
-        '''
+        """
        util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)

        lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@@ -172,17 +174,19 @@ cdef class Vocab:
            return langfunc('_') if langfunc else ''

    def __len__(self):
-        """The current number of lexemes stored."""
+        """
+        The current number of lexemes stored.
+        """
        return self.length

    def resize_vectors(self, int new_size):
-        '''
+        """
        Set vectors_length to a new size, and allocate more memory for the Lexeme
        vectors if necessary. The memory will be zeroed.

        Arguments:
            new_size (int): The new size of the vectors.
-        '''
+        """
        cdef hash_t key
        cdef size_t addr
        if new_size > self.vectors_length:
@@ -193,7 +197,8 @@ cdef class Vocab:
        self.vectors_length = new_size

    def add_flag(self, flag_getter, int flag_id=-1):
-        '''Set a new boolean flag to words in the vocabulary.
+        """
+        Set a new boolean flag to words in the vocabulary.

        The flag_setter function will be called over the words currently in the
        vocab, and then applied to new words as they occur. You'll then be able
@@ -213,7 +218,7 @@ cdef class Vocab:

        Returns:
            flag_id (int): The integer ID by which the flag value can be checked.
-        '''
+        """
        if flag_id == -1:
            for bit in range(1, 64):
                if bit not in self.lex_attr_getters:
@@ -234,9 +239,11 @@ cdef class Vocab:
        return flag_id

    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
-        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
+        """
+        Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool.  If the pool
-        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
+        is the lexicon's own memory, the lexeme is saved in the lexicon.
+        """
        if string == u'':
            return &EMPTY_LEXEME
        cdef LexemeC* lex
@@ -252,9 +259,11 @@ cdef class Vocab:
            return self._new_lexeme(mem, string)

    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
-        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
+        """
+        Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool.  If the pool
-        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
+        is the lexicon's own memory, the lexeme is saved in the lexicon.
+        """
        if orth == 0:
            return &EMPTY_LEXEME
        cdef LexemeC* lex
@@ -297,30 +306,33 @@ cdef class Vocab:
        self.length += 1

    def __contains__(self, unicode string):
-        '''Check whether the string has an entry in the vocabulary.
+        """
+        Check whether the string has an entry in the vocabulary.

        Arguments:
            string (unicode): The ID string.

        Returns:
            bool Whether the string has an entry in the vocabulary.
-        '''
+        """
        key = hash_string(string)
        lex = self._by_hash.get(key)
        return lex is not NULL

    def __iter__(self):
-        '''Iterate over the lexemes in the vocabulary.
+        """
+        Iterate over the lexemes in the vocabulary.

        Yields: Lexeme An entry in the vocabulary.
-        '''
+        """
        cdef attr_t orth
        cdef size_t addr
        for orth, addr in self._by_orth.items():
            yield Lexeme(self, orth)

    def __getitem__(self,  id_or_string):
-        '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously
+        """
+        Retrieve a lexeme, given an int ID or a unicode string.  If a previously
        unseen unicode string is given, a new lexeme is created and stored.

        Arguments:
@@ -332,7 +344,7 @@ cdef class Vocab:

        Returns:
            lexeme (Lexeme): The lexeme indicated by the given ID.
-        '''
+        """
        cdef attr_t orth
        if type(id_or_string) == unicode:
            orth = self.strings[id_or_string]
@@ -355,7 +367,8 @@ cdef class Vocab:
        return tokens

    def dump(self, loc=None):
-        """Save the lexemes binary data to the given location, or
+        """
+        Save the lexemes binary data to the given location, or
        return a byte-string with the data if loc is None.

        Arguments:
@@ -392,14 +405,15 @@ cdef class Vocab:
            return fp.string_data()

    def load_lexemes(self, loc):
-        '''Load the binary vocabulary data from the given location.
+        """
+        Load the binary vocabulary data from the given location.

        Arguments:
            loc (Path): The path to load from.

        Returns:
            None
-        '''
+        """
        fp = CFile(loc, 'rb',
                on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
        cdef LexemeC* lexeme = NULL
@@ -440,8 +454,9 @@ cdef class Vocab:
        fp.close()

    def _deserialize_lexemes(self, CFile fp):
-        '''Load the binary vocabulary data from the given CFile.
-        '''
+        """
+        Load the binary vocabulary data from the given CFile.
+        """
        cdef LexemeC* lexeme = NULL
        cdef hash_t key
        cdef unicode py_str
@@ -494,13 +509,14 @@ cdef class Vocab:
        fp.close()

    def dump_vectors(self, out_loc):
-        '''Save the word vectors to a binary file.
+        """
+        Save the word vectors to a binary file.

        Arguments:
            loc (Path): The path to save to.
        Returns:
            None
-        '''
+        """
        cdef int32_t vec_len = self.vectors_length
        cdef int32_t word_len
        cdef bytes word_str
@@ -522,7 +538,8 @@ cdef class Vocab:
        out_file.close()

    def load_vectors(self, file_):
-        """Load vectors from a text-based file.
+        """
+        Load vectors from a text-based file.

        Arguments:
            file_ (buffer): The file to read from. Entries should be separated by newlines,
@@ -561,7 +578,8 @@ cdef class Vocab:
        return vec_len

    def load_vectors_from_bin_loc(self, loc):
-        """Load vectors from the location of a binary file.
+        """
+        Load vectors from the location of a binary file.

        Arguments:
            loc (unicode): The path of the binary file to load from.
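Tying the Vocab docstrings together in one sketch (the flag getter is invented):

    import spacy

    nlp = spacy.load('en')
    vocab = nlp.vocab

    print(u'coffee' in vocab)        # __contains__: check by string
    lexeme = vocab[u'coffee']        # __getitem__: creates the lexeme if unseen

    # add_flag returns the flag's ID, checkable on any lexeme
    IS_SHOUTED = vocab.add_flag(lambda text: text.isupper())
    print(vocab[u'COFFEE'].check_flag(IS_SHOUTED))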