diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 0b2920802..1ece755b8 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -111,8 +111,8 @@ def get_entities(lines, tag_pattern, ner_map=None): final entity type with `ner_map` if mapping present. Entity tag is 'O' if the pattern is not matched. - lines (unicode): CONLL-U lines for one sentences - tag_pattern (unicode): Regex pattern for entity tag + lines (str): CoNLL-U lines for one sentence + tag_pattern (str): Regex pattern for entity tag ner_map (dict): Map old NER tag names to new ones, '' maps to O. RETURNS (list): List of BILUO entity tags """ @@ -187,8 +187,8 @@ def example_from_conllu_sentence( """Create an Example from the lines for one CoNLL-U sentence, merging subtokens and appending morphology to tags if required. - lines (unicode): The non-comment lines for a CoNLL-U sentence - ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col + lines (str): The non-comment lines for a CoNLL-U sentence + ner_tag_pattern (str): The regex pattern for matching NER in MISC col RETURNS (Example): An example containing the annotation """ # create a Doc with each subtoken as its own token diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 3f84dabce..2c377a043 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -22,13 +22,13 @@ def render( """Render displaCy visualisation. docs (list or Doc): Document(s) to visualise. - style (unicode): Visualisation style, 'dep' or 'ent'. + style (str): Visualisation style, 'dep' or 'ent'. page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. jupyter (bool): Override Jupyter auto-detection. options (dict): Visualiser-specific options, e.g. colors. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. - RETURNS (unicode): Rendered HTML markup. + RETURNS (str): Rendered HTML markup. DOCS: https://spacy.io/api/top-level#displacy.render USAGE: https://spacy.io/usage/visualizers @@ -73,13 +73,13 @@ def serve( """Serve displaCy visualisation. docs (list or Doc): Document(s) to visualise. - style (unicode): Visualisation style, 'dep' or 'ent'. + style (str): Visualisation style, 'dep' or 'ent'. page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. options (dict): Visualiser-specific options, e.g. colors. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. port (int): Port to serve visualisation. - host (unicode): Host to serve visualisation. + host (str): Host to serve visualisation. DOCS: https://spacy.io/api/top-level#displacy.serve USAGE: https://spacy.io/usage/visualizers diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 0d4cdb77f..d3572ce78 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -47,7 +47,7 @@ class DependencyRenderer(object): parsed (list): Dependency parses to render. page (bool): Render parses wrapped as full HTML page. minify (bool): Minify HTML markup. - RETURNS (unicode): Rendered SVG or HTML markup. + RETURNS (str): Rendered SVG or HTML markup. """ # Create a random ID prefix to make sure parses don't receive the # same ID, even if they're identical @@ -78,7 +78,7 @@ class DependencyRenderer(object): render_id (int): Unique ID, typically index of document. words (list): Individual words and their tags. arcs (list): Individual arcs and their start, end, direction and label.
- RETURNS (unicode): Rendered SVG markup. + RETURNS (str): Rendered SVG markup. """ self.levels = self.get_levels(arcs) self.highest_level = len(self.levels) @@ -112,10 +112,10 @@ class DependencyRenderer(object): ): """Render individual word. - text (unicode): Word text. - tag (unicode): Part-of-speech tag. + text (str): Word text. + tag (str): Part-of-speech tag. i (int): Unique ID, typically word index. - RETURNS (unicode): Rendered SVG markup. + RETURNS (str): Rendered SVG markup. """ y = self.offset_y + self.word_spacing x = self.offset_x + i * self.distance @@ -131,12 +131,12 @@ class DependencyRenderer(object): def render_arrow(self, label, start, end, direction, i): """Render individual arrow. - label (unicode): Dependency label. + label (str): Dependency label. start (int): Index of start word. end (int): Index of end word. - direction (unicode): Arrow direction, 'left' or 'right'. + direction (str): Arrow direction, 'left' or 'right'. i (int): Unique ID, typically arrow index. - RETURNS (unicode): Rendered SVG markup. + RETURNS (str): Rendered SVG markup. """ if start < 0 or end < 0: error_args = dict(start=start, end=end, label=label, dir=direction) @@ -179,7 +179,7 @@ class DependencyRenderer(object): y (int): Y-coordinate of arrow start and end point. y_curve (int): Y-corrdinate of Cubic Bézier y_curve point. x_end (int): X-coordinate of arrow end point. - RETURNS (unicode): Definition of the arc path ('d' attribute). + RETURNS (str): Definition of the arc path ('d' attribute). """ template = "M{x},{y} C{x},{c} {e},{c} {e},{y}" if self.compact: @@ -189,11 +189,11 @@ class DependencyRenderer(object): def get_arrowhead(self, direction, x, y, end): """Render individual arrow head. - direction (unicode): Arrow direction, 'left' or 'right'. + direction (str): Arrow direction, 'left' or 'right'. x (int): X-coordinate of arrow start point. y (int): Y-coordinate of arrow start and end point. end (int): X-coordinate of arrow end point. - RETURNS (unicode): Definition of the arrow head path ('d' attribute). + RETURNS (str): Definition of the arrow head path ('d' attribute). """ if direction == "left": pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2) @@ -279,7 +279,7 @@ class EntityRenderer(object): parsed (list): Dependency parses to render. page (bool): Render parses wrapped as full HTML page. minify (bool): Minify HTML markup. - RETURNS (unicode): Rendered HTML markup. + RETURNS (str): Rendered HTML markup. """ rendered = [] for i, p in enumerate(parsed): @@ -300,7 +300,7 @@ class EntityRenderer(object): def render_ents(self, text, spans, title): """Render entities in text. - text (unicode): Original text. + text (str): Original text. spans (list): Individual entity spans and their start, end and label. title (unicode or None): Document title set in Doc.user_data['title']. """ diff --git a/spacy/errors.py b/spacy/errors.py index 4d38ab586..932bb1eff 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -598,7 +598,7 @@ class MatchPatternError(ValueError): def __init__(self, key, errors): """Custom error for validating match patterns. - key (unicode): The name of the matcher rule. + key (str): The name of the matcher rule. errors (dict): Validation errors (sequence of strings) mapped to pattern ID, i.e. the index of the added pattern. 
""" diff --git a/spacy/glossary.py b/spacy/glossary.py index 938a575cd..c4a6a5c45 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -1,8 +1,8 @@ def explain(term): """Get a description for a given POS tag, dependency label or entity type. - term (unicode): The term to explain. - RETURNS (unicode): The explanation, or `None` if not found in the glossary. + term (str): The term to explain. + RETURNS (str): The explanation, or `None` if not found in the glossary. EXAMPLE: >>> spacy.explain(u'NORP') diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 86a8d49b8..8d8464f3c 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -38,7 +38,7 @@ cdef class Candidate: @property def entity_(self): - """RETURNS (unicode): ID/name of this entity in the KB""" + """RETURNS (str): ID/name of this entity in the KB""" return self.kb.vocab.strings[self.entity_hash] @property @@ -48,7 +48,7 @@ cdef class Candidate: @property def alias_(self): - """RETURNS (unicode): ID of the original alias""" + """RETURNS (str): ID of the original alias""" return self.kb.vocab.strings[self.alias_hash] @property diff --git a/spacy/language.py b/spacy/language.py index 5286bd3b9..e3b770723 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -122,7 +122,7 @@ class Language(object): Defaults (class): Settings, data and factory methods for creating the `nlp` object and processing pipeline. - lang (unicode): Two-letter language ID, i.e. ISO code. + lang (str): Two-letter language ID, i.e. ISO code. DOCS: https://spacy.io/api/language """ @@ -287,7 +287,7 @@ class Language(object): def get_pipe(self, name): """Get a pipeline component for a given component name. - name (unicode): Name of pipeline component to get. + name (str): Name of pipeline component to get. RETURNS (callable): The pipeline component. DOCS: https://spacy.io/api/language#get_pipe @@ -300,7 +300,7 @@ class Language(object): def create_pipe(self, name, config=dict()): """Create a pipeline component from a factory. - name (unicode): Factory name to look up in `Language.factories`. + name (str): Factory name to look up in `Language.factories`. config (dict): Configuration parameters to initialise component. RETURNS (callable): Pipeline component. @@ -343,12 +343,12 @@ class Language(object): of before/after/first/last can be set. Default behaviour is "last". component (callable): The pipeline component. - name (unicode): Name of pipeline component. Overwrites existing + name (str): Name of pipeline component. Overwrites existing component.name attribute if available. If no name is set and the component exposes no name attribute, component.__name__ is used. An error is raised if a name already exists in the pipeline. - before (unicode): Component name to insert component directly before. - after (unicode): Component name to insert component directly after. + before (str): Component name to insert component directly before. + after (str): Component name to insert component directly after. first (bool): Insert component first / not first in the pipeline. last (bool): Insert component last / not last in the pipeline. @@ -389,7 +389,7 @@ class Language(object): """Check if a component name is present in the pipeline. Equivalent to `name in nlp.pipe_names`. - name (unicode): Name of the component. + name (str): Name of the component. RETURNS (bool): Whether a component of the name exists in the pipeline. DOCS: https://spacy.io/api/language#has_pipe @@ -399,7 +399,7 @@ class Language(object): def replace_pipe(self, name, component): """Replace a component in the pipeline. 
- name (unicode): Name of the component to replace. + name (str): Name of the component to replace. component (callable): Pipeline component. DOCS: https://spacy.io/api/language#replace_pipe @@ -418,8 +418,8 @@ class Language(object): def rename_pipe(self, old_name, new_name): """Rename a pipeline component. - old_name (unicode): Name of the component to rename. - new_name (unicode): New name of the component. + old_name (str): Name of the component to rename. + new_name (str): New name of the component. DOCS: https://spacy.io/api/language#rename_pipe """ @@ -433,7 +433,7 @@ class Language(object): def remove_pipe(self, name): """Remove a component from the pipeline. - name (unicode): Name of the component to remove. + name (str): Name of the component to remove. RETURNS (tuple): A `(name, component)` tuple of the removed component. DOCS: https://spacy.io/api/language#remove_pipe @@ -450,7 +450,7 @@ class Language(object): and can contain arbitrary whitespace. Alignment into the original string is preserved. - text (unicode): The text to be processed. + text (str): The text to be processed. disable (list): Names of the pipeline components to disable. component_cfg (dict): An optional dictionary with extra keyword arguments for specific components. @@ -1086,7 +1086,7 @@ class component(object): ): """Decorate a pipeline component. - name (unicode): Default component and factory name. + name (str): Default component and factory name. assigns (list): Attributes assigned by component, e.g. `["token.pos"]`. requires (list): Attributes required by component, e.g. `["token.dep"]`. retokenizes (bool): Whether the component changes the tokenization. diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 3ba86c169..aeedbde84 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -29,8 +29,8 @@ class Lemmatizer(object): def __call__(self, string, univ_pos, morphology=None): """Lemmatize a string. - string (unicode): The string to lemmatize, e.g. the token text. - univ_pos (unicode / int): The token's universal part-of-speech tag. + string (str): The string to lemmatize, e.g. the token text. + univ_pos (str / int): The token's universal part-of-speech tag. morphology (dict): The token's morphological features following the Universal Dependencies scheme. RETURNS (list): The available lemmas for the string. @@ -69,7 +69,7 @@ class Lemmatizer(object): Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely. - univ_pos (unicode / int): The token's universal part-of-speech tag. + univ_pos (str / int): The token's universal part-of-speech tag. morphology (dict): The token's morphological features following the Universal Dependencies scheme. """ @@ -128,10 +128,10 @@ class Lemmatizer(object): """Look up a lemma in the table, if available. If no lemma is found, the original string is returned. - string (unicode): The original string. + string (str): The original string. orth (int): Optional hash of the string to look up. If not set, the string will be used and hashed. - RETURNS (unicode): The lemma if the string was found, otherwise the + RETURNS (str): The lemma if the string was found, otherwise the original string. 
""" lookup_table = self.lookups.get_table("lemma_lookup", {}) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 20e175f03..911112d50 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -190,7 +190,7 @@ cdef class Lexeme: self.vocab.set_vector(self.c.orth, vector) property rank: - """RETURNS (unicode): Sequential ID of the lexemes's lexical type, used + """RETURNS (str): Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors.""" def __get__(self): return self.c.id @@ -209,18 +209,18 @@ cdef class Lexeme: @property def orth_(self): - """RETURNS (unicode): The original verbatim text of the lexeme + """RETURNS (str): The original verbatim text of the lexeme (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes.""" return self.vocab.strings[self.c.orth] @property def text(self): - """RETURNS (unicode): The original verbatim text of the lexeme.""" + """RETURNS (str): The original verbatim text of the lexeme.""" return self.orth_ property lower: - """RETURNS (unicode): Lowercase form of the lexeme.""" + """RETURNS (str): Lowercase form of the lexeme.""" def __get__(self): return self.c.lower @@ -293,7 +293,7 @@ cdef class Lexeme: self.c.prob = x property lower_: - """RETURNS (unicode): Lowercase form of the word.""" + """RETURNS (str): Lowercase form of the word.""" def __get__(self): return self.vocab.strings[self.c.lower] @@ -301,7 +301,7 @@ cdef class Lexeme: self.c.lower = self.vocab.strings.add(x) property norm_: - """RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the + """RETURNS (str): The lexemes's norm, i.e. a normalised form of the lexeme text. """ def __get__(self): @@ -311,7 +311,7 @@ cdef class Lexeme: self.c.norm = self.vocab.strings.add(x) property shape_: - """RETURNS (unicode): Transform of the word's string, to show + """RETURNS (str): Transform of the word's string, to show orthographic features. """ def __get__(self): @@ -321,7 +321,7 @@ cdef class Lexeme: self.c.shape = self.vocab.strings.add(x) property prefix_: - """RETURNS (unicode): Length-N substring from the start of the word. + """RETURNS (str): Length-N substring from the start of the word. Defaults to `N=1`. """ def __get__(self): @@ -331,7 +331,7 @@ cdef class Lexeme: self.c.prefix = self.vocab.strings.add(x) property suffix_: - """RETURNS (unicode): Length-N substring from the end of the word. + """RETURNS (str): Length-N substring from the end of the word. Defaults to `N=3`. """ def __get__(self): @@ -341,7 +341,7 @@ cdef class Lexeme: self.c.suffix = self.vocab.strings.add(x) property lang_: - """RETURNS (unicode): Language of the parent vocabulary.""" + """RETURNS (str): Language of the parent vocabulary.""" def __get__(self): return self.vocab.strings[self.c.lang] diff --git a/spacy/lookups.py b/spacy/lookups.py index a9d371b79..5661897e1 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -31,7 +31,7 @@ class Lookups(object): """Check if the lookups contain a table of a given name. Delegates to Lookups.has_table. - name (unicode): Name of the table. + name (str): Name of the table. RETURNS (bool): Whether a table of that name is in the lookups. """ return self.has_table(name) @@ -48,7 +48,7 @@ class Lookups(object): def add_table(self, name, data=SimpleFrozenDict()): """Add a new table to the lookups. Raises an error if the table exists. - name (unicode): Unique name of table. + name (str): Unique name of table. data (dict): Optional data to add to the table. RETURNS (Table): The newly added table. 
@@ -64,7 +64,7 @@ class Lookups(object): """Get a table. Raises an error if the table doesn't exist and no default value is provided. - name (unicode): Name of the table. + name (str): Name of the table. default: Optional default value to return if table doesn't exist. RETURNS (Table): The table. @@ -79,7 +79,7 @@ class Lookups(object): def remove_table(self, name): """Remove a table. Raises an error if the table doesn't exist. - name (unicode): Name of the table to remove. + name (str): Name of the table to remove. RETURNS (Table): The removed table. DOCS: https://spacy.io/api/lookups#remove_table @@ -91,7 +91,7 @@ class Lookups(object): def has_table(self, name): """Check if the lookups contain a table of a given name. - name (unicode): Name of the table. + name (str): Name of the table. RETURNS (bool): Whether a table of that name exists. DOCS: https://spacy.io/api/lookups#has_table @@ -125,7 +125,7 @@ class Lookups(object): """Save the lookups to a directory as lookups.bin. Expects a path to a directory, which will be created if it doesn't exist. - path (unicode / Path): The file path. + path (str / Path): The file path. DOCS: https://spacy.io/api/lookups#to_disk """ @@ -141,7 +141,7 @@ class Lookups(object): """Load lookups from a directory containing a lookups.bin. Will skip loading if the file doesn't exist. - path (unicode / Path): The directory path. + path (str / Path): The directory path. RETURNS (Lookups): The loaded lookups. DOCS: https://spacy.io/api/lookups#from_disk @@ -167,7 +167,7 @@ class Table(OrderedDict): """Initialize a new table from a dict. data (dict): The dictionary. - name (unicode): Optional table name for reference. + name (str): Optional table name for reference. RETURNS (Table): The newly created object. DOCS: https://spacy.io/api/lookups#table.from_dict @@ -179,7 +179,7 @@ class Table(OrderedDict): def __init__(self, name=None, data=None): """Initialize a new table. - name (unicode): Optional table name for reference. + name (str): Optional table name for reference. data (dict): Initial data, used to hint Bloom Filter. RETURNS (Table): The newly created object. @@ -197,7 +197,7 @@ class Table(OrderedDict): def __setitem__(self, key, value): """Set new key/value pair. String keys will be hashed. - key (unicode / int): The key to set. + key (str / int): The key to set. value: The value to set. """ key = get_string_id(key) @@ -208,7 +208,7 @@ class Table(OrderedDict): """Set new key/value pair. String keys will be hashed. Same as table[key] = value. - key (unicode / int): The key to set. + key (str / int): The key to set. value: The value to set. """ self[key] = value @@ -216,7 +216,7 @@ class Table(OrderedDict): def __getitem__(self, key): """Get the value for a given key. String keys will be hashed. - key (unicode / int): The key to get. + key (str / int): The key to get. RETURNS: The value. """ key = get_string_id(key) @@ -225,7 +225,7 @@ class Table(OrderedDict): def get(self, key, default=None): """Get the value for a given key. String keys will be hashed. - key (unicode / int): The key to get. + key (str / int): The key to get. default: The default value to return. RETURNS: The value. """ @@ -235,7 +235,7 @@ class Table(OrderedDict): def __contains__(self, key): """Check whether a key is in the table. String keys will be hashed. - key (unicode / int): The key to check. + key (str / int): The key to check. RETURNS (bool): Whether the key is in the table. 
""" key = get_string_id(key) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index ff707a71c..732931380 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -66,7 +66,7 @@ cdef class DependencyMatcher: def __contains__(self, key): """Check whether the matcher contains rules for a match ID. - key (unicode): The match ID. + key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ return self._normalize_key(key) in self._patterns diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 2bcb82a2a..225eba9a9 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -63,7 +63,7 @@ cdef class Matcher: def __contains__(self, key): """Check whether the matcher contains rules for a match ID. - key (unicode): The match ID. + key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ return self._normalize_key(key) in self._patterns @@ -97,7 +97,7 @@ cdef class Matcher: number of arguments). The on_match callback becomes an optional keyword argument. - key (unicode): The match ID. + key (str): The match ID. patterns (list): The patterns to add for the given key. on_match (callable): Optional callback executed on match. *_patterns (list): For backwards compatibility: list of patterns to add @@ -138,7 +138,7 @@ cdef class Matcher: """Remove a rule from the matcher. A KeyError is raised if the key does not exist. - key (unicode): The ID of the match rule. + key (str): The ID of the match rule. """ norm_key = self._normalize_key(key) if not norm_key in self._patterns: diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 14cc39787..f7ce44ece 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -70,7 +70,7 @@ cdef class PhraseMatcher: def __contains__(self, key): """Check whether the matcher contains rules for a match ID. - key (unicode): The match ID. + key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. DOCS: https://spacy.io/api/phrasematcher#contains @@ -85,7 +85,7 @@ cdef class PhraseMatcher: """Remove a rule from the matcher by match ID. A KeyError is raised if the key does not exist. - key (unicode): The match ID. + key (str): The match ID. DOCS: https://spacy.io/api/phrasematcher#remove """ @@ -159,7 +159,7 @@ cdef class PhraseMatcher: number of arguments). The on_match callback becomes an optional keyword argument. - key (unicode): The match ID. + key (str): The match ID. docs (list): List of `Doc` objects representing match patterns. on_match (callable): Callback executed on match. *_docs (Doc): For backwards compatibility: list of patterns to add diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 0b53b124c..5dcf81ea7 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -198,8 +198,8 @@ cdef class Morphology: """Add a special-case rule to the morphological analyser. Tokens whose tag and orth match the rule will receive the specified properties. - tag (unicode): The part-of-speech tag to key the exception. - orth (unicode): The word-form to key the exception. + tag (str): The part-of-speech tag to key the exception. + orth (str): The word-form to key the exception. 
""" attrs = dict(attrs) attrs = _normalize_props(attrs) diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py index 4c0950453..971ebe518 100644 --- a/spacy/pipe_analysis.py +++ b/spacy/pipe_analysis.py @@ -11,7 +11,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True): fulfilled (e.g. if previous components assign the attributes). pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. - name (unicode): The name of the pipeline component to analyze. + name (str): The name of the pipeline component to analyze. pipe (callable): The pipeline component function to analyze. index (int): The index of the component in the pipeline. warn (bool): Show user warning if problem is found. @@ -125,7 +125,7 @@ def get_assigns_for_attr(pipeline, attr): """Get all pipeline components that assign an attr, e.g. "doc.tensor". pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. - attr (unicode): The attribute to check. + attr (str): The attribute to check. RETURNS (list): (name, pipeline) tuples of components that assign the attr. """ return _get_feature_for_attr(pipeline, attr, "assigns") @@ -135,7 +135,7 @@ def get_requires_for_attr(pipeline, attr): """Get all pipeline components that require an attr, e.g. "doc.tensor". pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. - attr (unicode): The attribute to check. + attr (str): The attribute to check. RETURNS (list): (name, pipeline) tuples of components that require the attr. """ return _get_feature_for_attr(pipeline, attr, "requires") diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 58160c2e9..cdacc82f6 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -315,7 +315,7 @@ class EntityRuler(object): """Load the entity ruler from a file. Expects a file containing newline-delimited JSON (JSONL) with one entry per line. - path (unicode / Path): The JSONL file to load. + path (str / Path): The JSONL file to load. **kwargs: Other config paramters, mostly for consistency. RETURNS (EntityRuler): The loaded entity ruler. @@ -351,7 +351,7 @@ class EntityRuler(object): """Save the entity ruler patterns to a directory. The patterns will be saved as newline-delimited JSON (JSONL). - path (unicode / Path): The JSONL file to save. + path (str / Path): The JSONL file to save. **kwargs: Other config paramters, mostly for consistency. DOCS: https://spacy.io/api/entityruler#to_disk diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 6e9d4197c..622791512 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -50,7 +50,7 @@ def merge_subtokens(doc, label="subtok"): """Merge subtokens into a single token. doc (Doc): The Doc object. - label (unicode): The subtoken dependency label. + label (str): The subtoken dependency label. RETURNS (Doc): The Doc object with merged subtokens. DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens diff --git a/spacy/strings.pyx b/spacy/strings.pyx index a30f11729..9fe5af154 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -152,7 +152,7 @@ cdef class StringStore: def add(self, string): """Add a string to the StringStore. - string (unicode): The string to add. + string (str): The string to add. RETURNS (uint64): The string's hash value. """ if isinstance(string, unicode): @@ -179,7 +179,7 @@ cdef class StringStore: def __contains__(self, string not None): """Check whether a string is in the store. - string (unicode): The string to check. 
+ string (str): The string to check. RETURNS (bool): Whether the store contains the string. """ cdef hash_t key @@ -205,7 +205,7 @@ cdef class StringStore: def __iter__(self): """Iterate over the strings in the store, in order. - YIELDS (unicode): A string in the store. + YIELDS (str): A string in the store. """ cdef int i cdef hash_t key diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 7e75052f7..b628b1171 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -134,7 +134,7 @@ cdef class Tokenizer: def __call__(self, unicode string): """Tokenize a string. - string (unicode): The string to tokenize. + string (str): The string to tokenize. RETURNS (Doc): A container for linguistic annotations. DOCS: https://spacy.io/api/tokenizer#call @@ -147,7 +147,7 @@ cdef class Tokenizer: cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases): """Tokenize according to affix and token_match settings. - string (unicode): The string to tokenize. + string (str): The string to tokenize. RETURNS (Doc): A container for linguistic annotations. """ if len(string) >= (2 ** 30): @@ -527,7 +527,7 @@ cdef class Tokenizer: def find_infix(self, unicode string): """Find internal split points of the string, such as hyphens. - string (unicode): The string to segment. + string (str): The string to segment. RETURNS (list): A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. @@ -542,7 +542,7 @@ cdef class Tokenizer: """Find the length of a prefix that should be segmented from the string, or None if no prefix rules match. - string (unicode): The string to segment. + string (str): The string to segment. RETURNS (int): The length of the prefix if present, otherwise `None`. DOCS: https://spacy.io/api/tokenizer#find_prefix @@ -556,7 +556,7 @@ cdef class Tokenizer: """Find the length of a suffix that should be segmented from the string, or None if no suffix rules match. - string (unicode): The string to segment. + string (str): The string to segment. Returns (int): The length of the suffix if present, otherwise `None`. DOCS: https://spacy.io/api/tokenizer#find_suffix @@ -576,7 +576,7 @@ cdef class Tokenizer: def _validate_special_case(self, chunk, substrings): """Check whether the `ORTH` fields match the string. - string (unicode): The string to specially tokenize. + string (str): The string to specially tokenize. substrings (iterable): A sequence of dicts, where each dict describes a token and its attributes. """ @@ -588,7 +588,7 @@ cdef class Tokenizer: def add_special_case(self, unicode string, substrings): """Add a special-case tokenization rule. - string (unicode): The string to specially tokenize. + string (str): The string to specially tokenize. substrings (iterable): A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. @@ -629,7 +629,7 @@ cdef class Tokenizer: produced are identical to `nlp.tokenizer()` except for whitespace tokens. - string (unicode): The string to tokenize. + string (str): The string to tokenize. 
RETURNS (list): A list of (pattern_string, token_string) tuples DOCS: https://spacy.io/api/tokenizer#explain diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0716b2b3d..f6d0dbf4a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -107,7 +107,7 @@ cdef class Doc: def set_extension(cls, name, **kwargs): """Define a custom attribute which becomes available as `Doc._`. - name (unicode): Name of the attribute to set. + name (str): Name of the attribute to set. default: Optional default value of the attribute. getter (callable): Optional getter function. setter (callable): Optional setter function. @@ -125,7 +125,7 @@ cdef class Doc: def get_extension(cls, name): """Look up a previously registered extension by name. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. DOCS: https://spacy.io/api/doc#get_extension @@ -136,7 +136,7 @@ cdef class Doc: def has_extension(cls, name): """Check whether an extension has been registered. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. DOCS: https://spacy.io/api/doc#has_extension @@ -147,7 +147,7 @@ cdef class Doc: def remove_extension(cls, name): """Remove a previously registered extension. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. @@ -473,7 +473,7 @@ cdef class Doc: def text(self): """A unicode representation of the document text. - RETURNS (unicode): The original verbatim text of the document. + RETURNS (str): The original verbatim text of the document. """ return "".join(t.text_with_ws for t in self) @@ -482,7 +482,7 @@ cdef class Doc: """An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. - RETURNS (unicode): The original verbatim text of the document. + RETURNS (str): The original verbatim text of the document. """ return self.text @@ -628,7 +628,7 @@ cdef class Doc: @property def lang_(self): - """RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'.""" + """RETURNS (str): Language of the doc's vocabulary, e.g. 'en'.""" return self.vocab.lang cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 66e8d8c3e..59323c393 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -33,7 +33,7 @@ cdef class Span: def set_extension(cls, name, **kwargs): """Define a custom attribute which becomes available as `Span._`. - name (unicode): Name of the attribute to set. + name (str): Name of the attribute to set. default: Optional default value of the attribute. getter (callable): Optional getter function. setter (callable): Optional setter function. @@ -51,7 +51,7 @@ cdef class Span: def get_extension(cls, name): """Look up a previously registered extension by name. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. DOCS: https://spacy.io/api/span#get_extension @@ -62,7 +62,7 @@ cdef class Span: def has_extension(cls, name): """Check whether an extension has been registered. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. 
DOCS: https://spacy.io/api/span#has_extension @@ -73,7 +73,7 @@ cdef class Span: def remove_extension(cls, name): """Remove a previously registered extension. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. @@ -501,7 +501,7 @@ cdef class Span: @property def text(self): - """RETURNS (unicode): The original verbatim text of the span.""" + """RETURNS (str): The original verbatim text of the span.""" text = self.text_with_ws if self[-1].whitespace_: text = text[:-1] @@ -512,7 +512,7 @@ cdef class Span: """The text content of the span with a trailing whitespace character if the last token has one. - RETURNS (unicode): The text content of the span (with trailing + RETURNS (str): The text content of the span (with trailing whitespace). """ return "".join([t.text_with_ws for t in self]) @@ -688,7 +688,7 @@ cdef class Span: raise NotImplementedError(TempErrors.T007.format(attr="ent_id")) property ent_id_: - """RETURNS (unicode): The (string) entity ID.""" + """RETURNS (str): The (string) entity ID.""" def __get__(self): return self.root.ent_id_ @@ -700,12 +700,12 @@ cdef class Span: """Verbatim text content (identical to `Span.text`). Exists mostly for consistency with other attributes. - RETURNS (unicode): The span's text.""" + RETURNS (str): The span's text.""" return self.text @property def lemma_(self): - """RETURNS (unicode): The span's lemma.""" + """RETURNS (str): The span's lemma.""" return " ".join([t.lemma_ for t in self]).strip() @property @@ -724,7 +724,7 @@ cdef class Span: return "".join([t.text_with_ws for t in self]) property label_: - """RETURNS (unicode): The span's label.""" + """RETURNS (str): The span's label.""" def __get__(self): return self.doc.vocab.strings[self.label] @@ -734,7 +734,7 @@ cdef class Span: raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_)) property kb_id_: - """RETURNS (unicode): The named entity's KB ID.""" + """RETURNS (str): The named entity's KB ID.""" def __get__(self): return self.doc.vocab.strings[self.kb_id] diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 2486ed991..0d1e82322 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -36,7 +36,7 @@ cdef class Token: def set_extension(cls, name, **kwargs): """Define a custom attribute which becomes available as `Token._`. - name (unicode): Name of the attribute to set. + name (str): Name of the attribute to set. default: Optional default value of the attribute. getter (callable): Optional getter function. setter (callable): Optional setter function. @@ -54,7 +54,7 @@ cdef class Token: def get_extension(cls, name): """Look up a previously registered extension by name. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. DOCS: https://spacy.io/api/token#get_extension @@ -65,7 +65,7 @@ cdef class Token: def has_extension(cls, name): """Check whether an extension has been registered. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. DOCS: https://spacy.io/api/token#has_extension @@ -76,7 +76,7 @@ cdef class Token: def remove_extension(cls, name): """Remove a previously registered extension. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. 
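The extension docstrings above follow the same pattern across `Doc`, `Span` and `Token`. A minimal sketch of how they compose (the extension names `source` and `is_long` are hypothetical, purely for illustration):

```python
import spacy
from spacy.tokens import Doc, Token

Doc.set_extension("source", default=None)  # default-based extension
Token.set_extension("is_long", getter=lambda token: len(token.text) > 8)

nlp = spacy.blank("en")
doc = nlp("Tokenization is deterministic here")
doc._.source = "example"
assert Doc.has_extension("source")
assert doc[0]._.is_long  # "Tokenization" is longer than 8 characters
default, method, getter, setter = Token.get_extension("is_long")
Doc.remove_extension("source")  # returns the removed (default, method, getter, setter) tuple
```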
@@ -244,12 +244,12 @@ cdef class Token: @property def text(self): - """RETURNS (unicode): The original verbatim text of the token.""" + """RETURNS (str): The original verbatim text of the token.""" return self.orth_ @property def text_with_ws(self): - """RETURNS (unicode): The text content of the span (with trailing + """RETURNS (str): The text content of the token (with trailing whitespace). """ cdef unicode orth = self.vocab.strings[self.c.lex.orth] @@ -740,7 +740,7 @@ cdef class Token: self.c.ent_type = ent_type property ent_type_: - """RETURNS (unicode): Named entity type.""" + """RETURNS (str): Named entity type.""" def __get__(self): return self.vocab.strings[self.c.ent_type] @@ -763,7 +763,7 @@ cdef class Token: and "" means no entity tag is set. "B" with an empty ent_type means that the token is blocked from further processing by NER. - RETURNS (unicode): IOB code of named entity tag. + RETURNS (str): IOB code of named entity tag. """ iob_strings = ("", "I", "O", "B") return iob_strings[self.c.ent_iob] @@ -779,7 +779,7 @@ cdef class Token: self.c.ent_id = key property ent_id_: - """RETURNS (unicode): ID of the entity the token is an instance of, + """RETURNS (str): ID of the entity the token is an instance of, if any. """ def __get__(self): @@ -797,7 +797,7 @@ cdef class Token: self.c.ent_kb_id = ent_kb_id property ent_kb_id_: - """RETURNS (unicode): Named entity KB ID.""" + """RETURNS (str): Named entity KB ID.""" def __get__(self): return self.vocab.strings[self.c.ent_kb_id] @@ -806,12 +806,12 @@ cdef class Token: @property def whitespace_(self): - """RETURNS (unicode): The trailing whitespace character, if present.""" + """RETURNS (str): The trailing whitespace character, if present.""" return " " if self.c.spacy else "" @property def orth_(self): - """RETURNS (unicode): Verbatim text content (identical to + """RETURNS (str): Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. """ @@ -819,13 +819,13 @@ cdef class Token: @property def lower_(self): - """RETURNS (unicode): The lowercase token text. Equivalent to + """RETURNS (str): The lowercase token text. Equivalent to `Token.text.lower()`. """ return self.vocab.strings[self.c.lex.lower] property norm_: - """RETURNS (unicode): The token's norm, i.e. a normalised form of the + """RETURNS (str): The token's norm, i.e. a normalised form of the token text. Usually set in the language's tokenizer exceptions or norm exceptions. """ @@ -837,34 +837,34 @@ cdef class Token: @property def shape_(self): - """RETURNS (unicode): Transform of the tokens's string, to show + """RETURNS (str): Transform of the token's string, to show orthographic features. For example, "Xxxx" or "dd". """ return self.vocab.strings[self.c.lex.shape] @property def prefix_(self): - """RETURNS (unicode): A length-N substring from the start of the token. + """RETURNS (str): A length-N substring from the start of the token. Defaults to `N=1`. """ return self.vocab.strings[self.c.lex.prefix] @property def suffix_(self): - """RETURNS (unicode): A length-N substring from the end of the token. + """RETURNS (str): A length-N substring from the end of the token. Defaults to `N=3`. """ return self.vocab.strings[self.c.lex.suffix] @property def lang_(self): - """RETURNS (unicode): Language of the parent document's vocabulary, + """RETURNS (str): Language of the parent document's vocabulary, e.g. 'en'. """ return self.vocab.strings[self.c.lex.lang] property lemma_: - """RETURNS (unicode): The token lemma, i.e.
the base form of the word, + """RETURNS (str): The token lemma, i.e. the base form of the word, with no inflectional suffixes. """ def __get__(self): @@ -877,7 +877,7 @@ cdef class Token: self.c.lemma = self.vocab.strings.add(lemma_) property pos_: - """RETURNS (unicode): Coarse-grained part-of-speech tag.""" + """RETURNS (str): Coarse-grained part-of-speech tag.""" def __get__(self): return parts_of_speech.NAMES[self.c.pos] @@ -885,7 +885,7 @@ cdef class Token: self.c.pos = parts_of_speech.IDS[pos_name] property tag_: - """RETURNS (unicode): Fine-grained part-of-speech tag.""" + """RETURNS (str): Fine-grained part-of-speech tag.""" def __get__(self): return self.vocab.strings[self.c.tag] @@ -893,7 +893,7 @@ cdef class Token: self.tag = self.vocab.strings.add(tag) property dep_: - """RETURNS (unicode): The syntactic dependency label.""" + """RETURNS (str): The syntactic dependency label.""" def __get__(self): return self.vocab.strings[self.c.dep] diff --git a/spacy/util.py b/spacy/util.py index 41af881c9..fc5837755 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -58,7 +58,7 @@ def lang_class_is_loaded(lang): loaded lazily, to avoid expensive setup code associated with the language data. - lang (unicode): Two-letter language code, e.g. 'en'. + lang (str): Two-letter language code, e.g. 'en'. RETURNS (bool): Whether a Language class has been loaded. """ return lang in registry.languages @@ -67,7 +67,7 @@ def lang_class_is_loaded(lang): def get_lang_class(lang): """Import and load a Language class. - lang (unicode): Two-letter language code, e.g. 'en'. + lang (str): Two-letter language code, e.g. 'en'. RETURNS (Language): Language class. """ # Check if language is registered / entry point is available @@ -85,7 +85,7 @@ def get_lang_class(lang): def set_lang_class(name, cls): """Set a custom Language class name that can be loaded via get_lang_class. - name (unicode): Name of Language class. + name (str): Name of Language class. cls (Language): Language class. """ registry.languages.register(name, func=cls) @@ -107,7 +107,7 @@ def load_language_data(path): """Load JSON language data using the given path as a base. If the provided path isn't present, will attempt to load a gzipped version before giving up. - path (unicode / Path): The data to load. + path (str / Path): The data to load. RETURNS: The loaded data. """ path = ensure_path(path) @@ -128,7 +128,7 @@ def get_module_path(module): def load_model(name, **overrides): """Load a model from a package or data path. - name (unicode): Package name or model path. + name (str): Package name or model path. **overrides: Specific overrides, like pipeline components to disable. RETURNS (Language): `Language` class with the loaded model. """ @@ -202,7 +202,7 @@ def load_model_from_init_py(init_file, **overrides): """Helper function to use in the `load()` method of a model package's __init__.py. - init_file (unicode): Path to model's __init__.py, i.e. `__file__`. + init_file (str): Path to model's __init__.py, i.e. `__file__`. **overrides: Specific overrides, like pipeline components to disable. RETURNS (Language): `Language` class with loaded model. """ @@ -227,8 +227,8 @@ def get_package_version(name): """Get the version of an installed package. Typically used to get model package versions. - name (unicode): The name of the installed Python package. - RETURNS (unicode / None): The version or None if package not installed. + name (str): The name of the installed Python package. + RETURNS (str / None): The version or None if package not installed. 
""" try: return importlib_metadata.version(name) @@ -338,7 +338,7 @@ def get_model_config(path): def is_package(name): """Check if string maps to a package installed via pip. - name (unicode): Name of package. + name (str): Name of package. RETURNS (bool): True if installed package, False if not. """ try: @@ -351,7 +351,7 @@ def is_package(name): def get_package_path(name): """Get the path to an installed package. - name (unicode): Package name. + name (str): Package name. RETURNS (Path): Path to installed package. """ name = name.lower() # use lowercase version to be safe @@ -526,8 +526,8 @@ def expand_exc(excs, search, replace): For example, to add additional versions with typographic apostrophes. excs (dict): Tokenizer exceptions. - search (unicode): String to find and replace. - replace (unicode): Replacement. + search (str): String to find and replace. + replace (str): Replacement. RETURNS (dict): Combined tokenizer exceptions. """ @@ -761,8 +761,8 @@ def from_disk(path, readers, exclude): def import_file(name, loc): """Import module from a file. Used to load models from a directory. - name (unicode): Name of module to load. - loc (unicode / Path): Path to the file. + name (str): Name of module to load. + loc (str / Path): Path to the file. RETURNS: The loaded module. """ loc = str(loc) @@ -777,8 +777,8 @@ def minify_html(html): Disclaimer: NOT a general-purpose solution, only removes indentation and newlines. - html (unicode): Markup to minify. - RETURNS (unicode): "Minified" HTML. + html (str): Markup to minify. + RETURNS (str): "Minified" HTML. """ return html.strip().replace(" ", "").replace("\n", "") @@ -787,8 +787,8 @@ def escape_html(text): """Replace <, >, &, " with their HTML encoded representation. Intended to prevent HTML errors in rendered displaCy markup. - text (unicode): The original text. - RETURNS (unicode): Equivalent text to be safely used within HTML. + text (str): The original text. + RETURNS (str): Equivalent text to be safely used within HTML. """ text = text.replace("&", "&") text = text.replace("<", "<") diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index e100ae915..0ed2462c6 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -57,7 +57,7 @@ cdef class Vectors: shape (tuple): Size of the table, as (# entries, # columns) data (numpy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. - name (unicode): A name to identify the vectors table. + name (str): A name to identify the vectors table. RETURNS (Vectors): The newly created object. DOCS: https://spacy.io/api/vectors#init @@ -237,7 +237,7 @@ cdef class Vectors: def find(self, *, key=None, keys=None, row=None, rows=None): """Look up one or more keys by row, or vice versa. - key (unicode / int): Find the row that the given key points to. + key (str / int): Find the row that the given key points to. Returns int, -1 if missing. keys (iterable): Find rows that the keys point to. Returns ndarray. @@ -352,7 +352,7 @@ cdef class Vectors: def to_disk(self, path, **kwargs): """Save the current state to a directory. - path (unicode / Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exists. DOCS: https://spacy.io/api/vectors#to_disk @@ -372,7 +372,7 @@ cdef class Vectors: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode / Path): Directory path, string or Path-like object. + path (str / Path): Directory path, string or Path-like object. 
RETURNS (Vectors): The modified object. DOCS: https://spacy.io/api/vectors#from_disk diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index a1929559f..ed37f6e98 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -41,7 +41,7 @@ cdef class Vocab: strings (StringStore): StringStore that maps strings to integers, and vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. - name (unicode): Optional name to identify the vectors table. + name (str): Optional name to identify the vectors table. RETURNS (Vocab): The newly constructed object. """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} @@ -97,7 +97,7 @@ cdef class Vocab: See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`, `Token.check_flag`. - flag_getter (callable): A function `f(unicode) -> bool`, to get the + flag_getter (callable): A function `f(str) -> bool`, to get the flag value. flag_id (int): An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If -1, the lowest @@ -187,7 +187,7 @@ cdef class Vocab: def __contains__(self, key): """Check whether the string or int key has an entry in the vocabulary. - string (unicode): The ID string. + string (str): The ID string. RETURNS (bool) Whether the string has an entry in the vocabulary. DOCS: https://spacy.io/api/vocab#contains diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index feb167a9d..39148e476 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -125,21 +125,21 @@ The L2 norm of the lexeme's vector representation. | Name | Type | Description | | -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `vocab` | `Vocab` | The lexeme's vocabulary. | -| `text` | unicode | Verbatim text content. | +| `text` | str | Verbatim text content. | | `orth` | int | ID of the verbatim text content. | -| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | +| `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | | `rank` | int | Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors. | | `flags` | int | Container of the lexeme's binary flags. | | `norm` | int | The lexemes's norm, i.e. a normalized form of the lexeme text. | -| `norm_` | unicode | The lexemes's norm, i.e. a normalized form of the lexeme text. | +| `norm_` | str | The lexeme's norm, i.e. a normalized form of the lexeme text. | | `lower` | int | Lowercase form of the word. | -| `lower_` | unicode | Lowercase form of the word. | +| `lower_` | str | Lowercase form of the word. | | `shape` | int | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`.
+| `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. | | `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. | -| `prefix_` | unicode | Length-N substring from the start of the word. Defaults to `N=1`. | +| `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. | | `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. | -| `suffix_` | unicode | Length-N substring from the start of the word. Defaults to `N=3`. | +| `suffix_` | str | Length-N substring from the end of the word. Defaults to `N=3`. | | `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. | | `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. | | `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. | @@ -159,7 +159,7 @@ The L2 norm of the lexeme's vector representation. | `is_oov` | bool | Is the lexeme out-of-vocabulary? | | `is_stop` | bool | Is the lexeme part of a "stop list"? | | `lang` | int | Language of the parent vocabulary. | -| `lang_` | unicode | Language of the parent vocabulary. | +| `lang_` | str | Language of the parent vocabulary. | | `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). | | `cluster` | int | Brown cluster ID. | | `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index e024ab54a..b851f6882 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -27,7 +27,7 @@ Create the vocabulary. | `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. | | `lemmatizer` | object | A lemmatizer. Defaults to `None`. | | `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. | -| `vectors_name` 2.2 | unicode | A name to identify the vectors table. | +| `vectors_name` 2.2 | str | A name to identify the vectors table. | | **RETURNS** | `Vocab` | The newly constructed object. | ## Vocab.\_\_len\_\_ {#len tag="method"} @@ -91,10 +91,10 @@ given string, you need to look it up in > assert oov not in nlp.vocab > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------- | -| `string` | unicode | The ID string. | -| **RETURNS** | bool | Whether the string has an entry in the vocabulary. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------- | +| `string` | str | The ID string. | +| **RETURNS** | bool | Whether the string has an entry in the vocabulary. | ## Vocab.add_flag {#add_flag tag="method"} @@ -117,7 +117,7 @@ using `token.check_flag(flag_id)`. | Name | Type | Description | | ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `flag_getter` | dict | A function `f(unicode) -> bool`, to get the flag value.
+| `flag_getter` | callable | A function `f(str) -> bool`, to get the flag value. | | `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. | | **RETURNS** | int | The integer ID by which the flag value can be checked. | @@ -227,10 +227,10 @@ Save the current state to a directory. > nlp.vocab.to_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Vocab.from_disk {#from_disk tag="method" new="2"} Loads state from a directory. Modifies the object in place and returns it. > vocab = Vocab().from_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Vocab` | The modified `Vocab` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Vocab` | The modified `Vocab` object. | ## Vocab.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 5f47bd2e3..a84399312 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -157,19 +157,19 @@ The available token pattern keys correspond to a number of [`Token` attributes](/api/token#attributes). The supported attributes for rule-based matching are: -| Attribute | Type |  Description | -| -------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------ | -| `ORTH` | unicode | The exact verbatim text of a token. | -| `TEXT` 2.1 | unicode | The exact verbatim text of a token. | -| `LOWER` | unicode | The lowercase form of the token text. | -|  `LENGTH` | int | The length of the token text. | -|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | -|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | -|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word.
| -|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | -|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | -| `ENT_TYPE` | unicode | The token's entity label. | -| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | +| Attribute | Type |  Description | +| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ | +| `ORTH` | str | The exact verbatim text of a token. | +| `TEXT` 2.1 | str | The exact verbatim text of a token. | +| `LOWER` | str | The lowercase form of the token text. | +|  `LENGTH` | int | The length of the token text. | +|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | +|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | +|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | +|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | +|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | +| `ENT_TYPE` | str | The token's entity label. | +| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | @@ -1101,21 +1101,28 @@ powerful model packages with binary weights _and_ rules included! ### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"} -When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**, -the EntityRuler calls the nlp object to construct a doc object. This happens in case you try -to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to -extract matches based on the pattern's POS signature. +When using a large amount of **phrase patterns** (roughly > 10000) it's useful +to understand how the `add_patterns` function of the EntityRuler works. For each +**phrase pattern**, the EntityRuler calls the nlp object to construct a doc +object. This happens in case you try to add the EntityRuler at the end of an +existing pipeline with, for example, a POS tagger and want to extract matches +based on the pattern's POS signature. -In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler. +In this case you would pass a config value of `phrase_matcher_attr="POS"` for +the EntityRuler. -Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns. +Running the full language pipeline across every pattern in a large list scales +linearly and can therefore take a long time on large amounts of phrase patterns. -As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively. +As of spaCy 2.2.4 the `add_patterns` function has been refactored to use +nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with +5,000-100,000 phrase patterns respectively. 
-Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time. +Even with this speedup (but especially if you're using an older version) the +`add_patterns` function can still take a long time. -An easy workaround to make this function run faster is disabling the other language pipes -while adding the phrase patterns. +An easy workaround to make this function run faster is disabling the other +language pipes while adding the phrase patterns. ```python entityruler = EntityRuler(nlp)
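# A sketch of the workaround described above; the pattern list and the kept
# pipe name are illustrative. The idea is to leave only the components the
# patterns depend on (e.g. the tagger when using phrase_matcher_attr="POS")
# enabled while the patterns are added.
patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)]

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "tagger"]
with nlp.disable_pipes(*other_pipes):
    entityruler.add_patterns(patterns)
```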