unicode -> str consistency

Ines Montani 2020-05-24 17:20:58 +02:00
parent cf156ed2f4
commit 5d3806e059
28 changed files with 212 additions and 205 deletions

View File

@ -111,8 +111,8 @@ def get_entities(lines, tag_pattern, ner_map=None):
final entity type with `ner_map` if mapping present. Entity tag is 'O' if
the pattern is not matched.
lines (unicode): CONLL-U lines for one sentences
tag_pattern (unicode): Regex pattern for entity tag
lines (str): CoNLL-U lines for one sentence
tag_pattern (str): Regex pattern for entity tag
ner_map (dict): Map old NER tag names to new ones, '' maps to O.
RETURNS (list): List of BILUO entity tags
"""
@ -187,8 +187,8 @@ def example_from_conllu_sentence(
"""Create an Example from the lines for one CoNLL-U sentence, merging
subtokens and appending morphology to tags if required.
lines (unicode): The non-comment lines for a CoNLL-U sentence
ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
lines (str): The non-comment lines for a CoNLL-U sentence
ner_tag_pattern (str): The regex pattern for matching NER in MISC col
RETURNS (Example): An example containing the annotation
"""
# create a Doc with each subtoken as its own token

View File

@ -22,13 +22,13 @@ def render(
"""Render displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
style (unicode): Visualisation style, 'dep' or 'ent'.
style (str): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
RETURNS (unicode): Rendered HTML markup.
RETURNS (str): Rendered HTML markup.
DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers
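For orientation, a minimal sketch of calling `render` as documented above (the model name is an assumption; any loaded pipeline works):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # assumed installed model
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
html = displacy.render(doc, style="dep", page=True, minify=True)  # RETURNS (str)
```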
@ -73,13 +73,13 @@ def serve(
"""Serve displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
style (unicode): Visualisation style, 'dep' or 'ent'.
style (str): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
port (int): Port to serve visualisation.
host (unicode): Host to serve visualisation.
host (str): Host to serve visualisation.
DOCS: https://spacy.io/api/top-level#displacy.serve
USAGE: https://spacy.io/usage/visualizers

View File

@ -47,7 +47,7 @@ class DependencyRenderer(object):
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (unicode): Rendered SVG or HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
"""
# Create a random ID prefix to make sure parses don't receive the
# same ID, even if they're identical
@ -78,7 +78,7 @@ class DependencyRenderer(object):
render_id (int): Unique ID, typically index of document.
words (list): Individual words and their tags.
arcs (list): Individual arcs and their start, end, direction and label.
RETURNS (unicode): Rendered SVG markup.
RETURNS (str): Rendered SVG markup.
"""
self.levels = self.get_levels(arcs)
self.highest_level = len(self.levels)
@ -112,10 +112,10 @@ class DependencyRenderer(object):
):
"""Render individual word.
text (unicode): Word text.
tag (unicode): Part-of-speech tag.
text (str): Word text.
tag (str): Part-of-speech tag.
i (int): Unique ID, typically word index.
RETURNS (unicode): Rendered SVG markup.
RETURNS (str): Rendered SVG markup.
"""
y = self.offset_y + self.word_spacing
x = self.offset_x + i * self.distance
@ -131,12 +131,12 @@ class DependencyRenderer(object):
def render_arrow(self, label, start, end, direction, i):
"""Render individual arrow.
label (unicode): Dependency label.
label (str): Dependency label.
start (int): Index of start word.
end (int): Index of end word.
direction (unicode): Arrow direction, 'left' or 'right'.
direction (str): Arrow direction, 'left' or 'right'.
i (int): Unique ID, typically arrow index.
RETURNS (unicode): Rendered SVG markup.
RETURNS (str): Rendered SVG markup.
"""
if start < 0 or end < 0:
error_args = dict(start=start, end=end, label=label, dir=direction)
@ -179,7 +179,7 @@ class DependencyRenderer(object):
y (int): Y-coordinate of arrow start and end point.
y_curve (int): Y-coordinate of the cubic Bézier curve point.
x_end (int): X-coordinate of arrow end point.
RETURNS (unicode): Definition of the arc path ('d' attribute).
RETURNS (str): Definition of the arc path ('d' attribute).
"""
template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
if self.compact:
@ -189,11 +189,11 @@ class DependencyRenderer(object):
def get_arrowhead(self, direction, x, y, end):
"""Render individual arrow head.
direction (unicode): Arrow direction, 'left' or 'right'.
direction (str): Arrow direction, 'left' or 'right'.
x (int): X-coordinate of arrow start point.
y (int): Y-coordinate of arrow start and end point.
end (int): X-coordinate of arrow end point.
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
RETURNS (str): Definition of the arrow head path ('d' attribute).
"""
if direction == "left":
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
@ -279,7 +279,7 @@ class EntityRenderer(object):
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (unicode): Rendered HTML markup.
RETURNS (str): Rendered HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
@ -300,7 +300,7 @@ class EntityRenderer(object):
def render_ents(self, text, spans, title):
"""Render entities in text.
text (unicode): Original text.
text (str): Original text.
spans (list): Individual entity spans and their start, end and label.
title (unicode or None): Document title set in Doc.user_data['title'].
"""

View File

@ -598,7 +598,7 @@ class MatchPatternError(ValueError):
def __init__(self, key, errors):
"""Custom error for validating match patterns.
key (unicode): The name of the matcher rule.
key (str): The name of the matcher rule.
errors (dict): Validation errors (sequence of strings) mapped to pattern
ID, i.e. the index of the added pattern.
"""

View File

@ -1,8 +1,8 @@
def explain(term):
"""Get a description for a given POS tag, dependency label or entity type.
term (unicode): The term to explain.
RETURNS (unicode): The explanation, or `None` if not found in the glossary.
term (str): The term to explain.
RETURNS (str): The explanation, or `None` if not found in the glossary.
EXAMPLE:
>>> spacy.explain(u'NORP')

View File

@ -38,7 +38,7 @@ cdef class Candidate:
@property
def entity_(self):
"""RETURNS (unicode): ID/name of this entity in the KB"""
"""RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash]
@property
@ -48,7 +48,7 @@ cdef class Candidate:
@property
def alias_(self):
"""RETURNS (unicode): ID of the original alias"""
"""RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property

View File

@ -122,7 +122,7 @@ class Language(object):
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
lang (unicode): Two-letter language ID, i.e. ISO code.
lang (str): Two-letter language ID, i.e. ISO code.
DOCS: https://spacy.io/api/language
"""
@ -287,7 +287,7 @@ class Language(object):
def get_pipe(self, name):
"""Get a pipeline component for a given component name.
name (unicode): Name of pipeline component to get.
name (str): Name of pipeline component to get.
RETURNS (callable): The pipeline component.
DOCS: https://spacy.io/api/language#get_pipe
@ -300,7 +300,7 @@ class Language(object):
def create_pipe(self, name, config=dict()):
"""Create a pipeline component from a factory.
name (unicode): Factory name to look up in `Language.factories`.
name (str): Factory name to look up in `Language.factories`.
config (dict): Configuration parameters to initialise component.
RETURNS (callable): Pipeline component.
@ -343,12 +343,12 @@ class Language(object):
of before/after/first/last can be set. Default behaviour is "last".
component (callable): The pipeline component.
name (unicode): Name of pipeline component. Overwrites existing
name (str): Name of pipeline component. Overwrites existing
component.name attribute if available. If no name is set and
the component exposes no name attribute, component.__name__ is
used. An error is raised if a name already exists in the pipeline.
before (unicode): Component name to insert component directly before.
after (unicode): Component name to insert component directly after.
before (str): Component name to insert component directly before.
after (str): Component name to insert component directly after.
first (bool): Insert component first / not first in the pipeline.
last (bool): Insert component last / not last in the pipeline.
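A sketch of the v2-style `add_pipe` call this docstring describes, using a hypothetical component function and assuming a loaded `nlp` object whose pipeline contains a `parser`:

```python
def print_length(doc):
    # hypothetical component: must accept and return the Doc
    print("tokens:", len(doc))
    return doc

nlp.add_pipe(print_length, name="print_length", before="parser")
assert nlp.has_pipe("print_length")
```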
@ -389,7 +389,7 @@ class Language(object):
"""Check if a component name is present in the pipeline. Equivalent to
`name in nlp.pipe_names`.
name (unicode): Name of the component.
name (str): Name of the component.
RETURNS (bool): Whether a component of the name exists in the pipeline.
DOCS: https://spacy.io/api/language#has_pipe
@ -399,7 +399,7 @@ class Language(object):
def replace_pipe(self, name, component):
"""Replace a component in the pipeline.
name (unicode): Name of the component to replace.
name (str): Name of the component to replace.
component (callable): Pipeline component.
DOCS: https://spacy.io/api/language#replace_pipe
@ -418,8 +418,8 @@ class Language(object):
def rename_pipe(self, old_name, new_name):
"""Rename a pipeline component.
old_name (unicode): Name of the component to rename.
new_name (unicode): New name of the component.
old_name (str): Name of the component to rename.
new_name (str): New name of the component.
DOCS: https://spacy.io/api/language#rename_pipe
"""
@ -433,7 +433,7 @@ class Language(object):
def remove_pipe(self, name):
"""Remove a component from the pipeline.
name (unicode): Name of the component to remove.
name (str): Name of the component to remove.
RETURNS (tuple): A `(name, component)` tuple of the removed component.
DOCS: https://spacy.io/api/language#remove_pipe
@ -450,7 +450,7 @@ class Language(object):
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
text (unicode): The text to be processed.
text (str): The text to be processed.
disable (list): Names of the pipeline components to disable.
component_cfg (dict): An optional dictionary with extra keyword arguments
for specific components.
@ -1086,7 +1086,7 @@ class component(object):
):
"""Decorate a pipeline component.
name (unicode): Default component and factory name.
name (str): Default component and factory name.
assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
requires (list): Attributes required by component, e.g. `["token.dep"]`.
retokenizes (bool): Whether the component changes the tokenization.

View File

@ -29,8 +29,8 @@ class Lemmatizer(object):
def __call__(self, string, univ_pos, morphology=None):
"""Lemmatize a string.
string (unicode): The string to lemmatize, e.g. the token text.
univ_pos (unicode / int): The token's universal part-of-speech tag.
string (str): The string to lemmatize, e.g. the token text.
univ_pos (str / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
RETURNS (list): The available lemmas for the string.
@ -69,7 +69,7 @@ class Lemmatizer(object):
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
univ_pos (unicode / int): The token's universal part-of-speech tag.
univ_pos (str / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""
@ -128,10 +128,10 @@ class Lemmatizer(object):
"""Look up a lemma in the table, if available. If no lemma is found,
the original string is returned.
string (unicode): The original string.
string (str): The original string.
orth (int): Optional hash of the string to look up. If not set, the
string will be used and hashed.
RETURNS (unicode): The lemma if the string was found, otherwise the
RETURNS (str): The lemma if the string was found, otherwise the
original string.
"""
lookup_table = self.lookups.get_table("lemma_lookup", {})
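To illustrate the lookup path, a small sketch with a made-up `lemma_lookup` table:

```python
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_lookup", {"going": "go"})
lemmatizer = Lemmatizer(lookups)
assert lemmatizer.lookup("going") == "go"
assert lemmatizer.lookup("gong") == "gong"  # not in table: original string returned
```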

View File

@ -190,7 +190,7 @@ cdef class Lexeme:
self.vocab.set_vector(self.c.orth, vector)
property rank:
"""RETURNS (unicode): Sequential ID of the lexemes's lexical type, used
"""RETURNS (str): Sequential ID of the lexemes's lexical type, used
to index into tables, e.g. for word vectors."""
def __get__(self):
return self.c.id
@ -209,18 +209,18 @@ cdef class Lexeme:
@property
def orth_(self):
"""RETURNS (unicode): The original verbatim text of the lexeme
"""RETURNS (str): The original verbatim text of the lexeme
(identical to `Lexeme.text`). Exists mostly for consistency with
the other attributes."""
return self.vocab.strings[self.c.orth]
@property
def text(self):
"""RETURNS (unicode): The original verbatim text of the lexeme."""
"""RETURNS (str): The original verbatim text of the lexeme."""
return self.orth_
property lower:
"""RETURNS (unicode): Lowercase form of the lexeme."""
"""RETURNS (str): Lowercase form of the lexeme."""
def __get__(self):
return self.c.lower
@ -293,7 +293,7 @@ cdef class Lexeme:
self.c.prob = x
property lower_:
"""RETURNS (unicode): Lowercase form of the word."""
"""RETURNS (str): Lowercase form of the word."""
def __get__(self):
return self.vocab.strings[self.c.lower]
@ -301,7 +301,7 @@ cdef class Lexeme:
self.c.lower = self.vocab.strings.add(x)
property norm_:
"""RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the
"""RETURNS (str): The lexemes's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
@ -311,7 +311,7 @@ cdef class Lexeme:
self.c.norm = self.vocab.strings.add(x)
property shape_:
"""RETURNS (unicode): Transform of the word's string, to show
"""RETURNS (str): Transform of the word's string, to show
orthographic features.
"""
def __get__(self):
@ -321,7 +321,7 @@ cdef class Lexeme:
self.c.shape = self.vocab.strings.add(x)
property prefix_:
"""RETURNS (unicode): Length-N substring from the start of the word.
"""RETURNS (str): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
def __get__(self):
@ -331,7 +331,7 @@ cdef class Lexeme:
self.c.prefix = self.vocab.strings.add(x)
property suffix_:
"""RETURNS (unicode): Length-N substring from the end of the word.
"""RETURNS (str): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
def __get__(self):
@ -341,7 +341,7 @@ cdef class Lexeme:
self.c.suffix = self.vocab.strings.add(x)
property lang_:
"""RETURNS (unicode): Language of the parent vocabulary."""
"""RETURNS (str): Language of the parent vocabulary."""
def __get__(self):
return self.vocab.strings[self.c.lang]
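A quick sketch of these string-valued lexeme attributes (assumes a loaded `nlp` object):

```python
apple = nlp.vocab["apple"]          # Lexeme
assert apple.text == apple.orth_ == "apple"
assert apple.lower_ == "apple"
assert apple.shape_ == "xxxx"       # same-character runs truncated after length 4
assert apple.prefix_ == "a"         # N=1 by default
assert apple.suffix_ == "ple"       # N=3 by default
```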

View File

@ -31,7 +31,7 @@ class Lookups(object):
"""Check if the lookups contain a table of a given name. Delegates to
Lookups.has_table.
name (unicode): Name of the table.
name (str): Name of the table.
RETURNS (bool): Whether a table of that name is in the lookups.
"""
return self.has_table(name)
@ -48,7 +48,7 @@ class Lookups(object):
def add_table(self, name, data=SimpleFrozenDict()):
"""Add a new table to the lookups. Raises an error if the table exists.
name (unicode): Unique name of table.
name (str): Unique name of table.
data (dict): Optional data to add to the table.
RETURNS (Table): The newly added table.
@ -64,7 +64,7 @@ class Lookups(object):
"""Get a table. Raises an error if the table doesn't exist and no
default value is provided.
name (unicode): Name of the table.
name (str): Name of the table.
default: Optional default value to return if table doesn't exist.
RETURNS (Table): The table.
@ -79,7 +79,7 @@ class Lookups(object):
def remove_table(self, name):
"""Remove a table. Raises an error if the table doesn't exist.
name (unicode): Name of the table to remove.
name (str): Name of the table to remove.
RETURNS (Table): The removed table.
DOCS: https://spacy.io/api/lookups#remove_table
@ -91,7 +91,7 @@ class Lookups(object):
def has_table(self, name):
"""Check if the lookups contain a table of a given name.
name (unicode): Name of the table.
name (str): Name of the table.
RETURNS (bool): Whether a table of that name exists.
DOCS: https://spacy.io/api/lookups#has_table
@ -125,7 +125,7 @@ class Lookups(object):
"""Save the lookups to a directory as lookups.bin. Expects a path to a
directory, which will be created if it doesn't exist.
path (unicode / Path): The file path.
path (str / Path): The file path.
DOCS: https://spacy.io/api/lookups#to_disk
"""
@ -141,7 +141,7 @@ class Lookups(object):
"""Load lookups from a directory containing a lookups.bin. Will skip
loading if the file doesn't exist.
path (unicode / Path): The directory path.
path (str / Path): The directory path.
RETURNS (Lookups): The loaded lookups.
DOCS: https://spacy.io/api/lookups#from_disk
@ -167,7 +167,7 @@ class Table(OrderedDict):
"""Initialize a new table from a dict.
data (dict): The dictionary.
name (unicode): Optional table name for reference.
name (str): Optional table name for reference.
RETURNS (Table): The newly created object.
DOCS: https://spacy.io/api/lookups#table.from_dict
@ -179,7 +179,7 @@ class Table(OrderedDict):
def __init__(self, name=None, data=None):
"""Initialize a new table.
name (unicode): Optional table name for reference.
name (str): Optional table name for reference.
data (dict): Initial data, used to hint Bloom Filter.
RETURNS (Table): The newly created object.
@ -197,7 +197,7 @@ class Table(OrderedDict):
def __setitem__(self, key, value):
"""Set new key/value pair. String keys will be hashed.
key (unicode / int): The key to set.
key (str / int): The key to set.
value: The value to set.
"""
key = get_string_id(key)
@ -208,7 +208,7 @@ class Table(OrderedDict):
"""Set new key/value pair. String keys will be hashed.
Same as table[key] = value.
key (unicode / int): The key to set.
key (str / int): The key to set.
value: The value to set.
"""
self[key] = value
@ -216,7 +216,7 @@ class Table(OrderedDict):
def __getitem__(self, key):
"""Get the value for a given key. String keys will be hashed.
key (unicode / int): The key to get.
key (str / int): The key to get.
RETURNS: The value.
"""
key = get_string_id(key)
@ -225,7 +225,7 @@ class Table(OrderedDict):
def get(self, key, default=None):
"""Get the value for a given key. String keys will be hashed.
key (unicode / int): The key to get.
key (str / int): The key to get.
default: The default value to return.
RETURNS: The value.
"""
@ -235,7 +235,7 @@ class Table(OrderedDict):
def __contains__(self, key):
"""Check whether a key is in the table. String keys will be hashed.
key (unicode / int): The key to check.
key (str / int): The key to check.
RETURNS (bool): Whether the key is in the table.
"""
key = get_string_id(key)
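Putting the `Lookups`/`Table` API together, a self-contained sketch (table name and data are made up):

```python
from spacy.lookups import Lookups

lookups = Lookups()
table = lookups.add_table("my_table", {"foo": "bar"})
table["hello"] = "world"            # string keys are hashed via get_string_id
assert table["hello"] == "world"
assert "my_table" in lookups        # delegates to Lookups.has_table
removed = lookups.remove_table("my_table")
```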

View File

@ -66,7 +66,7 @@ cdef class DependencyMatcher:
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
key (unicode): The match ID.
key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
return self._normalize_key(key) in self._patterns

View File

@ -63,7 +63,7 @@ cdef class Matcher:
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
key (unicode): The match ID.
key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
return self._normalize_key(key) in self._patterns
@ -97,7 +97,7 @@ cdef class Matcher:
number of arguments). The on_match callback becomes an optional keyword
argument.
key (unicode): The match ID.
key (str): The match ID.
patterns (list): The patterns to add for the given key.
on_match (callable): Optional callback executed on match.
*_patterns (list): For backwards compatibility: list of patterns to add
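A sketch of the new calling convention described here, with patterns passed as a list (assumes a loaded `nlp` object):

```python
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])        # key first, then a list of patterns
assert "HelloWorld" in matcher
matches = matcher(nlp("Hello, world"))      # [(match_id, start, end), ...]
```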
@ -138,7 +138,7 @@ cdef class Matcher:
"""Remove a rule from the matcher. A KeyError is raised if the key does
not exist.
key (unicode): The ID of the match rule.
key (str): The ID of the match rule.
"""
norm_key = self._normalize_key(key)
if not norm_key in self._patterns:

View File

@ -70,7 +70,7 @@ cdef class PhraseMatcher:
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
key (unicode): The match ID.
key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
DOCS: https://spacy.io/api/phrasematcher#contains
@ -85,7 +85,7 @@ cdef class PhraseMatcher:
"""Remove a rule from the matcher by match ID. A KeyError is raised if
the key does not exist.
key (unicode): The match ID.
key (str): The match ID.
DOCS: https://spacy.io/api/phrasematcher#remove
"""
@ -159,7 +159,7 @@ cdef class PhraseMatcher:
number of arguments). The on_match callback becomes an optional keyword
argument.
key (unicode): The match ID.
key (str): The match ID.
docs (list): List of `Doc` objects representing match patterns.
on_match (callable): Callback executed on match.
*_docs (Doc): For backwards compatibility: list of patterns to add
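And the equivalent sketch for `PhraseMatcher`, where the patterns are `Doc` objects (again assuming a loaded `nlp` object):

```python
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(name) for name in ["Barack Obama", "Angela Merkel"]]
matcher.add("PERSON", patterns)
matches = matcher(nlp("Angela Merkel met Barack Obama"))
matcher.remove("PERSON")                    # KeyError if the key doesn't exist
```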

View File

@ -198,8 +198,8 @@ cdef class Morphology:
"""Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties.
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
tag (str): The part-of-speech tag to key the exception.
orth (str): The word-form to key the exception.
"""
attrs = dict(attrs)
attrs = _normalize_props(attrs)

View File

@ -11,7 +11,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
fulfilled (e.g. if previous components assign the attributes).
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
name (unicode): The name of the pipeline component to analyze.
name (str): The name of the pipeline component to analyze.
pipe (callable): The pipeline component function to analyze.
index (int): The index of the component in the pipeline.
warn (bool): Show user warning if problem is found.
@ -125,7 +125,7 @@ def get_assigns_for_attr(pipeline, attr):
"""Get all pipeline components that assign an attr, e.g. "doc.tensor".
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
attr (unicode): The attribute to check.
attr (str): The attribute to check.
RETURNS (list): (name, pipeline) tuples of components that assign the attr.
"""
return _get_feature_for_attr(pipeline, attr, "assigns")
@ -135,7 +135,7 @@ def get_requires_for_attr(pipeline, attr):
"""Get all pipeline components that require an attr, e.g. "doc.tensor".
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
attr (unicode): The attribute to check.
attr (str): The attribute to check.
RETURNS (list): (name, pipeline) tuples of components that require the attr.
"""
return _get_feature_for_attr(pipeline, attr, "requires")

View File

@ -315,7 +315,7 @@ class EntityRuler(object):
"""Load the entity ruler from a file. Expects a file containing
newline-delimited JSON (JSONL) with one entry per line.
path (unicode / Path): The JSONL file to load.
path (str / Path): The JSONL file to load.
**kwargs: Other config parameters, mostly for consistency.
RETURNS (EntityRuler): The loaded entity ruler.
@ -351,7 +351,7 @@ class EntityRuler(object):
"""Save the entity ruler patterns to a directory. The patterns will be
saved as newline-delimited JSON (JSONL).
path (unicode / Path): The JSONL file to save.
path (str / Path): The JSONL file to save.
**kwargs: Other config parameters, mostly for consistency.
DOCS: https://spacy.io/api/entityruler#to_disk
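A round-trip sketch for the two methods above (the file name is illustrative; assumes a loaded `nlp` object):

```python
from spacy.pipeline import EntityRuler

ruler = EntityRuler(nlp)
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
ruler.to_disk("./org_patterns.jsonl")       # one JSON pattern per line
ruler = EntityRuler(nlp).from_disk("./org_patterns.jsonl")
```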

View File

@ -50,7 +50,7 @@ def merge_subtokens(doc, label="subtok"):
"""Merge subtokens into a single token.
doc (Doc): The Doc object.
label (unicode): The subtoken dependency label.
label (str): The subtoken dependency label.
RETURNS (Doc): The Doc object with merged subtokens.
DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
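For context, `merge_subtokens` can be called directly or added to the pipeline; a minimal sketch (assumes a loaded `nlp` object with a parser):

```python
from spacy.pipeline.functions import merge_subtokens

doc = nlp("Some parsed text")
doc = merge_subtokens(doc, label="subtok")  # no-op if no 'subtok' arcs exist
```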

View File

@ -152,7 +152,7 @@ cdef class StringStore:
def add(self, string):
"""Add a string to the StringStore.
string (unicode): The string to add.
string (str): The string to add.
RETURNS (uint64): The string's hash value.
"""
if isinstance(string, unicode):
@ -179,7 +179,7 @@ cdef class StringStore:
def __contains__(self, string not None):
"""Check whether a string is in the store.
string (unicode): The string to check.
string (str): The string to check.
RETURNS (bool): Whether the store contains the string.
"""
cdef hash_t key
@ -205,7 +205,7 @@ cdef class StringStore:
def __iter__(self):
"""Iterate over the strings in the store, in order.
YIELDS (unicode): A string in the store.
YIELDS (str): A string in the store.
"""
cdef int i
cdef hash_t key
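A small sketch of the `StringStore` behaviour documented in these hunks:

```python
from spacy.strings import StringStore

stringstore = StringStore(["apple", "orange"])
apple_hash = stringstore.add("apple")       # RETURNS (uint64)
assert "apple" in stringstore
assert stringstore[apple_hash] == "apple"
assert list(stringstore)[0] == "apple"      # __iter__ yields str, in order
```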

View File

@ -134,7 +134,7 @@ cdef class Tokenizer:
def __call__(self, unicode string):
"""Tokenize a string.
string (unicode): The string to tokenize.
string (str): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations.
DOCS: https://spacy.io/api/tokenizer#call
@ -147,7 +147,7 @@ cdef class Tokenizer:
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
"""Tokenize according to affix and token_match settings.
string (unicode): The string to tokenize.
string (str): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations.
"""
if len(string) >= (2 ** 30):
@ -527,7 +527,7 @@ cdef class Tokenizer:
def find_infix(self, unicode string):
"""Find internal split points of the string, such as hyphens.
string (unicode): The string to segment.
string (str): The string to segment.
RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
and `.end()` methods, denoting the placement of internal segment
separators, e.g. hyphens.
@ -542,7 +542,7 @@ cdef class Tokenizer:
"""Find the length of a prefix that should be segmented from the
string, or None if no prefix rules match.
string (unicode): The string to segment.
string (str): The string to segment.
RETURNS (int): The length of the prefix if present, otherwise `None`.
DOCS: https://spacy.io/api/tokenizer#find_prefix
@ -556,7 +556,7 @@ cdef class Tokenizer:
"""Find the length of a suffix that should be segmented from the
string, or None if no suffix rules match.
string (unicode): The string to segment.
string (str): The string to segment.
RETURNS (int): The length of the suffix if present, otherwise `None`.
DOCS: https://spacy.io/api/tokenizer#find_suffix
@ -576,7 +576,7 @@ cdef class Tokenizer:
def _validate_special_case(self, chunk, substrings):
"""Check whether the `ORTH` fields match the string.
string (unicode): The string to specially tokenize.
string (str): The string to specially tokenize.
substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes.
"""
@ -588,7 +588,7 @@ cdef class Tokenizer:
def add_special_case(self, unicode string, substrings):
"""Add a special-case tokenization rule.
string (unicode): The string to specially tokenize.
string (str): The string to specially tokenize.
substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes. The `ORTH` fields of the attributes
must exactly match the string when they are concatenated.
@ -629,7 +629,7 @@ cdef class Tokenizer:
produced are identical to `nlp.tokenizer()` except for whitespace
tokens.
string (unicode): The string to tokenize.
string (str): The string to tokenize.
RETURNS (list): A list of (pattern_string, token_string) tuples
DOCS: https://spacy.io/api/tokenizer#explain
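A sketch of `add_special_case` and `explain` together (assumes a loaded `nlp` object):

```python
from spacy.attrs import ORTH

nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
assert [t.text for t in nlp("gimme that")] == ["gim", "me", "that"]
print(nlp.tokenizer.explain("gimme that"))  # [(pattern_string, token_string), ...]
```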

View File

@ -107,7 +107,7 @@ cdef class Doc:
def set_extension(cls, name, **kwargs):
"""Define a custom attribute which becomes available as `Doc._`.
name (unicode): Name of the attribute to set.
name (str): Name of the attribute to set.
default: Optional default value of the attribute.
getter (callable): Optional getter function.
setter (callable): Optional setter function.
@ -125,7 +125,7 @@ cdef class Doc:
def get_extension(cls, name):
"""Look up a previously registered extension by name.
name (unicode): Name of the extension.
name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
DOCS: https://spacy.io/api/doc#get_extension
@ -136,7 +136,7 @@ cdef class Doc:
def has_extension(cls, name):
"""Check whether an extension has been registered.
name (unicode): Name of the extension.
name (str): Name of the extension.
RETURNS (bool): Whether the extension has been registered.
DOCS: https://spacy.io/api/doc#has_extension
@ -147,7 +147,7 @@ cdef class Doc:
def remove_extension(cls, name):
"""Remove a previously registered extension.
name (unicode): Name of the extension.
name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
removed extension.
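The four classmethods above combine as in this sketch (the extension name is made up; assumes a loaded `nlp` object):

```python
from spacy.tokens import Doc

Doc.set_extension("is_greeting", default=False)  # exposed as doc._.is_greeting
assert Doc.has_extension("is_greeting")
doc = nlp("hello world")
doc._.is_greeting = True
default, method, getter, setter = Doc.remove_extension("is_greeting")
```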
@ -473,7 +473,7 @@ cdef class Doc:
def text(self):
"""A unicode representation of the document text.
RETURNS (unicode): The original verbatim text of the document.
RETURNS (str): The original verbatim text of the document.
"""
return "".join(t.text_with_ws for t in self)
@ -482,7 +482,7 @@ cdef class Doc:
"""An alias of `Doc.text`, provided for duck-type compatibility with
`Span` and `Token`.
RETURNS (unicode): The original verbatim text of the document.
RETURNS (str): The original verbatim text of the document.
"""
return self.text
@ -628,7 +628,7 @@ cdef class Doc:
@property
def lang_(self):
"""RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'."""
"""RETURNS (str): Language of the doc's vocabulary, e.g. 'en'."""
return self.vocab.lang
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:

View File

@ -33,7 +33,7 @@ cdef class Span:
def set_extension(cls, name, **kwargs):
"""Define a custom attribute which becomes available as `Span._`.
name (unicode): Name of the attribute to set.
name (str): Name of the attribute to set.
default: Optional default value of the attribute.
getter (callable): Optional getter function.
setter (callable): Optional setter function.
@ -51,7 +51,7 @@ cdef class Span:
def get_extension(cls, name):
"""Look up a previously registered extension by name.
name (unicode): Name of the extension.
name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
DOCS: https://spacy.io/api/span#get_extension
@ -62,7 +62,7 @@ cdef class Span:
def has_extension(cls, name):
"""Check whether an extension has been registered.
name (unicode): Name of the extension.
name (str): Name of the extension.
RETURNS (bool): Whether the extension has been registered.
DOCS: https://spacy.io/api/span#has_extension
@ -73,7 +73,7 @@ cdef class Span:
def remove_extension(cls, name):
"""Remove a previously registered extension.
name (unicode): Name of the extension.
name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
removed extension.
@ -501,7 +501,7 @@ cdef class Span:
@property
def text(self):
"""RETURNS (unicode): The original verbatim text of the span."""
"""RETURNS (str): The original verbatim text of the span."""
text = self.text_with_ws
if self[-1].whitespace_:
text = text[:-1]
@ -512,7 +512,7 @@ cdef class Span:
"""The text content of the span with a trailing whitespace character if
the last token has one.
RETURNS (unicode): The text content of the span (with trailing
RETURNS (str): The text content of the span (with trailing
whitespace).
"""
return "".join([t.text_with_ws for t in self])
@ -688,7 +688,7 @@ cdef class Span:
raise NotImplementedError(TempErrors.T007.format(attr="ent_id"))
property ent_id_:
"""RETURNS (unicode): The (string) entity ID."""
"""RETURNS (str): The (string) entity ID."""
def __get__(self):
return self.root.ent_id_
@ -700,12 +700,12 @@ cdef class Span:
"""Verbatim text content (identical to `Span.text`). Exists mostly for
consistency with other attributes.
RETURNS (unicode): The span's text."""
RETURNS (str): The span's text."""
return self.text
@property
def lemma_(self):
"""RETURNS (unicode): The span's lemma."""
"""RETURNS (str): The span's lemma."""
return " ".join([t.lemma_ for t in self]).strip()
@property
@ -724,7 +724,7 @@ cdef class Span:
return "".join([t.text_with_ws for t in self])
property label_:
"""RETURNS (unicode): The span's label."""
"""RETURNS (str): The span's label."""
def __get__(self):
return self.doc.vocab.strings[self.label]
@ -734,7 +734,7 @@ cdef class Span:
raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))
property kb_id_:
"""RETURNS (unicode): The named entity's KB ID."""
"""RETURNS (str): The named entity's KB ID."""
def __get__(self):
return self.doc.vocab.strings[self.kb_id]
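A short sketch of the string-valued `Span` attributes; the entity output depends on the assumed English model:

```python
doc = nlp("San Francisco considers banning sidewalk delivery robots")
span = doc[0:2]
assert span.text == "San Francisco"
if doc.ents:
    ent = doc.ents[0]                       # entity spans carry a label
    print(ent.label_, ent.kb_id_)           # e.g. "GPE", "" if no KB ID set
```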

View File

@ -36,7 +36,7 @@ cdef class Token:
def set_extension(cls, name, **kwargs):
"""Define a custom attribute which becomes available as `Token._`.
name (unicode): Name of the attribute to set.
name (str): Name of the attribute to set.
default: Optional default value of the attribute.
getter (callable): Optional getter function.
setter (callable): Optional setter function.
@ -54,7 +54,7 @@ cdef class Token:
def get_extension(cls, name):
"""Look up a previously registered extension by name.
name (unicode): Name of the extension.
name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
DOCS: https://spacy.io/api/token#get_extension
@ -65,7 +65,7 @@ cdef class Token:
def has_extension(cls, name):
"""Check whether an extension has been registered.
name (unicode): Name of the extension.
name (str): Name of the extension.
RETURNS (bool): Whether the extension has been registered.
DOCS: https://spacy.io/api/token#has_extension
@ -76,7 +76,7 @@ cdef class Token:
def remove_extension(cls, name):
"""Remove a previously registered extension.
name (unicode): Name of the extension.
name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
removed extension.
@ -244,12 +244,12 @@ cdef class Token:
@property
def text(self):
"""RETURNS (unicode): The original verbatim text of the token."""
"""RETURNS (str): The original verbatim text of the token."""
return self.orth_
@property
def text_with_ws(self):
"""RETURNS (unicode): The text content of the span (with trailing
"""RETURNS (str): The text content of the span (with trailing
whitespace).
"""
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@ -740,7 +740,7 @@ cdef class Token:
self.c.ent_type = ent_type
property ent_type_:
"""RETURNS (unicode): Named entity type."""
"""RETURNS (str): Named entity type."""
def __get__(self):
return self.vocab.strings[self.c.ent_type]
@ -763,7 +763,7 @@ cdef class Token:
and "" means no entity tag is set. "B" with an empty ent_type
means that the token is blocked from further processing by NER.
RETURNS (unicode): IOB code of named entity tag.
RETURNS (str): IOB code of named entity tag.
"""
iob_strings = ("", "I", "O", "B")
return iob_strings[self.c.ent_iob]
@ -779,7 +779,7 @@ cdef class Token:
self.c.ent_id = key
property ent_id_:
"""RETURNS (unicode): ID of the entity the token is an instance of,
"""RETURNS (str): ID of the entity the token is an instance of,
if any.
"""
def __get__(self):
@ -797,7 +797,7 @@ cdef class Token:
self.c.ent_kb_id = ent_kb_id
property ent_kb_id_:
"""RETURNS (unicode): Named entity KB ID."""
"""RETURNS (str): Named entity KB ID."""
def __get__(self):
return self.vocab.strings[self.c.ent_kb_id]
@ -806,12 +806,12 @@ cdef class Token:
@property
def whitespace_(self):
"""RETURNS (unicode): The trailing whitespace character, if present."""
"""RETURNS (str): The trailing whitespace character, if present."""
return " " if self.c.spacy else ""
@property
def orth_(self):
"""RETURNS (unicode): Verbatim text content (identical to
"""RETURNS (str): Verbatim text content (identical to
`Token.text`). Exists mostly for consistency with the other
attributes.
"""
@ -819,13 +819,13 @@ cdef class Token:
@property
def lower_(self):
"""RETURNS (unicode): The lowercase token text. Equivalent to
"""RETURNS (str): The lowercase token text. Equivalent to
`Token.text.lower()`.
"""
return self.vocab.strings[self.c.lex.lower]
property norm_:
"""RETURNS (unicode): The token's norm, i.e. a normalised form of the
"""RETURNS (str): The token's norm, i.e. a normalised form of the
token text. Usually set in the language's tokenizer exceptions or
norm exceptions.
"""
@ -837,34 +837,34 @@ cdef class Token:
@property
def shape_(self):
"""RETURNS (unicode): Transform of the tokens's string, to show
"""RETURNS (str): Transform of the tokens's string, to show
orthographic features. For example, "Xxxx" or "dd".
"""
return self.vocab.strings[self.c.lex.shape]
@property
def prefix_(self):
"""RETURNS (unicode): A length-N substring from the start of the token.
"""RETURNS (str): A length-N substring from the start of the token.
Defaults to `N=1`.
"""
return self.vocab.strings[self.c.lex.prefix]
@property
def suffix_(self):
"""RETURNS (unicode): A length-N substring from the end of the token.
"""RETURNS (str): A length-N substring from the end of the token.
Defaults to `N=3`.
"""
return self.vocab.strings[self.c.lex.suffix]
@property
def lang_(self):
"""RETURNS (unicode): Language of the parent document's vocabulary,
"""RETURNS (str): Language of the parent document's vocabulary,
e.g. 'en'.
"""
return self.vocab.strings[self.c.lex.lang]
property lemma_:
"""RETURNS (unicode): The token lemma, i.e. the base form of the word,
"""RETURNS (str): The token lemma, i.e. the base form of the word,
with no inflectional suffixes.
"""
def __get__(self):
@ -877,7 +877,7 @@ cdef class Token:
self.c.lemma = self.vocab.strings.add(lemma_)
property pos_:
"""RETURNS (unicode): Coarse-grained part-of-speech tag."""
"""RETURNS (str): Coarse-grained part-of-speech tag."""
def __get__(self):
return parts_of_speech.NAMES[self.c.pos]
@ -885,7 +885,7 @@ cdef class Token:
self.c.pos = parts_of_speech.IDS[pos_name]
property tag_:
"""RETURNS (unicode): Fine-grained part-of-speech tag."""
"""RETURNS (str): Fine-grained part-of-speech tag."""
def __get__(self):
return self.vocab.strings[self.c.tag]
@ -893,7 +893,7 @@ cdef class Token:
self.tag = self.vocab.strings.add(tag)
property dep_:
"""RETURNS (unicode): The syntactic dependency label."""
"""RETURNS (str): The syntactic dependency label."""
def __get__(self):
return self.vocab.strings[self.c.dep]
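Likewise for `Token`, a sketch of the underscored string attributes (tag values depend on the assumed English model):

```python
doc = nlp("Apple is looking at buying a U.K. startup")
token = doc[0]
print(token.text, token.lower_, token.shape_)   # Apple apple Xxxxx
print(token.pos_, token.tag_, token.dep_)       # e.g. PROPN NNP nsubj
print(token.ent_type_, token.ent_iob_)          # e.g. ORG B
```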

View File

@ -58,7 +58,7 @@ def lang_class_is_loaded(lang):
loaded lazily, to avoid expensive setup code associated with the language
data.
lang (unicode): Two-letter language code, e.g. 'en'.
lang (str): Two-letter language code, e.g. 'en'.
RETURNS (bool): Whether a Language class has been loaded.
"""
return lang in registry.languages
@ -67,7 +67,7 @@ def lang_class_is_loaded(lang):
def get_lang_class(lang):
"""Import and load a Language class.
lang (unicode): Two-letter language code, e.g. 'en'.
lang (str): Two-letter language code, e.g. 'en'.
RETURNS (Language): Language class.
"""
# Check if language is registered / entry point is available
@ -85,7 +85,7 @@ def get_lang_class(lang):
def set_lang_class(name, cls):
"""Set a custom Language class name that can be loaded via get_lang_class.
name (unicode): Name of Language class.
name (str): Name of Language class.
cls (Language): Language class.
"""
registry.languages.register(name, func=cls)
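A sketch of the lazy-loading helpers above; "fi" is just an example language code:

```python
from spacy.util import get_lang_class, lang_class_is_loaded

Finnish = get_lang_class("fi")      # imports the Language subclass lazily
nlp_fi = Finnish()
assert lang_class_is_loaded("fi")
```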
@ -107,7 +107,7 @@ def load_language_data(path):
"""Load JSON language data using the given path as a base. If the provided
path isn't present, will attempt to load a gzipped version before giving up.
path (unicode / Path): The data to load.
path (str / Path): The data to load.
RETURNS: The loaded data.
"""
path = ensure_path(path)
@ -128,7 +128,7 @@ def get_module_path(module):
def load_model(name, **overrides):
"""Load a model from a package or data path.
name (unicode): Package name or model path.
name (str): Package name or model path.
**overrides: Specific overrides, like pipeline components to disable.
RETURNS (Language): `Language` class with the loaded model.
"""
@ -202,7 +202,7 @@ def load_model_from_init_py(init_file, **overrides):
"""Helper function to use in the `load()` method of a model package's
__init__.py.
init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
init_file (str): Path to model's __init__.py, i.e. `__file__`.
**overrides: Specific overrides, like pipeline components to disable.
RETURNS (Language): `Language` class with loaded model.
"""
@ -227,8 +227,8 @@ def get_package_version(name):
"""Get the version of an installed package. Typically used to get model
package versions.
name (unicode): The name of the installed Python package.
RETURNS (unicode / None): The version or None if package not installed.
name (str): The name of the installed Python package.
RETURNS (str / None): The version or None if package not installed.
"""
try:
return importlib_metadata.version(name)
@ -338,7 +338,7 @@ def get_model_config(path):
def is_package(name):
"""Check if string maps to a package installed via pip.
name (unicode): Name of package.
name (str): Name of package.
RETURNS (bool): True if installed package, False if not.
"""
try:
@ -351,7 +351,7 @@ def is_package(name):
def get_package_path(name):
"""Get the path to an installed package.
name (unicode): Package name.
name (str): Package name.
RETURNS (Path): Path to installed package.
"""
name = name.lower() # use lowercase version to be safe
@ -526,8 +526,8 @@ def expand_exc(excs, search, replace):
For example, to add additional versions with typographic apostrophes.
excs (dict): Tokenizer exceptions.
search (unicode): String to find and replace.
replace (unicode): Replacement.
search (str): String to find and replace.
replace (str): Replacement.
RETURNS (dict): Combined tokenizer exceptions.
"""
@ -761,8 +761,8 @@ def from_disk(path, readers, exclude):
def import_file(name, loc):
"""Import module from a file. Used to load models from a directory.
name (unicode): Name of module to load.
loc (unicode / Path): Path to the file.
name (str): Name of module to load.
loc (str / Path): Path to the file.
RETURNS: The loaded module.
"""
loc = str(loc)
@ -777,8 +777,8 @@ def minify_html(html):
Disclaimer: NOT a general-purpose solution, only removes indentation and
newlines.
html (unicode): Markup to minify.
RETURNS (unicode): "Minified" HTML.
html (str): Markup to minify.
RETURNS (str): "Minified" HTML.
"""
return html.strip().replace(" ", "").replace("\n", "")
@ -787,8 +787,8 @@ def escape_html(text):
"""Replace <, >, &, " with their HTML encoded representation. Intended to
prevent HTML errors in rendered displaCy markup.
text (unicode): The original text.
RETURNS (unicode): Equivalent text to be safely used within HTML.
text (str): The original text.
RETURNS (str): Equivalent text to be safely used within HTML.
"""
text = text.replace("&", "&amp;")
text = text.replace("<", "&lt;")

View File

@ -57,7 +57,7 @@ cdef class Vectors:
shape (tuple): Size of the table, as (# entries, # columns)
data (numpy.ndarray): The vector data.
keys (iterable): A sequence of keys, aligned with the data.
name (unicode): A name to identify the vectors table.
name (str): A name to identify the vectors table.
RETURNS (Vectors): The newly created object.
DOCS: https://spacy.io/api/vectors#init
@ -237,7 +237,7 @@ cdef class Vectors:
def find(self, *, key=None, keys=None, row=None, rows=None):
"""Look up one or more keys by row, or vice versa.
key (unicode / int): Find the row that the given key points to.
key (str / int): Find the row that the given key points to.
Returns int, -1 if missing.
keys (iterable): Find rows that the keys point to.
Returns ndarray.
@ -352,7 +352,7 @@ cdef class Vectors:
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
path (unicode / Path): A path to a directory, which will be created if
path (str / Path): A path to a directory, which will be created if
it doesn't exist.
DOCS: https://spacy.io/api/vectors#to_disk
@ -372,7 +372,7 @@ cdef class Vectors:
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode / Path): Directory path, string or Path-like object.
path (str / Path): Directory path, string or Path-like object.
RETURNS (Vectors): The modified object.
DOCS: https://spacy.io/api/vectors#from_disk
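A sketch of `find` plus the disk round-trip (paths, keys and the table name are made up):

```python
import numpy
from spacy.vectors import Vectors

data = numpy.zeros((3, 300), dtype="f")
vectors = Vectors(data=data, keys=["cat", "dog", "rat"], name="demo_vectors")
assert vectors.find(key="cat") == 0         # row index, -1 if missing
vectors.to_disk("/tmp/demo_vectors")        # directory created if needed
vectors = Vectors().from_disk("/tmp/demo_vectors")
```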

View File

@ -41,7 +41,7 @@ cdef class Vocab:
strings (StringStore): StringStore that maps strings to integers, and
vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries.
name (unicode): Optional name to identify the vectors table.
name (str): Optional name to identify the vectors table.
RETURNS (Vocab): The newly constructed object.
"""
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@ -97,7 +97,7 @@ cdef class Vocab:
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`.
flag_getter (callable): A function `f(unicode) -> bool`, to get the
flag_getter (callable): A function `f(str) -> bool`, to get the
flag value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest
@ -187,7 +187,7 @@ cdef class Vocab:
def __contains__(self, key):
"""Check whether the string or int key has an entry in the vocabulary.
string (unicode): The ID string.
string (str): The ID string.
RETURNS (bool): Whether the string has an entry in the vocabulary.
DOCS: https://spacy.io/api/vocab#contains

View File

@ -125,21 +125,21 @@ The L2 norm of the lexeme's vector representation.
| Name | Type | Description |
| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | The lexeme's vocabulary. |
| `text` | unicode | Verbatim text content. |
| `text` | str | Verbatim text content. |
| `orth` | int | ID of the verbatim text content. |
| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
| `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
| `rank` | int | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. |
| `flags` | int | Container of the lexeme's binary flags. |
| `norm` | int | The lexeme's norm, i.e. a normalized form of the lexeme text. |
| `norm_` | unicode | The lexemes's norm, i.e. a normalized form of the lexeme text. |
| `norm_` | str | The lexeme's norm, i.e. a normalized form of the lexeme text. |
| `lower` | int | Lowercase form of the word. |
| `lower_` | unicode | Lowercase form of the word. |
| `lower_` | str | Lowercase form of the word. |
| `shape` | int | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
| `shape_` | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
| `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
| `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. |
| `prefix_` | unicode | Length-N substring from the start of the word. Defaults to `N=1`. |
| `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. |
| `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. |
| `suffix_` | unicode | Length-N substring from the start of the word. Defaults to `N=3`. |
| `suffix_` | str | Length-N substring from the end of the word. Defaults to `N=3`. |
| `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. |
| `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in lexeme.text)`. |
| `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. |
@ -159,7 +159,7 @@ The L2 norm of the lexeme's vector representation.
| `is_oov` | bool | Is the lexeme out-of-vocabulary? |
| `is_stop` | bool | Is the lexeme part of a "stop list"? |
| `lang` | int | Language of the parent vocabulary. |
| `lang_` | unicode | Language of the parent vocabulary. |
| `lang_` | str | Language of the parent vocabulary. |
| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). |
| `cluster` | int | Brown cluster ID. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. |

View File

@ -27,7 +27,7 @@ Create the vocabulary.
| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
| `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
| `vectors_name` <Tag variant="new">2.2</Tag> | unicode | A name to identify the vectors table. |
| `vectors_name` <Tag variant="new">2.2</Tag> | str | A name to identify the vectors table. |
| **RETURNS** | `Vocab` | The newly constructed object. |
## Vocab.\_\_len\_\_ {#len tag="method"}
@ -91,10 +91,10 @@ given string, you need to look it up in
> assert oov not in nlp.vocab
> ```
| Name | Type | Description |
| ----------- | ------- | -------------------------------------------------- |
| `string` | unicode | The ID string. |
| **RETURNS** | bool | Whether the string has an entry in the vocabulary. |
| Name | Type | Description |
| ----------- | ---- | -------------------------------------------------- |
| `string` | str | The ID string. |
| **RETURNS** | bool | Whether the string has an entry in the vocabulary. |
## Vocab.add_flag {#add_flag tag="method"}
@ -117,7 +117,7 @@ using `token.check_flag(flag_id)`.
| Name | Type | Description |
| ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| `flag_getter` | dict | A function `f(unicode) -> bool`, to get the flag value. |
| `flag_getter` | callable | A function `f(str) -> bool`, to get the flag value. |
| `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. |
| **RETURNS** | int | The integer ID by which the flag value can be checked. |
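A sketch of `add_flag` with a hypothetical getter function (assumes a loaded `nlp` object):

```python
def is_fruit(text):
    # hypothetical flag getter: f(str) -> bool
    return text.lower() in ("apple", "banana")

IS_FRUIT = nlp.vocab.add_flag(is_fruit)     # returns the chosen bit ID
doc = nlp("I ate an apple")
assert doc[3].check_flag(IS_FRUIT)
```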
@ -227,10 +227,10 @@ Save the current state to a directory.
> nlp.vocab.to_disk("/path/to/vocab")
> ```
| Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| Name | Type | Description |
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Vocab.from_disk {#from_disk tag="method" new="2"}
@ -243,11 +243,11 @@ Loads state from a directory. Modifies the object in place and returns it.
> vocab = Vocab().from_disk("/path/to/vocab")
> ```
| Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Vocab` | The modified `Vocab` object. |
| Name | Type | Description |
| ----------- | ------------ | -------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Vocab` | The modified `Vocab` object. |
## Vocab.to_bytes {#to_bytes tag="method"}

View File

@ -157,19 +157,19 @@ The available token pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are:
| Attribute | Type |  Description |
| -------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------ |
| `ORTH` | unicode | The exact verbatim text of a token. |
| `TEXT` <Tag variant="new">2.1</Tag> | unicode | The exact verbatim text of a token. |
| `LOWER` | unicode | The lowercase form of the token text. |
|  `LENGTH` | int | The length of the token text. |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
| `ENT_TYPE` | unicode | The token's entity label. |
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
| Attribute | Type |  Description |
| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
| `ORTH` | str | The exact verbatim text of a token. |
| `TEXT` <Tag variant="new">2.1</Tag> | str | The exact verbatim text of a token. |
| `LOWER` | str | The lowercase form of the token text. |
|  `LENGTH` | int | The length of the token text. |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
| `ENT_TYPE` | str | The token's entity label. |
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
<Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
@ -1101,21 +1101,28 @@ powerful model packages with binary weights _and_ rules included!
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**,
the EntityRuler calls the nlp object to construct a doc object. This happens in case you try
to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to
extract matches based on the pattern's POS signature.
When using a large number of **phrase patterns** (roughly > 10000) it's useful
to understand how the `add_patterns` function of the EntityRuler works. For each
**phrase pattern**, the EntityRuler calls the nlp object to construct a doc
object. This happens in case you try to add the EntityRuler at the end of an
existing pipeline with, for example, a POS tagger and want to extract matches
based on the pattern's POS signature.
In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler.
In this case you would pass a config value of `phrase_matcher_attr="POS"` for
the EntityRuler.
Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns.
Running the full language pipeline across every pattern in a large list scales
linearly and can therefore take a long time with large numbers of phrase patterns.
As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively.
As of spaCy 2.2.4 the `add_patterns` function has been refactored to use
nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with
5,000-100,000 phrase patterns respectively.
Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time.
Even with this speedup (but especially if you're using an older version) the
`add_patterns` function can still take a long time.
An easy workaround to make this function run faster is disabling the other language pipes
while adding the phrase patterns.
An easy workaround to make this function run faster is disabling the other
language pipes while adding the phrase patterns.
```python
entityruler = EntityRuler(nlp)