mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
unicode -> str consistency
This commit is contained in:
parent
cf156ed2f4
commit
5d3806e059
|
@ -111,8 +111,8 @@ def get_entities(lines, tag_pattern, ner_map=None):
|
||||||
final entity type with `ner_map` if mapping present. Entity tag is 'O' if
|
final entity type with `ner_map` if mapping present. Entity tag is 'O' if
|
||||||
the pattern is not matched.
|
the pattern is not matched.
|
||||||
|
|
||||||
lines (unicode): CONLL-U lines for one sentences
|
lines (str): CoNLL-U lines for one sentence
|
||||||
tag_pattern (unicode): Regex pattern for entity tag
|
tag_pattern (str): Regex pattern for entity tag
|
||||||
ner_map (dict): Map old NER tag names to new ones, '' maps to O.
|
ner_map (dict): Map old NER tag names to new ones, '' maps to O.
|
||||||
RETURNS (list): List of BILUO entity tags
|
RETURNS (list): List of BILUO entity tags
|
||||||
"""
|
"""
|
||||||
|
@ -187,8 +187,8 @@ def example_from_conllu_sentence(
|
||||||
"""Create an Example from the lines for one CoNLL-U sentence, merging
|
"""Create an Example from the lines for one CoNLL-U sentence, merging
|
||||||
subtokens and appending morphology to tags if required.
|
subtokens and appending morphology to tags if required.
|
||||||
|
|
||||||
lines (unicode): The non-comment lines for a CoNLL-U sentence
|
lines (str): The non-comment lines for a CoNLL-U sentence
|
||||||
ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
|
ner_tag_pattern (str): The regex pattern for matching NER in MISC col
|
||||||
RETURNS (Example): An example containing the annotation
|
RETURNS (Example): An example containing the annotation
|
||||||
"""
|
"""
|
||||||
# create a Doc with each subtoken as its own token
|
# create a Doc with each subtoken as its own token
|
||||||
|
|
|
@ -22,13 +22,13 @@ def render(
|
||||||
"""Render displaCy visualisation.
|
"""Render displaCy visualisation.
|
||||||
|
|
||||||
docs (list or Doc): Document(s) to visualise.
|
docs (list or Doc): Document(s) to visualise.
|
||||||
style (unicode): Visualisation style, 'dep' or 'ent'.
|
style (str): Visualisation style, 'dep' or 'ent'.
|
||||||
page (bool): Render markup as full HTML page.
|
page (bool): Render markup as full HTML page.
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
jupyter (bool): Override Jupyter auto-detection.
|
jupyter (bool): Override Jupyter auto-detection.
|
||||||
options (dict): Visualiser-specific options, e.g. colors.
|
options (dict): Visualiser-specific options, e.g. colors.
|
||||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||||
RETURNS (unicode): Rendered HTML markup.
|
RETURNS (str): Rendered HTML markup.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/top-level#displacy.render
|
DOCS: https://spacy.io/api/top-level#displacy.render
|
||||||
USAGE: https://spacy.io/usage/visualizers
|
USAGE: https://spacy.io/usage/visualizers
|
||||||
|
@ -73,13 +73,13 @@ def serve(
|
||||||
"""Serve displaCy visualisation.
|
"""Serve displaCy visualisation.
|
||||||
|
|
||||||
docs (list or Doc): Document(s) to visualise.
|
docs (list or Doc): Document(s) to visualise.
|
||||||
style (unicode): Visualisation style, 'dep' or 'ent'.
|
style (str): Visualisation style, 'dep' or 'ent'.
|
||||||
page (bool): Render markup as full HTML page.
|
page (bool): Render markup as full HTML page.
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
options (dict): Visualiser-specific options, e.g. colors.
|
options (dict): Visualiser-specific options, e.g. colors.
|
||||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||||
port (int): Port to serve visualisation.
|
port (int): Port to serve visualisation.
|
||||||
host (unicode): Host to serve visualisation.
|
host (str): Host to serve visualisation.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/top-level#displacy.serve
|
DOCS: https://spacy.io/api/top-level#displacy.serve
|
||||||
USAGE: https://spacy.io/usage/visualizers
|
USAGE: https://spacy.io/usage/visualizers
|
||||||
|
|
|
@ -47,7 +47,7 @@ class DependencyRenderer(object):
|
||||||
parsed (list): Dependency parses to render.
|
parsed (list): Dependency parses to render.
|
||||||
page (bool): Render parses wrapped as full HTML page.
|
page (bool): Render parses wrapped as full HTML page.
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
RETURNS (unicode): Rendered SVG or HTML markup.
|
RETURNS (str): Rendered SVG or HTML markup.
|
||||||
"""
|
"""
|
||||||
# Create a random ID prefix to make sure parses don't receive the
|
# Create a random ID prefix to make sure parses don't receive the
|
||||||
# same ID, even if they're identical
|
# same ID, even if they're identical
|
||||||
|
@ -78,7 +78,7 @@ class DependencyRenderer(object):
|
||||||
render_id (int): Unique ID, typically index of document.
|
render_id (int): Unique ID, typically index of document.
|
||||||
words (list): Individual words and their tags.
|
words (list): Individual words and their tags.
|
||||||
arcs (list): Individual arcs and their start, end, direction and label.
|
arcs (list): Individual arcs and their start, end, direction and label.
|
||||||
RETURNS (unicode): Rendered SVG markup.
|
RETURNS (str): Rendered SVG markup.
|
||||||
"""
|
"""
|
||||||
self.levels = self.get_levels(arcs)
|
self.levels = self.get_levels(arcs)
|
||||||
self.highest_level = len(self.levels)
|
self.highest_level = len(self.levels)
|
||||||
|
@ -112,10 +112,10 @@ class DependencyRenderer(object):
|
||||||
):
|
):
|
||||||
"""Render individual word.
|
"""Render individual word.
|
||||||
|
|
||||||
text (unicode): Word text.
|
text (str): Word text.
|
||||||
tag (unicode): Part-of-speech tag.
|
tag (str): Part-of-speech tag.
|
||||||
i (int): Unique ID, typically word index.
|
i (int): Unique ID, typically word index.
|
||||||
RETURNS (unicode): Rendered SVG markup.
|
RETURNS (str): Rendered SVG markup.
|
||||||
"""
|
"""
|
||||||
y = self.offset_y + self.word_spacing
|
y = self.offset_y + self.word_spacing
|
||||||
x = self.offset_x + i * self.distance
|
x = self.offset_x + i * self.distance
|
||||||
|
@ -131,12 +131,12 @@ class DependencyRenderer(object):
|
||||||
def render_arrow(self, label, start, end, direction, i):
|
def render_arrow(self, label, start, end, direction, i):
|
||||||
"""Render individual arrow.
|
"""Render individual arrow.
|
||||||
|
|
||||||
label (unicode): Dependency label.
|
label (str): Dependency label.
|
||||||
start (int): Index of start word.
|
start (int): Index of start word.
|
||||||
end (int): Index of end word.
|
end (int): Index of end word.
|
||||||
direction (unicode): Arrow direction, 'left' or 'right'.
|
direction (str): Arrow direction, 'left' or 'right'.
|
||||||
i (int): Unique ID, typically arrow index.
|
i (int): Unique ID, typically arrow index.
|
||||||
RETURNS (unicode): Rendered SVG markup.
|
RETURNS (str): Rendered SVG markup.
|
||||||
"""
|
"""
|
||||||
if start < 0 or end < 0:
|
if start < 0 or end < 0:
|
||||||
error_args = dict(start=start, end=end, label=label, dir=direction)
|
error_args = dict(start=start, end=end, label=label, dir=direction)
|
||||||
|
@ -179,7 +179,7 @@ class DependencyRenderer(object):
|
||||||
y (int): Y-coordinate of arrow start and end point.
|
y (int): Y-coordinate of arrow start and end point.
|
||||||
y_curve (int): Y-coordinate of Cubic Bézier y_curve point.
|
y_curve (int): Y-coordinate of Cubic Bézier y_curve point.
|
||||||
x_end (int): X-coordinate of arrow end point.
|
x_end (int): X-coordinate of arrow end point.
|
||||||
RETURNS (unicode): Definition of the arc path ('d' attribute).
|
RETURNS (str): Definition of the arc path ('d' attribute).
|
||||||
"""
|
"""
|
||||||
template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
|
template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
|
||||||
if self.compact:
|
if self.compact:
|
||||||
|
@ -189,11 +189,11 @@ class DependencyRenderer(object):
|
||||||
def get_arrowhead(self, direction, x, y, end):
|
def get_arrowhead(self, direction, x, y, end):
|
||||||
"""Render individual arrow head.
|
"""Render individual arrow head.
|
||||||
|
|
||||||
direction (unicode): Arrow direction, 'left' or 'right'.
|
direction (str): Arrow direction, 'left' or 'right'.
|
||||||
x (int): X-coordinate of arrow start point.
|
x (int): X-coordinate of arrow start point.
|
||||||
y (int): Y-coordinate of arrow start and end point.
|
y (int): Y-coordinate of arrow start and end point.
|
||||||
end (int): X-coordinate of arrow end point.
|
end (int): X-coordinate of arrow end point.
|
||||||
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
|
RETURNS (str): Definition of the arrow head path ('d' attribute).
|
||||||
"""
|
"""
|
||||||
if direction == "left":
|
if direction == "left":
|
||||||
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
|
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
|
||||||
|
@ -279,7 +279,7 @@ class EntityRenderer(object):
|
||||||
parsed (list): Dependency parses to render.
|
parsed (list): Dependency parses to render.
|
||||||
page (bool): Render parses wrapped as full HTML page.
|
page (bool): Render parses wrapped as full HTML page.
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
RETURNS (unicode): Rendered HTML markup.
|
RETURNS (str): Rendered HTML markup.
|
||||||
"""
|
"""
|
||||||
rendered = []
|
rendered = []
|
||||||
for i, p in enumerate(parsed):
|
for i, p in enumerate(parsed):
|
||||||
|
@ -300,7 +300,7 @@ class EntityRenderer(object):
|
||||||
def render_ents(self, text, spans, title):
|
def render_ents(self, text, spans, title):
|
||||||
"""Render entities in text.
|
"""Render entities in text.
|
||||||
|
|
||||||
text (unicode): Original text.
|
text (str): Original text.
|
||||||
spans (list): Individual entity spans and their start, end and label.
|
spans (list): Individual entity spans and their start, end and label.
|
||||||
title (unicode or None): Document title set in Doc.user_data['title'].
|
title (str or None): Document title set in Doc.user_data['title'].
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -598,7 +598,7 @@ class MatchPatternError(ValueError):
|
||||||
def __init__(self, key, errors):
|
def __init__(self, key, errors):
|
||||||
"""Custom error for validating match patterns.
|
"""Custom error for validating match patterns.
|
||||||
|
|
||||||
key (unicode): The name of the matcher rule.
|
key (str): The name of the matcher rule.
|
||||||
errors (dict): Validation errors (sequence of strings) mapped to pattern
|
errors (dict): Validation errors (sequence of strings) mapped to pattern
|
||||||
ID, i.e. the index of the added pattern.
|
ID, i.e. the index of the added pattern.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
def explain(term):
|
def explain(term):
|
||||||
"""Get a description for a given POS tag, dependency label or entity type.
|
"""Get a description for a given POS tag, dependency label or entity type.
|
||||||
|
|
||||||
term (unicode): The term to explain.
|
term (str): The term to explain.
|
||||||
RETURNS (unicode): The explanation, or `None` if not found in the glossary.
|
RETURNS (str): The explanation, or `None` if not found in the glossary.
|
||||||
|
|
||||||
EXAMPLE:
|
EXAMPLE:
|
||||||
>>> spacy.explain(u'NORP')
|
>>> spacy.explain(u'NORP')
|
||||||
|
|
|
@ -38,7 +38,7 @@ cdef class Candidate:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def entity_(self):
|
def entity_(self):
|
||||||
"""RETURNS (unicode): ID/name of this entity in the KB"""
|
"""RETURNS (str): ID/name of this entity in the KB"""
|
||||||
return self.kb.vocab.strings[self.entity_hash]
|
return self.kb.vocab.strings[self.entity_hash]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -48,7 +48,7 @@ cdef class Candidate:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def alias_(self):
|
def alias_(self):
|
||||||
"""RETURNS (unicode): ID of the original alias"""
|
"""RETURNS (str): ID of the original alias"""
|
||||||
return self.kb.vocab.strings[self.alias_hash]
|
return self.kb.vocab.strings[self.alias_hash]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -122,7 +122,7 @@ class Language(object):
|
||||||
|
|
||||||
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
||||||
object and processing pipeline.
|
object and processing pipeline.
|
||||||
lang (unicode): Two-letter language ID, i.e. ISO code.
|
lang (str): Two-letter language ID, i.e. ISO code.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language
|
DOCS: https://spacy.io/api/language
|
||||||
"""
|
"""
|
||||||
|
@ -287,7 +287,7 @@ class Language(object):
|
||||||
def get_pipe(self, name):
|
def get_pipe(self, name):
|
||||||
"""Get a pipeline component for a given component name.
|
"""Get a pipeline component for a given component name.
|
||||||
|
|
||||||
name (unicode): Name of pipeline component to get.
|
name (str): Name of pipeline component to get.
|
||||||
RETURNS (callable): The pipeline component.
|
RETURNS (callable): The pipeline component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#get_pipe
|
DOCS: https://spacy.io/api/language#get_pipe
|
||||||
|
@ -300,7 +300,7 @@ class Language(object):
|
||||||
def create_pipe(self, name, config=dict()):
|
def create_pipe(self, name, config=dict()):
|
||||||
"""Create a pipeline component from a factory.
|
"""Create a pipeline component from a factory.
|
||||||
|
|
||||||
name (unicode): Factory name to look up in `Language.factories`.
|
name (str): Factory name to look up in `Language.factories`.
|
||||||
config (dict): Configuration parameters to initialise component.
|
config (dict): Configuration parameters to initialise component.
|
||||||
RETURNS (callable): Pipeline component.
|
RETURNS (callable): Pipeline component.
|
||||||
|
|
||||||
|
@ -343,12 +343,12 @@ class Language(object):
|
||||||
of before/after/first/last can be set. Default behaviour is "last".
|
of before/after/first/last can be set. Default behaviour is "last".
|
||||||
|
|
||||||
component (callable): The pipeline component.
|
component (callable): The pipeline component.
|
||||||
name (unicode): Name of pipeline component. Overwrites existing
|
name (str): Name of pipeline component. Overwrites existing
|
||||||
component.name attribute if available. If no name is set and
|
component.name attribute if available. If no name is set and
|
||||||
the component exposes no name attribute, component.__name__ is
|
the component exposes no name attribute, component.__name__ is
|
||||||
used. An error is raised if a name already exists in the pipeline.
|
used. An error is raised if a name already exists in the pipeline.
|
||||||
before (unicode): Component name to insert component directly before.
|
before (str): Component name to insert component directly before.
|
||||||
after (unicode): Component name to insert component directly after.
|
after (str): Component name to insert component directly after.
|
||||||
first (bool): Insert component first / not first in the pipeline.
|
first (bool): Insert component first / not first in the pipeline.
|
||||||
last (bool): Insert component last / not last in the pipeline.
|
last (bool): Insert component last / not last in the pipeline.
|
||||||
|
|
||||||
|
@ -389,7 +389,7 @@ class Language(object):
|
||||||
"""Check if a component name is present in the pipeline. Equivalent to
|
"""Check if a component name is present in the pipeline. Equivalent to
|
||||||
`name in nlp.pipe_names`.
|
`name in nlp.pipe_names`.
|
||||||
|
|
||||||
name (unicode): Name of the component.
|
name (str): Name of the component.
|
||||||
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#has_pipe
|
DOCS: https://spacy.io/api/language#has_pipe
|
||||||
|
@ -399,7 +399,7 @@ class Language(object):
|
||||||
def replace_pipe(self, name, component):
|
def replace_pipe(self, name, component):
|
||||||
"""Replace a component in the pipeline.
|
"""Replace a component in the pipeline.
|
||||||
|
|
||||||
name (unicode): Name of the component to replace.
|
name (str): Name of the component to replace.
|
||||||
component (callable): Pipeline component.
|
component (callable): Pipeline component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#replace_pipe
|
DOCS: https://spacy.io/api/language#replace_pipe
|
||||||
|
@ -418,8 +418,8 @@ class Language(object):
|
||||||
def rename_pipe(self, old_name, new_name):
|
def rename_pipe(self, old_name, new_name):
|
||||||
"""Rename a pipeline component.
|
"""Rename a pipeline component.
|
||||||
|
|
||||||
old_name (unicode): Name of the component to rename.
|
old_name (str): Name of the component to rename.
|
||||||
new_name (unicode): New name of the component.
|
new_name (str): New name of the component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#rename_pipe
|
DOCS: https://spacy.io/api/language#rename_pipe
|
||||||
"""
|
"""
|
||||||
|
@ -433,7 +433,7 @@ class Language(object):
|
||||||
def remove_pipe(self, name):
|
def remove_pipe(self, name):
|
||||||
"""Remove a component from the pipeline.
|
"""Remove a component from the pipeline.
|
||||||
|
|
||||||
name (unicode): Name of the component to remove.
|
name (str): Name of the component to remove.
|
||||||
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#remove_pipe
|
DOCS: https://spacy.io/api/language#remove_pipe
|
||||||
|
@ -450,7 +450,7 @@ class Language(object):
|
||||||
and can contain arbitrary whitespace. Alignment into the original string
|
and can contain arbitrary whitespace. Alignment into the original string
|
||||||
is preserved.
|
is preserved.
|
||||||
|
|
||||||
text (unicode): The text to be processed.
|
text (str): The text to be processed.
|
||||||
disable (list): Names of the pipeline components to disable.
|
disable (list): Names of the pipeline components to disable.
|
||||||
component_cfg (dict): An optional dictionary with extra keyword arguments
|
component_cfg (dict): An optional dictionary with extra keyword arguments
|
||||||
for specific components.
|
for specific components.
|
||||||
|
@ -1086,7 +1086,7 @@ class component(object):
|
||||||
):
|
):
|
||||||
"""Decorate a pipeline component.
|
"""Decorate a pipeline component.
|
||||||
|
|
||||||
name (unicode): Default component and factory name.
|
name (str): Default component and factory name.
|
||||||
assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
|
assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
|
||||||
requires (list): Attributes required by component, e.g. `["token.dep"]`.
|
requires (list): Attributes required by component, e.g. `["token.dep"]`.
|
||||||
retokenizes (bool): Whether the component changes the tokenization.
|
retokenizes (bool): Whether the component changes the tokenization.
|
||||||
|
|
|
@ -29,8 +29,8 @@ class Lemmatizer(object):
|
||||||
def __call__(self, string, univ_pos, morphology=None):
|
def __call__(self, string, univ_pos, morphology=None):
|
||||||
"""Lemmatize a string.
|
"""Lemmatize a string.
|
||||||
|
|
||||||
string (unicode): The string to lemmatize, e.g. the token text.
|
string (str): The string to lemmatize, e.g. the token text.
|
||||||
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
univ_pos (str / int): The token's universal part-of-speech tag.
|
||||||
morphology (dict): The token's morphological features following the
|
morphology (dict): The token's morphological features following the
|
||||||
Universal Dependencies scheme.
|
Universal Dependencies scheme.
|
||||||
RETURNS (list): The available lemmas for the string.
|
RETURNS (list): The available lemmas for the string.
|
||||||
|
@ -69,7 +69,7 @@ class Lemmatizer(object):
|
||||||
Check whether we're dealing with an uninflected paradigm, so we can
|
Check whether we're dealing with an uninflected paradigm, so we can
|
||||||
avoid lemmatization entirely.
|
avoid lemmatization entirely.
|
||||||
|
|
||||||
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
univ_pos (str / int): The token's universal part-of-speech tag.
|
||||||
morphology (dict): The token's morphological features following the
|
morphology (dict): The token's morphological features following the
|
||||||
Universal Dependencies scheme.
|
Universal Dependencies scheme.
|
||||||
"""
|
"""
|
||||||
|
@ -128,10 +128,10 @@ class Lemmatizer(object):
|
||||||
"""Look up a lemma in the table, if available. If no lemma is found,
|
"""Look up a lemma in the table, if available. If no lemma is found,
|
||||||
the original string is returned.
|
the original string is returned.
|
||||||
|
|
||||||
string (unicode): The original string.
|
string (str): The original string.
|
||||||
orth (int): Optional hash of the string to look up. If not set, the
|
orth (int): Optional hash of the string to look up. If not set, the
|
||||||
string will be used and hashed.
|
string will be used and hashed.
|
||||||
RETURNS (unicode): The lemma if the string was found, otherwise the
|
RETURNS (str): The lemma if the string was found, otherwise the
|
||||||
original string.
|
original string.
|
||||||
"""
|
"""
|
||||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||||
|
|
|
@ -190,7 +190,7 @@ cdef class Lexeme:
|
||||||
self.vocab.set_vector(self.c.orth, vector)
|
self.vocab.set_vector(self.c.orth, vector)
|
||||||
|
|
||||||
property rank:
|
property rank:
|
||||||
"""RETURNS (unicode): Sequential ID of the lexemes's lexical type, used
|
"""RETURNS (int): Sequential ID of the lexeme's lexical type, used
|
||||||
to index into tables, e.g. for word vectors."""
|
to index into tables, e.g. for word vectors."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.id
|
return self.c.id
|
||||||
|
@ -209,18 +209,18 @@ cdef class Lexeme:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def orth_(self):
|
def orth_(self):
|
||||||
"""RETURNS (unicode): The original verbatim text of the lexeme
|
"""RETURNS (str): The original verbatim text of the lexeme
|
||||||
(identical to `Lexeme.text`). Exists mostly for consistency with
|
(identical to `Lexeme.text`). Exists mostly for consistency with
|
||||||
the other attributes."""
|
the other attributes."""
|
||||||
return self.vocab.strings[self.c.orth]
|
return self.vocab.strings[self.c.orth]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def text(self):
|
def text(self):
|
||||||
"""RETURNS (unicode): The original verbatim text of the lexeme."""
|
"""RETURNS (str): The original verbatim text of the lexeme."""
|
||||||
return self.orth_
|
return self.orth_
|
||||||
|
|
||||||
property lower:
|
property lower:
|
||||||
"""RETURNS (unicode): Lowercase form of the lexeme."""
|
"""RETURNS (int): String-store ID of the lowercase form of the lexeme."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lower
|
return self.c.lower
|
||||||
|
|
||||||
|
@ -293,7 +293,7 @@ cdef class Lexeme:
|
||||||
self.c.prob = x
|
self.c.prob = x
|
||||||
|
|
||||||
property lower_:
|
property lower_:
|
||||||
"""RETURNS (unicode): Lowercase form of the word."""
|
"""RETURNS (str): Lowercase form of the word."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lower]
|
return self.vocab.strings[self.c.lower]
|
||||||
|
|
||||||
|
@ -301,7 +301,7 @@ cdef class Lexeme:
|
||||||
self.c.lower = self.vocab.strings.add(x)
|
self.c.lower = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property norm_:
|
property norm_:
|
||||||
"""RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the
|
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the
|
||||||
lexeme text.
|
lexeme text.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -311,7 +311,7 @@ cdef class Lexeme:
|
||||||
self.c.norm = self.vocab.strings.add(x)
|
self.c.norm = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property shape_:
|
property shape_:
|
||||||
"""RETURNS (unicode): Transform of the word's string, to show
|
"""RETURNS (str): Transform of the word's string, to show
|
||||||
orthographic features.
|
orthographic features.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -321,7 +321,7 @@ cdef class Lexeme:
|
||||||
self.c.shape = self.vocab.strings.add(x)
|
self.c.shape = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property prefix_:
|
property prefix_:
|
||||||
"""RETURNS (unicode): Length-N substring from the start of the word.
|
"""RETURNS (str): Length-N substring from the start of the word.
|
||||||
Defaults to `N=1`.
|
Defaults to `N=1`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -331,7 +331,7 @@ cdef class Lexeme:
|
||||||
self.c.prefix = self.vocab.strings.add(x)
|
self.c.prefix = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property suffix_:
|
property suffix_:
|
||||||
"""RETURNS (unicode): Length-N substring from the end of the word.
|
"""RETURNS (str): Length-N substring from the end of the word.
|
||||||
Defaults to `N=3`.
|
Defaults to `N=3`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -341,7 +341,7 @@ cdef class Lexeme:
|
||||||
self.c.suffix = self.vocab.strings.add(x)
|
self.c.suffix = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property lang_:
|
property lang_:
|
||||||
"""RETURNS (unicode): Language of the parent vocabulary."""
|
"""RETURNS (str): Language of the parent vocabulary."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lang]
|
return self.vocab.strings[self.c.lang]
|
||||||
|
|
||||||
|
|
|
@ -31,7 +31,7 @@ class Lookups(object):
|
||||||
"""Check if the lookups contain a table of a given name. Delegates to
|
"""Check if the lookups contain a table of a given name. Delegates to
|
||||||
Lookups.has_table.
|
Lookups.has_table.
|
||||||
|
|
||||||
name (unicode): Name of the table.
|
name (str): Name of the table.
|
||||||
RETURNS (bool): Whether a table of that name is in the lookups.
|
RETURNS (bool): Whether a table of that name is in the lookups.
|
||||||
"""
|
"""
|
||||||
return self.has_table(name)
|
return self.has_table(name)
|
||||||
|
@ -48,7 +48,7 @@ class Lookups(object):
|
||||||
def add_table(self, name, data=SimpleFrozenDict()):
|
def add_table(self, name, data=SimpleFrozenDict()):
|
||||||
"""Add a new table to the lookups. Raises an error if the table exists.
|
"""Add a new table to the lookups. Raises an error if the table exists.
|
||||||
|
|
||||||
name (unicode): Unique name of table.
|
name (str): Unique name of table.
|
||||||
data (dict): Optional data to add to the table.
|
data (dict): Optional data to add to the table.
|
||||||
RETURNS (Table): The newly added table.
|
RETURNS (Table): The newly added table.
|
||||||
|
|
||||||
|
@ -64,7 +64,7 @@ class Lookups(object):
|
||||||
"""Get a table. Raises an error if the table doesn't exist and no
|
"""Get a table. Raises an error if the table doesn't exist and no
|
||||||
default value is provided.
|
default value is provided.
|
||||||
|
|
||||||
name (unicode): Name of the table.
|
name (str): Name of the table.
|
||||||
default: Optional default value to return if table doesn't exist.
|
default: Optional default value to return if table doesn't exist.
|
||||||
RETURNS (Table): The table.
|
RETURNS (Table): The table.
|
||||||
|
|
||||||
|
@ -79,7 +79,7 @@ class Lookups(object):
|
||||||
def remove_table(self, name):
|
def remove_table(self, name):
|
||||||
"""Remove a table. Raises an error if the table doesn't exist.
|
"""Remove a table. Raises an error if the table doesn't exist.
|
||||||
|
|
||||||
name (unicode): Name of the table to remove.
|
name (str): Name of the table to remove.
|
||||||
RETURNS (Table): The removed table.
|
RETURNS (Table): The removed table.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#remove_table
|
DOCS: https://spacy.io/api/lookups#remove_table
|
||||||
|
@ -91,7 +91,7 @@ class Lookups(object):
|
||||||
def has_table(self, name):
|
def has_table(self, name):
|
||||||
"""Check if the lookups contain a table of a given name.
|
"""Check if the lookups contain a table of a given name.
|
||||||
|
|
||||||
name (unicode): Name of the table.
|
name (str): Name of the table.
|
||||||
RETURNS (bool): Whether a table of that name exists.
|
RETURNS (bool): Whether a table of that name exists.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#has_table
|
DOCS: https://spacy.io/api/lookups#has_table
|
||||||
|
@ -125,7 +125,7 @@ class Lookups(object):
|
||||||
"""Save the lookups to a directory as lookups.bin. Expects a path to a
|
"""Save the lookups to a directory as lookups.bin. Expects a path to a
|
||||||
directory, which will be created if it doesn't exist.
|
directory, which will be created if it doesn't exist.
|
||||||
|
|
||||||
path (unicode / Path): The file path.
|
path (str / Path): The file path.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#to_disk
|
DOCS: https://spacy.io/api/lookups#to_disk
|
||||||
"""
|
"""
|
||||||
|
@ -141,7 +141,7 @@ class Lookups(object):
|
||||||
"""Load lookups from a directory containing a lookups.bin. Will skip
|
"""Load lookups from a directory containing a lookups.bin. Will skip
|
||||||
loading if the file doesn't exist.
|
loading if the file doesn't exist.
|
||||||
|
|
||||||
path (unicode / Path): The directory path.
|
path (str / Path): The directory path.
|
||||||
RETURNS (Lookups): The loaded lookups.
|
RETURNS (Lookups): The loaded lookups.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#from_disk
|
DOCS: https://spacy.io/api/lookups#from_disk
|
||||||
|
@ -167,7 +167,7 @@ class Table(OrderedDict):
|
||||||
"""Initialize a new table from a dict.
|
"""Initialize a new table from a dict.
|
||||||
|
|
||||||
data (dict): The dictionary.
|
data (dict): The dictionary.
|
||||||
name (unicode): Optional table name for reference.
|
name (str): Optional table name for reference.
|
||||||
RETURNS (Table): The newly created object.
|
RETURNS (Table): The newly created object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/lookups#table.from_dict
|
DOCS: https://spacy.io/api/lookups#table.from_dict
|
||||||
|
@ -179,7 +179,7 @@ class Table(OrderedDict):
|
||||||
def __init__(self, name=None, data=None):
|
def __init__(self, name=None, data=None):
|
||||||
"""Initialize a new table.
|
"""Initialize a new table.
|
||||||
|
|
||||||
name (unicode): Optional table name for reference.
|
name (str): Optional table name for reference.
|
||||||
data (dict): Initial data, used to hint Bloom Filter.
|
data (dict): Initial data, used to hint Bloom Filter.
|
||||||
RETURNS (Table): The newly created object.
|
RETURNS (Table): The newly created object.
|
||||||
|
|
||||||
|
@ -197,7 +197,7 @@ class Table(OrderedDict):
|
||||||
def __setitem__(self, key, value):
|
def __setitem__(self, key, value):
|
||||||
"""Set new key/value pair. String keys will be hashed.
|
"""Set new key/value pair. String keys will be hashed.
|
||||||
|
|
||||||
key (unicode / int): The key to set.
|
key (str / int): The key to set.
|
||||||
value: The value to set.
|
value: The value to set.
|
||||||
"""
|
"""
|
||||||
key = get_string_id(key)
|
key = get_string_id(key)
|
||||||
|
@ -208,7 +208,7 @@ class Table(OrderedDict):
|
||||||
"""Set new key/value pair. String keys will be hashed.
|
"""Set new key/value pair. String keys will be hashed.
|
||||||
Same as table[key] = value.
|
Same as table[key] = value.
|
||||||
|
|
||||||
key (unicode / int): The key to set.
|
key (str / int): The key to set.
|
||||||
value: The value to set.
|
value: The value to set.
|
||||||
"""
|
"""
|
||||||
self[key] = value
|
self[key] = value
|
||||||
|
@ -216,7 +216,7 @@ class Table(OrderedDict):
|
||||||
def __getitem__(self, key):
|
def __getitem__(self, key):
|
||||||
"""Get the value for a given key. String keys will be hashed.
|
"""Get the value for a given key. String keys will be hashed.
|
||||||
|
|
||||||
key (unicode / int): The key to get.
|
key (str / int): The key to get.
|
||||||
RETURNS: The value.
|
RETURNS: The value.
|
||||||
"""
|
"""
|
||||||
key = get_string_id(key)
|
key = get_string_id(key)
|
||||||
|
@ -225,7 +225,7 @@ class Table(OrderedDict):
|
||||||
def get(self, key, default=None):
|
def get(self, key, default=None):
|
||||||
"""Get the value for a given key. String keys will be hashed.
|
"""Get the value for a given key. String keys will be hashed.
|
||||||
|
|
||||||
key (unicode / int): The key to get.
|
key (str / int): The key to get.
|
||||||
default: The default value to return.
|
default: The default value to return.
|
||||||
RETURNS: The value.
|
RETURNS: The value.
|
||||||
"""
|
"""
|
||||||
|
@ -235,7 +235,7 @@ class Table(OrderedDict):
|
||||||
def __contains__(self, key):
|
def __contains__(self, key):
|
||||||
"""Check whether a key is in the table. String keys will be hashed.
|
"""Check whether a key is in the table. String keys will be hashed.
|
||||||
|
|
||||||
key (unicode / int): The key to check.
|
key (str / int): The key to check.
|
||||||
RETURNS (bool): Whether the key is in the table.
|
RETURNS (bool): Whether the key is in the table.
|
||||||
"""
|
"""
|
||||||
key = get_string_id(key)
|
key = get_string_id(key)
|
||||||
|
|
|
@ -66,7 +66,7 @@ cdef class DependencyMatcher:
|
||||||
def __contains__(self, key):
|
def __contains__(self, key):
|
||||||
"""Check whether the matcher contains rules for a match ID.
|
"""Check whether the matcher contains rules for a match ID.
|
||||||
|
|
||||||
key (unicode): The match ID.
|
key (str): The match ID.
|
||||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||||
"""
|
"""
|
||||||
return self._normalize_key(key) in self._patterns
|
return self._normalize_key(key) in self._patterns
|
||||||
|
|
|
@ -63,7 +63,7 @@ cdef class Matcher:
|
||||||
def __contains__(self, key):
|
def __contains__(self, key):
|
||||||
"""Check whether the matcher contains rules for a match ID.
|
"""Check whether the matcher contains rules for a match ID.
|
||||||
|
|
||||||
key (unicode): The match ID.
|
key (str): The match ID.
|
||||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||||
"""
|
"""
|
||||||
return self._normalize_key(key) in self._patterns
|
return self._normalize_key(key) in self._patterns
|
||||||
|
@ -97,7 +97,7 @@ cdef class Matcher:
|
||||||
number of arguments). The on_match callback becomes an optional keyword
|
number of arguments). The on_match callback becomes an optional keyword
|
||||||
argument.
|
argument.
|
||||||
|
|
||||||
key (unicode): The match ID.
|
key (str): The match ID.
|
||||||
patterns (list): The patterns to add for the given key.
|
patterns (list): The patterns to add for the given key.
|
||||||
on_match (callable): Optional callback executed on match.
|
on_match (callable): Optional callback executed on match.
|
||||||
*_patterns (list): For backwards compatibility: list of patterns to add
|
*_patterns (list): For backwards compatibility: list of patterns to add
|
||||||
|
@ -138,7 +138,7 @@ cdef class Matcher:
|
||||||
"""Remove a rule from the matcher. A KeyError is raised if the key does
|
"""Remove a rule from the matcher. A KeyError is raised if the key does
|
||||||
not exist.
|
not exist.
|
||||||
|
|
||||||
key (unicode): The ID of the match rule.
|
key (str): The ID of the match rule.
|
||||||
"""
|
"""
|
||||||
norm_key = self._normalize_key(key)
|
norm_key = self._normalize_key(key)
|
||||||
if not norm_key in self._patterns:
|
if not norm_key in self._patterns:
|
||||||
|
|
|
@ -70,7 +70,7 @@ cdef class PhraseMatcher:
|
||||||
def __contains__(self, key):
|
def __contains__(self, key):
|
||||||
"""Check whether the matcher contains rules for a match ID.
|
"""Check whether the matcher contains rules for a match ID.
|
||||||
|
|
||||||
key (unicode): The match ID.
|
key (str): The match ID.
|
||||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#contains
|
DOCS: https://spacy.io/api/phrasematcher#contains
|
||||||
|
@ -85,7 +85,7 @@ cdef class PhraseMatcher:
|
||||||
"""Remove a rule from the matcher by match ID. A KeyError is raised if
|
"""Remove a rule from the matcher by match ID. A KeyError is raised if
|
||||||
the key does not exist.
|
the key does not exist.
|
||||||
|
|
||||||
key (unicode): The match ID.
|
key (str): The match ID.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#remove
|
DOCS: https://spacy.io/api/phrasematcher#remove
|
||||||
"""
|
"""
|
||||||
|
@ -159,7 +159,7 @@ cdef class PhraseMatcher:
|
||||||
number of arguments). The on_match callback becomes an optional keyword
|
number of arguments). The on_match callback becomes an optional keyword
|
||||||
argument.
|
argument.
|
||||||
|
|
||||||
key (unicode): The match ID.
|
key (str): The match ID.
|
||||||
docs (list): List of `Doc` objects representing match patterns.
|
docs (list): List of `Doc` objects representing match patterns.
|
||||||
on_match (callable): Callback executed on match.
|
on_match (callable): Callback executed on match.
|
||||||
*_docs (Doc): For backwards compatibility: list of patterns to add
|
*_docs (Doc): For backwards compatibility: list of patterns to add
|
||||||
|
|
|
@ -198,8 +198,8 @@ cdef class Morphology:
|
||||||
"""Add a special-case rule to the morphological analyser. Tokens whose
|
"""Add a special-case rule to the morphological analyser. Tokens whose
|
||||||
tag and orth match the rule will receive the specified properties.
|
tag and orth match the rule will receive the specified properties.
|
||||||
|
|
||||||
tag (unicode): The part-of-speech tag to key the exception.
|
tag (str): The part-of-speech tag to key the exception.
|
||||||
orth (unicode): The word-form to key the exception.
|
orth (str): The word-form to key the exception.
|
||||||
"""
|
"""
|
||||||
attrs = dict(attrs)
|
attrs = dict(attrs)
|
||||||
attrs = _normalize_props(attrs)
|
attrs = _normalize_props(attrs)
|
||||||
|
|
|
@ -11,7 +11,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
|
||||||
fulfilled (e.g. if previous components assign the attributes).
|
fulfilled (e.g. if previous components assign the attributes).
|
||||||
|
|
||||||
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
||||||
name (unicode): The name of the pipeline component to analyze.
|
name (str): The name of the pipeline component to analyze.
|
||||||
pipe (callable): The pipeline component function to analyze.
|
pipe (callable): The pipeline component function to analyze.
|
||||||
index (int): The index of the component in the pipeline.
|
index (int): The index of the component in the pipeline.
|
||||||
warn (bool): Show user warning if problem is found.
|
warn (bool): Show user warning if problem is found.
|
||||||
|
@ -125,7 +125,7 @@ def get_assigns_for_attr(pipeline, attr):
|
||||||
"""Get all pipeline components that assign an attr, e.g. "doc.tensor".
|
"""Get all pipeline components that assign an attr, e.g. "doc.tensor".
|
||||||
|
|
||||||
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
||||||
attr (unicode): The attribute to check.
|
attr (str): The attribute to check.
|
||||||
RETURNS (list): (name, pipeline) tuples of components that assign the attr.
|
RETURNS (list): (name, pipeline) tuples of components that assign the attr.
|
||||||
"""
|
"""
|
||||||
return _get_feature_for_attr(pipeline, attr, "assigns")
|
return _get_feature_for_attr(pipeline, attr, "assigns")
|
||||||
|
@ -135,7 +135,7 @@ def get_requires_for_attr(pipeline, attr):
|
||||||
"""Get all pipeline components that require an attr, e.g. "doc.tensor".
|
"""Get all pipeline components that require an attr, e.g. "doc.tensor".
|
||||||
|
|
||||||
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
||||||
attr (unicode): The attribute to check.
|
attr (str): The attribute to check.
|
||||||
RETURNS (list): (name, pipeline) tuples of components that require the attr.
|
RETURNS (list): (name, pipeline) tuples of components that require the attr.
|
||||||
"""
|
"""
|
||||||
return _get_feature_for_attr(pipeline, attr, "requires")
|
return _get_feature_for_attr(pipeline, attr, "requires")
|
||||||
|
|
|
@ -315,7 +315,7 @@ class EntityRuler(object):
|
||||||
"""Load the entity ruler from a file. Expects a file containing
|
"""Load the entity ruler from a file. Expects a file containing
|
||||||
newline-delimited JSON (JSONL) with one entry per line.
|
newline-delimited JSON (JSONL) with one entry per line.
|
||||||
|
|
||||||
path (unicode / Path): The JSONL file to load.
|
path (str / Path): The JSONL file to load.
|
||||||
**kwargs: Other config parameters, mostly for consistency.
|
**kwargs: Other config parameters, mostly for consistency.
|
||||||
|
|
||||||
RETURNS (EntityRuler): The loaded entity ruler.
|
RETURNS (EntityRuler): The loaded entity ruler.
|
||||||
|
@ -351,7 +351,7 @@ class EntityRuler(object):
|
||||||
"""Save the entity ruler patterns to a directory. The patterns will be
|
"""Save the entity ruler patterns to a directory. The patterns will be
|
||||||
saved as newline-delimited JSON (JSONL).
|
saved as newline-delimited JSON (JSONL).
|
||||||
|
|
||||||
path (unicode / Path): The JSONL file to save.
|
path (str / Path): The JSONL file to save.
|
||||||
**kwargs: Other config parameters, mostly for consistency.
|
**kwargs: Other config parameters, mostly for consistency.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#to_disk
|
DOCS: https://spacy.io/api/entityruler#to_disk
|
||||||
|
|
|
@ -50,7 +50,7 @@ def merge_subtokens(doc, label="subtok"):
|
||||||
"""Merge subtokens into a single token.
|
"""Merge subtokens into a single token.
|
||||||
|
|
||||||
doc (Doc): The Doc object.
|
doc (Doc): The Doc object.
|
||||||
label (unicode): The subtoken dependency label.
|
label (str): The subtoken dependency label.
|
||||||
RETURNS (Doc): The Doc object with merged subtokens.
|
RETURNS (Doc): The Doc object with merged subtokens.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
|
DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
|
||||||
|
|
|
@ -152,7 +152,7 @@ cdef class StringStore:
|
||||||
def add(self, string):
|
def add(self, string):
|
||||||
"""Add a string to the StringStore.
|
"""Add a string to the StringStore.
|
||||||
|
|
||||||
string (unicode): The string to add.
|
string (str): The string to add.
|
||||||
RETURNS (uint64): The string's hash value.
|
RETURNS (uint64): The string's hash value.
|
||||||
"""
|
"""
|
||||||
if isinstance(string, unicode):
|
if isinstance(string, unicode):
|
||||||
|
@ -179,7 +179,7 @@ cdef class StringStore:
|
||||||
def __contains__(self, string not None):
|
def __contains__(self, string not None):
|
||||||
"""Check whether a string is in the store.
|
"""Check whether a string is in the store.
|
||||||
|
|
||||||
string (unicode): The string to check.
|
string (str): The string to check.
|
||||||
RETURNS (bool): Whether the store contains the string.
|
RETURNS (bool): Whether the store contains the string.
|
||||||
"""
|
"""
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
|
@ -205,7 +205,7 @@ cdef class StringStore:
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"""Iterate over the strings in the store, in order.
|
"""Iterate over the strings in the store, in order.
|
||||||
|
|
||||||
YIELDS (unicode): A string in the store.
|
YIELDS (str): A string in the store.
|
||||||
"""
|
"""
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
|
|
|
@ -134,7 +134,7 @@ cdef class Tokenizer:
|
||||||
def __call__(self, unicode string):
|
def __call__(self, unicode string):
|
||||||
"""Tokenize a string.
|
"""Tokenize a string.
|
||||||
|
|
||||||
string (unicode): The string to tokenize.
|
string (str): The string to tokenize.
|
||||||
RETURNS (Doc): A container for linguistic annotations.
|
RETURNS (Doc): A container for linguistic annotations.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tokenizer#call
|
DOCS: https://spacy.io/api/tokenizer#call
|
||||||
|
@ -147,7 +147,7 @@ cdef class Tokenizer:
|
||||||
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
|
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
|
||||||
"""Tokenize according to affix and token_match settings.
|
"""Tokenize according to affix and token_match settings.
|
||||||
|
|
||||||
string (unicode): The string to tokenize.
|
string (str): The string to tokenize.
|
||||||
RETURNS (Doc): A container for linguistic annotations.
|
RETURNS (Doc): A container for linguistic annotations.
|
||||||
"""
|
"""
|
||||||
if len(string) >= (2 ** 30):
|
if len(string) >= (2 ** 30):
|
||||||
|
@ -527,7 +527,7 @@ cdef class Tokenizer:
|
||||||
def find_infix(self, unicode string):
|
def find_infix(self, unicode string):
|
||||||
"""Find internal split points of the string, such as hyphens.
|
"""Find internal split points of the string, such as hyphens.
|
||||||
|
|
||||||
string (unicode): The string to segment.
|
string (str): The string to segment.
|
||||||
RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
|
RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
|
||||||
and `.end()` methods, denoting the placement of internal segment
|
and `.end()` methods, denoting the placement of internal segment
|
||||||
separators, e.g. hyphens.
|
separators, e.g. hyphens.
|
||||||
|
@ -542,7 +542,7 @@ cdef class Tokenizer:
|
||||||
"""Find the length of a prefix that should be segmented from the
|
"""Find the length of a prefix that should be segmented from the
|
||||||
string, or None if no prefix rules match.
|
string, or None if no prefix rules match.
|
||||||
|
|
||||||
string (unicode): The string to segment.
|
string (str): The string to segment.
|
||||||
RETURNS (int): The length of the prefix if present, otherwise `None`.
|
RETURNS (int): The length of the prefix if present, otherwise `None`.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tokenizer#find_prefix
|
DOCS: https://spacy.io/api/tokenizer#find_prefix
|
||||||
|
@ -556,7 +556,7 @@ cdef class Tokenizer:
|
||||||
"""Find the length of a suffix that should be segmented from the
|
"""Find the length of a suffix that should be segmented from the
|
||||||
string, or None if no suffix rules match.
|
string, or None if no suffix rules match.
|
||||||
|
|
||||||
string (unicode): The string to segment.
|
string (str): The string to segment.
|
||||||
RETURNS (int): The length of the suffix if present, otherwise `None`.
|
RETURNS (int): The length of the suffix if present, otherwise `None`.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tokenizer#find_suffix
|
DOCS: https://spacy.io/api/tokenizer#find_suffix
|
||||||
|
@ -576,7 +576,7 @@ cdef class Tokenizer:
|
||||||
def _validate_special_case(self, chunk, substrings):
|
def _validate_special_case(self, chunk, substrings):
|
||||||
"""Check whether the `ORTH` fields match the string.
|
"""Check whether the `ORTH` fields match the string.
|
||||||
|
|
||||||
chunk (unicode): The string to specially tokenize.
|
chunk (str): The string to specially tokenize.
|
||||||
substrings (iterable): A sequence of dicts, where each dict describes
|
substrings (iterable): A sequence of dicts, where each dict describes
|
||||||
a token and its attributes.
|
a token and its attributes.
|
||||||
"""
|
"""
|
||||||
|
@ -588,7 +588,7 @@ cdef class Tokenizer:
|
||||||
def add_special_case(self, unicode string, substrings):
|
def add_special_case(self, unicode string, substrings):
|
||||||
"""Add a special-case tokenization rule.
|
"""Add a special-case tokenization rule.
|
||||||
|
|
||||||
string (unicode): The string to specially tokenize.
|
string (str): The string to specially tokenize.
|
||||||
substrings (iterable): A sequence of dicts, where each dict describes
|
substrings (iterable): A sequence of dicts, where each dict describes
|
||||||
a token and its attributes. The `ORTH` fields of the attributes
|
a token and its attributes. The `ORTH` fields of the attributes
|
||||||
must exactly match the string when they are concatenated.
|
must exactly match the string when they are concatenated.
|
||||||
|
@ -629,7 +629,7 @@ cdef class Tokenizer:
|
||||||
produced are identical to `nlp.tokenizer()` except for whitespace
|
produced are identical to `nlp.tokenizer()` except for whitespace
|
||||||
tokens.
|
tokens.
|
||||||
|
|
||||||
string (unicode): The string to tokenize.
|
string (str): The string to tokenize.
|
||||||
RETURNS (list): A list of (pattern_string, token_string) tuples
|
RETURNS (list): A list of (pattern_string, token_string) tuples
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tokenizer#explain
|
DOCS: https://spacy.io/api/tokenizer#explain
|
||||||
|
|
|
@ -107,7 +107,7 @@ cdef class Doc:
|
||||||
def set_extension(cls, name, **kwargs):
|
def set_extension(cls, name, **kwargs):
|
||||||
"""Define a custom attribute which becomes available as `Doc._`.
|
"""Define a custom attribute which becomes available as `Doc._`.
|
||||||
|
|
||||||
name (unicode): Name of the attribute to set.
|
name (str): Name of the attribute to set.
|
||||||
default: Optional default value of the attribute.
|
default: Optional default value of the attribute.
|
||||||
getter (callable): Optional getter function.
|
getter (callable): Optional getter function.
|
||||||
setter (callable): Optional setter function.
|
setter (callable): Optional setter function.
|
||||||
|
@ -125,7 +125,7 @@ cdef class Doc:
|
||||||
def get_extension(cls, name):
|
def get_extension(cls, name):
|
||||||
"""Look up a previously registered extension by name.
|
"""Look up a previously registered extension by name.
|
||||||
|
|
||||||
name (unicode): Name of the extension.
|
name (str): Name of the extension.
|
||||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
|
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/doc#get_extension
|
DOCS: https://spacy.io/api/doc#get_extension
|
||||||
|
@ -136,7 +136,7 @@ cdef class Doc:
|
||||||
def has_extension(cls, name):
|
def has_extension(cls, name):
|
||||||
"""Check whether an extension has been registered.
|
"""Check whether an extension has been registered.
|
||||||
|
|
||||||
name (unicode): Name of the extension.
|
name (str): Name of the extension.
|
||||||
RETURNS (bool): Whether the extension has been registered.
|
RETURNS (bool): Whether the extension has been registered.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/doc#has_extension
|
DOCS: https://spacy.io/api/doc#has_extension
|
||||||
|
@ -147,7 +147,7 @@ cdef class Doc:
|
||||||
def remove_extension(cls, name):
|
def remove_extension(cls, name):
|
||||||
"""Remove a previously registered extension.
|
"""Remove a previously registered extension.
|
||||||
|
|
||||||
name (unicode): Name of the extension.
|
name (str): Name of the extension.
|
||||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
|
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
|
||||||
removed extension.
|
removed extension.
|
||||||
|
|
||||||
|
@ -473,7 +473,7 @@ cdef class Doc:
|
||||||
def text(self):
|
def text(self):
|
||||||
"""A unicode representation of the document text.
|
"""A unicode representation of the document text.
|
||||||
|
|
||||||
RETURNS (unicode): The original verbatim text of the document.
|
RETURNS (str): The original verbatim text of the document.
|
||||||
"""
|
"""
|
||||||
return "".join(t.text_with_ws for t in self)
|
return "".join(t.text_with_ws for t in self)
|
||||||
|
|
||||||
|
@ -482,7 +482,7 @@ cdef class Doc:
|
||||||
"""An alias of `Doc.text`, provided for duck-type compatibility with
|
"""An alias of `Doc.text`, provided for duck-type compatibility with
|
||||||
`Span` and `Token`.
|
`Span` and `Token`.
|
||||||
|
|
||||||
RETURNS (unicode): The original verbatim text of the document.
|
RETURNS (str): The original verbatim text of the document.
|
||||||
"""
|
"""
|
||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
|
@ -628,7 +628,7 @@ cdef class Doc:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def lang_(self):
|
def lang_(self):
|
||||||
"""RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'."""
|
"""RETURNS (str): Language of the doc's vocabulary, e.g. 'en'."""
|
||||||
return self.vocab.lang
|
return self.vocab.lang
|
||||||
|
|
||||||
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
|
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
|
||||||
|
|
|
@ -33,7 +33,7 @@ cdef class Span:
|
||||||
def set_extension(cls, name, **kwargs):
|
def set_extension(cls, name, **kwargs):
|
||||||
"""Define a custom attribute which becomes available as `Span._`.
|
"""Define a custom attribute which becomes available as `Span._`.
|
||||||
|
|
||||||
name (unicode): Name of the attribute to set.
|
name (str): Name of the attribute to set.
|
||||||
default: Optional default value of the attribute.
|
default: Optional default value of the attribute.
|
||||||
getter (callable): Optional getter function.
|
getter (callable): Optional getter function.
|
||||||
setter (callable): Optional setter function.
|
setter (callable): Optional setter function.
|
||||||
|
@ -51,7 +51,7 @@ cdef class Span:
|
||||||
def get_extension(cls, name):
|
def get_extension(cls, name):
|
||||||
"""Look up a previously registered extension by name.
|
"""Look up a previously registered extension by name.
|
||||||
|
|
||||||
name (unicode): Name of the extension.
|
name (str): Name of the extension.
|
||||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
|
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/span#get_extension
|
DOCS: https://spacy.io/api/span#get_extension
|
||||||
|
@ -62,7 +62,7 @@ cdef class Span:
|
||||||
def has_extension(cls, name):
|
def has_extension(cls, name):
|
||||||
"""Check whether an extension has been registered.
|
"""Check whether an extension has been registered.
|
||||||
|
|
||||||
name (unicode): Name of the extension.
|
name (str): Name of the extension.
|
||||||
RETURNS (bool): Whether the extension has been registered.
|
RETURNS (bool): Whether the extension has been registered.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/span#has_extension
|
DOCS: https://spacy.io/api/span#has_extension
|
||||||
|
@ -73,7 +73,7 @@ cdef class Span:
|
||||||
def remove_extension(cls, name):
|
def remove_extension(cls, name):
|
||||||
"""Remove a previously registered extension.
|
"""Remove a previously registered extension.
|
||||||
|
|
||||||
name (unicode): Name of the extension.
|
name (str): Name of the extension.
|
||||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
|
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
|
||||||
removed extension.
|
removed extension.
|
||||||
|
|
||||||
|
@ -501,7 +501,7 @@ cdef class Span:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def text(self):
|
def text(self):
|
||||||
"""RETURNS (unicode): The original verbatim text of the span."""
|
"""RETURNS (str): The original verbatim text of the span."""
|
||||||
text = self.text_with_ws
|
text = self.text_with_ws
|
||||||
if self[-1].whitespace_:
|
if self[-1].whitespace_:
|
||||||
text = text[:-1]
|
text = text[:-1]
|
||||||
|
@ -512,7 +512,7 @@ cdef class Span:
|
||||||
"""The text content of the span with a trailing whitespace character if
|
"""The text content of the span with a trailing whitespace character if
|
||||||
the last token has one.
|
the last token has one.
|
||||||
|
|
||||||
RETURNS (unicode): The text content of the span (with trailing
|
RETURNS (str): The text content of the span (with trailing
|
||||||
whitespace).
|
whitespace).
|
||||||
"""
|
"""
|
||||||
return "".join([t.text_with_ws for t in self])
|
return "".join([t.text_with_ws for t in self])
|
||||||
|
@ -688,7 +688,7 @@ cdef class Span:
|
||||||
raise NotImplementedError(TempErrors.T007.format(attr="ent_id"))
|
raise NotImplementedError(TempErrors.T007.format(attr="ent_id"))
|
||||||
|
|
||||||
property ent_id_:
|
property ent_id_:
|
||||||
"""RETURNS (unicode): The (string) entity ID."""
|
"""RETURNS (str): The (string) entity ID."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.root.ent_id_
|
return self.root.ent_id_
|
||||||
|
|
||||||
|
@ -700,12 +700,12 @@ cdef class Span:
|
||||||
"""Verbatim text content (identical to `Span.text`). Exists mostly for
|
"""Verbatim text content (identical to `Span.text`). Exists mostly for
|
||||||
consistency with other attributes.
|
consistency with other attributes.
|
||||||
|
|
||||||
RETURNS (unicode): The span's text."""
|
RETURNS (str): The span's text."""
|
||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def lemma_(self):
|
def lemma_(self):
|
||||||
"""RETURNS (unicode): The span's lemma."""
|
"""RETURNS (str): The span's lemma."""
|
||||||
return " ".join([t.lemma_ for t in self]).strip()
|
return " ".join([t.lemma_ for t in self]).strip()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -724,7 +724,7 @@ cdef class Span:
|
||||||
return "".join([t.text_with_ws for t in self])
|
return "".join([t.text_with_ws for t in self])
|
||||||
|
|
||||||
property label_:
|
property label_:
|
||||||
"""RETURNS (unicode): The span's label."""
|
"""RETURNS (str): The span's label."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.doc.vocab.strings[self.label]
|
return self.doc.vocab.strings[self.label]
|
||||||
|
|
||||||
|
@ -734,7 +734,7 @@ cdef class Span:
|
||||||
raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))
|
raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))
|
||||||
|
|
||||||
property kb_id_:
|
property kb_id_:
|
||||||
"""RETURNS (unicode): The named entity's KB ID."""
|
"""RETURNS (str): The named entity's KB ID."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.doc.vocab.strings[self.kb_id]
|
return self.doc.vocab.strings[self.kb_id]
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ cdef class Token:
|
||||||
def set_extension(cls, name, **kwargs):
|
def set_extension(cls, name, **kwargs):
|
||||||
"""Define a custom attribute which becomes available as `Token._`.
|
"""Define a custom attribute which becomes available as `Token._`.
|
||||||
|
|
||||||
name (unicode): Name of the attribute to set.
|
name (str): Name of the attribute to set.
|
||||||
default: Optional default value of the attribute.
|
default: Optional default value of the attribute.
|
||||||
getter (callable): Optional getter function.
|
getter (callable): Optional getter function.
|
||||||
setter (callable): Optional setter function.
|
setter (callable): Optional setter function.
|
||||||
|
@ -54,7 +54,7 @@ cdef class Token:
|
||||||
def get_extension(cls, name):
|
def get_extension(cls, name):
|
||||||
"""Look up a previously registered extension by name.
|
"""Look up a previously registered extension by name.
|
||||||
|
|
||||||
name (unicode): Name of the extension.
|
name (str): Name of the extension.
|
||||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
|
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/token#get_extension
|
DOCS: https://spacy.io/api/token#get_extension
|
||||||
|
@ -65,7 +65,7 @@ cdef class Token:
|
||||||
def has_extension(cls, name):
|
def has_extension(cls, name):
|
||||||
"""Check whether an extension has been registered.
|
"""Check whether an extension has been registered.
|
||||||
|
|
||||||
name (unicode): Name of the extension.
|
name (str): Name of the extension.
|
||||||
RETURNS (bool): Whether the extension has been registered.
|
RETURNS (bool): Whether the extension has been registered.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/token#has_extension
|
DOCS: https://spacy.io/api/token#has_extension
|
||||||
|
@ -76,7 +76,7 @@ cdef class Token:
|
||||||
def remove_extension(cls, name):
|
def remove_extension(cls, name):
|
||||||
"""Remove a previously registered extension.
|
"""Remove a previously registered extension.
|
||||||
|
|
||||||
name (unicode): Name of the extension.
|
name (str): Name of the extension.
|
||||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
|
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
|
||||||
removed extension.
|
removed extension.
|
||||||
|
|
||||||
|
@ -244,12 +244,12 @@ cdef class Token:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def text(self):
|
def text(self):
|
||||||
"""RETURNS (unicode): The original verbatim text of the token."""
|
"""RETURNS (str): The original verbatim text of the token."""
|
||||||
return self.orth_
|
return self.orth_
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def text_with_ws(self):
|
def text_with_ws(self):
|
||||||
"""RETURNS (unicode): The text content of the token (with trailing
|
"""RETURNS (str): The text content of the token (with trailing
|
||||||
whitespace).
|
whitespace).
|
||||||
"""
|
"""
|
||||||
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
|
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
|
||||||
|
@ -740,7 +740,7 @@ cdef class Token:
|
||||||
self.c.ent_type = ent_type
|
self.c.ent_type = ent_type
|
||||||
|
|
||||||
property ent_type_:
|
property ent_type_:
|
||||||
"""RETURNS (unicode): Named entity type."""
|
"""RETURNS (str): Named entity type."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.ent_type]
|
return self.vocab.strings[self.c.ent_type]
|
||||||
|
|
||||||
|
@ -763,7 +763,7 @@ cdef class Token:
|
||||||
and "" means no entity tag is set. "B" with an empty ent_type
|
and "" means no entity tag is set. "B" with an empty ent_type
|
||||||
means that the token is blocked from further processing by NER.
|
means that the token is blocked from further processing by NER.
|
||||||
|
|
||||||
RETURNS (unicode): IOB code of named entity tag.
|
RETURNS (str): IOB code of named entity tag.
|
||||||
"""
|
"""
|
||||||
iob_strings = ("", "I", "O", "B")
|
iob_strings = ("", "I", "O", "B")
|
||||||
return iob_strings[self.c.ent_iob]
|
return iob_strings[self.c.ent_iob]
|
||||||
|
@ -779,7 +779,7 @@ cdef class Token:
|
||||||
self.c.ent_id = key
|
self.c.ent_id = key
|
||||||
|
|
||||||
property ent_id_:
|
property ent_id_:
|
||||||
"""RETURNS (unicode): ID of the entity the token is an instance of,
|
"""RETURNS (str): ID of the entity the token is an instance of,
|
||||||
if any.
|
if any.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -797,7 +797,7 @@ cdef class Token:
|
||||||
self.c.ent_kb_id = ent_kb_id
|
self.c.ent_kb_id = ent_kb_id
|
||||||
|
|
||||||
property ent_kb_id_:
|
property ent_kb_id_:
|
||||||
"""RETURNS (unicode): Named entity KB ID."""
|
"""RETURNS (str): Named entity KB ID."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.ent_kb_id]
|
return self.vocab.strings[self.c.ent_kb_id]
|
||||||
|
|
||||||
|
@ -806,12 +806,12 @@ cdef class Token:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def whitespace_(self):
|
def whitespace_(self):
|
||||||
"""RETURNS (unicode): The trailing whitespace character, if present."""
|
"""RETURNS (str): The trailing whitespace character, if present."""
|
||||||
return " " if self.c.spacy else ""
|
return " " if self.c.spacy else ""
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def orth_(self):
|
def orth_(self):
|
||||||
"""RETURNS (unicode): Verbatim text content (identical to
|
"""RETURNS (str): Verbatim text content (identical to
|
||||||
`Token.text`). Exists mostly for consistency with the other
|
`Token.text`). Exists mostly for consistency with the other
|
||||||
attributes.
|
attributes.
|
||||||
"""
|
"""
|
||||||
|
@ -819,13 +819,13 @@ cdef class Token:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def lower_(self):
|
def lower_(self):
|
||||||
"""RETURNS (unicode): The lowercase token text. Equivalent to
|
"""RETURNS (str): The lowercase token text. Equivalent to
|
||||||
`Token.text.lower()`.
|
`Token.text.lower()`.
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.lex.lower]
|
return self.vocab.strings[self.c.lex.lower]
|
||||||
|
|
||||||
property norm_:
|
property norm_:
|
||||||
"""RETURNS (unicode): The token's norm, i.e. a normalised form of the
|
"""RETURNS (str): The token's norm, i.e. a normalised form of the
|
||||||
token text. Usually set in the language's tokenizer exceptions or
|
token text. Usually set in the language's tokenizer exceptions or
|
||||||
norm exceptions.
|
norm exceptions.
|
||||||
"""
|
"""
|
||||||
|
@ -837,34 +837,34 @@ cdef class Token:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def shape_(self):
|
def shape_(self):
|
||||||
"""RETURNS (unicode): Transform of the token's string, to show
|
"""RETURNS (str): Transform of the token's string, to show
|
||||||
orthographic features. For example, "Xxxx" or "dd".
|
orthographic features. For example, "Xxxx" or "dd".
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.lex.shape]
|
return self.vocab.strings[self.c.lex.shape]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def prefix_(self):
|
def prefix_(self):
|
||||||
"""RETURNS (unicode): A length-N substring from the start of the token.
|
"""RETURNS (str): A length-N substring from the start of the token.
|
||||||
Defaults to `N=1`.
|
Defaults to `N=1`.
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.lex.prefix]
|
return self.vocab.strings[self.c.lex.prefix]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def suffix_(self):
|
def suffix_(self):
|
||||||
"""RETURNS (unicode): A length-N substring from the end of the token.
|
"""RETURNS (str): A length-N substring from the end of the token.
|
||||||
Defaults to `N=3`.
|
Defaults to `N=3`.
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.lex.suffix]
|
return self.vocab.strings[self.c.lex.suffix]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def lang_(self):
|
def lang_(self):
|
||||||
"""RETURNS (unicode): Language of the parent document's vocabulary,
|
"""RETURNS (str): Language of the parent document's vocabulary,
|
||||||
e.g. 'en'.
|
e.g. 'en'.
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.lex.lang]
|
return self.vocab.strings[self.c.lex.lang]
|
||||||
|
|
||||||
property lemma_:
|
property lemma_:
|
||||||
"""RETURNS (unicode): The token lemma, i.e. the base form of the word,
|
"""RETURNS (str): The token lemma, i.e. the base form of the word,
|
||||||
with no inflectional suffixes.
|
with no inflectional suffixes.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -877,7 +877,7 @@ cdef class Token:
|
||||||
self.c.lemma = self.vocab.strings.add(lemma_)
|
self.c.lemma = self.vocab.strings.add(lemma_)
|
||||||
|
|
||||||
property pos_:
|
property pos_:
|
||||||
"""RETURNS (unicode): Coarse-grained part-of-speech tag."""
|
"""RETURNS (str): Coarse-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return parts_of_speech.NAMES[self.c.pos]
|
return parts_of_speech.NAMES[self.c.pos]
|
||||||
|
|
||||||
|
@ -885,7 +885,7 @@ cdef class Token:
|
||||||
self.c.pos = parts_of_speech.IDS[pos_name]
|
self.c.pos = parts_of_speech.IDS[pos_name]
|
||||||
|
|
||||||
property tag_:
|
property tag_:
|
||||||
"""RETURNS (unicode): Fine-grained part-of-speech tag."""
|
"""RETURNS (str): Fine-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.tag]
|
return self.vocab.strings[self.c.tag]
|
||||||
|
|
||||||
|
@ -893,7 +893,7 @@ cdef class Token:
|
||||||
self.tag = self.vocab.strings.add(tag)
|
self.tag = self.vocab.strings.add(tag)
|
||||||
|
|
||||||
property dep_:
|
property dep_:
|
||||||
"""RETURNS (unicode): The syntactic dependency label."""
|
"""RETURNS (str): The syntactic dependency label."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.dep]
|
return self.vocab.strings[self.c.dep]
|
||||||
|
|
||||||
|
|
|
@ -58,7 +58,7 @@ def lang_class_is_loaded(lang):
|
||||||
loaded lazily, to avoid expensive setup code associated with the language
|
loaded lazily, to avoid expensive setup code associated with the language
|
||||||
data.
|
data.
|
||||||
|
|
||||||
lang (unicode): Two-letter language code, e.g. 'en'.
|
lang (str): Two-letter language code, e.g. 'en'.
|
||||||
RETURNS (bool): Whether a Language class has been loaded.
|
RETURNS (bool): Whether a Language class has been loaded.
|
||||||
"""
|
"""
|
||||||
return lang in registry.languages
|
return lang in registry.languages
|
||||||
|
@ -67,7 +67,7 @@ def lang_class_is_loaded(lang):
|
||||||
def get_lang_class(lang):
|
def get_lang_class(lang):
|
||||||
"""Import and load a Language class.
|
"""Import and load a Language class.
|
||||||
|
|
||||||
lang (unicode): Two-letter language code, e.g. 'en'.
|
lang (str): Two-letter language code, e.g. 'en'.
|
||||||
RETURNS (Language): Language class.
|
RETURNS (Language): Language class.
|
||||||
"""
|
"""
|
||||||
# Check if language is registered / entry point is available
|
# Check if language is registered / entry point is available
|
||||||
|
@ -85,7 +85,7 @@ def get_lang_class(lang):
|
||||||
def set_lang_class(name, cls):
|
def set_lang_class(name, cls):
|
||||||
"""Set a custom Language class name that can be loaded via get_lang_class.
|
"""Set a custom Language class name that can be loaded via get_lang_class.
|
||||||
|
|
||||||
name (unicode): Name of Language class.
|
name (str): Name of Language class.
|
||||||
cls (Language): Language class.
|
cls (Language): Language class.
|
||||||
"""
|
"""
|
||||||
registry.languages.register(name, func=cls)
|
registry.languages.register(name, func=cls)
|
||||||
|
@ -107,7 +107,7 @@ def load_language_data(path):
|
||||||
"""Load JSON language data using the given path as a base. If the provided
|
"""Load JSON language data using the given path as a base. If the provided
|
||||||
path isn't present, will attempt to load a gzipped version before giving up.
|
path isn't present, will attempt to load a gzipped version before giving up.
|
||||||
|
|
||||||
path (unicode / Path): The data to load.
|
path (str / Path): The data to load.
|
||||||
RETURNS: The loaded data.
|
RETURNS: The loaded data.
|
||||||
"""
|
"""
|
||||||
path = ensure_path(path)
|
path = ensure_path(path)
|
||||||
|
@ -128,7 +128,7 @@ def get_module_path(module):
|
||||||
def load_model(name, **overrides):
|
def load_model(name, **overrides):
|
||||||
"""Load a model from a package or data path.
|
"""Load a model from a package or data path.
|
||||||
|
|
||||||
name (unicode): Package name or model path.
|
name (str): Package name or model path.
|
||||||
**overrides: Specific overrides, like pipeline components to disable.
|
**overrides: Specific overrides, like pipeline components to disable.
|
||||||
RETURNS (Language): `Language` class with the loaded model.
|
RETURNS (Language): `Language` class with the loaded model.
|
||||||
"""
|
"""
|
||||||
|
@ -202,7 +202,7 @@ def load_model_from_init_py(init_file, **overrides):
|
||||||
"""Helper function to use in the `load()` method of a model package's
|
"""Helper function to use in the `load()` method of a model package's
|
||||||
__init__.py.
|
__init__.py.
|
||||||
|
|
||||||
init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
|
init_file (str): Path to model's __init__.py, i.e. `__file__`.
|
||||||
**overrides: Specific overrides, like pipeline components to disable.
|
**overrides: Specific overrides, like pipeline components to disable.
|
||||||
RETURNS (Language): `Language` class with loaded model.
|
RETURNS (Language): `Language` class with loaded model.
|
||||||
"""
|
"""
|
||||||
|
@ -227,8 +227,8 @@ def get_package_version(name):
|
||||||
"""Get the version of an installed package. Typically used to get model
|
"""Get the version of an installed package. Typically used to get model
|
||||||
package versions.
|
package versions.
|
||||||
|
|
||||||
name (unicode): The name of the installed Python package.
|
name (str): The name of the installed Python package.
|
||||||
RETURNS (unicode / None): The version or None if package not installed.
|
RETURNS (str / None): The version or None if package not installed.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
return importlib_metadata.version(name)
|
return importlib_metadata.version(name)
|
||||||
|
@ -338,7 +338,7 @@ def get_model_config(path):
|
||||||
def is_package(name):
|
def is_package(name):
|
||||||
"""Check if string maps to a package installed via pip.
|
"""Check if string maps to a package installed via pip.
|
||||||
|
|
||||||
name (unicode): Name of package.
|
name (str): Name of package.
|
||||||
RETURNS (bool): True if installed package, False if not.
|
RETURNS (bool): True if installed package, False if not.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
@ -351,7 +351,7 @@ def is_package(name):
|
||||||
def get_package_path(name):
|
def get_package_path(name):
|
||||||
"""Get the path to an installed package.
|
"""Get the path to an installed package.
|
||||||
|
|
||||||
name (unicode): Package name.
|
name (str): Package name.
|
||||||
RETURNS (Path): Path to installed package.
|
RETURNS (Path): Path to installed package.
|
||||||
"""
|
"""
|
||||||
name = name.lower() # use lowercase version to be safe
|
name = name.lower() # use lowercase version to be safe
|
||||||
|
@ -526,8 +526,8 @@ def expand_exc(excs, search, replace):
|
||||||
For example, to add additional versions with typographic apostrophes.
|
For example, to add additional versions with typographic apostrophes.
|
||||||
|
|
||||||
excs (dict): Tokenizer exceptions.
|
excs (dict): Tokenizer exceptions.
|
||||||
search (unicode): String to find and replace.
|
search (str): String to find and replace.
|
||||||
replace (unicode): Replacement.
|
replace (str): Replacement.
|
||||||
RETURNS (dict): Combined tokenizer exceptions.
|
RETURNS (dict): Combined tokenizer exceptions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -761,8 +761,8 @@ def from_disk(path, readers, exclude):
|
||||||
def import_file(name, loc):
|
def import_file(name, loc):
|
||||||
"""Import module from a file. Used to load models from a directory.
|
"""Import module from a file. Used to load models from a directory.
|
||||||
|
|
||||||
name (unicode): Name of module to load.
|
name (str): Name of module to load.
|
||||||
loc (unicode / Path): Path to the file.
|
loc (str / Path): Path to the file.
|
||||||
RETURNS: The loaded module.
|
RETURNS: The loaded module.
|
||||||
"""
|
"""
|
||||||
loc = str(loc)
|
loc = str(loc)
|
||||||
|
@ -777,8 +777,8 @@ def minify_html(html):
|
||||||
Disclaimer: NOT a general-purpose solution, only removes indentation and
|
Disclaimer: NOT a general-purpose solution, only removes indentation and
|
||||||
newlines.
|
newlines.
|
||||||
|
|
||||||
html (unicode): Markup to minify.
|
html (str): Markup to minify.
|
||||||
RETURNS (unicode): "Minified" HTML.
|
RETURNS (str): "Minified" HTML.
|
||||||
"""
|
"""
|
||||||
return html.strip().replace(" ", "").replace("\n", "")
|
return html.strip().replace(" ", "").replace("\n", "")
|
||||||
|
|
||||||
|
@ -787,8 +787,8 @@ def escape_html(text):
|
||||||
"""Replace <, >, &, " with their HTML encoded representation. Intended to
|
"""Replace <, >, &, " with their HTML encoded representation. Intended to
|
||||||
prevent HTML errors in rendered displaCy markup.
|
prevent HTML errors in rendered displaCy markup.
|
||||||
|
|
||||||
text (unicode): The original text.
|
text (str): The original text.
|
||||||
RETURNS (unicode): Equivalent text to be safely used within HTML.
|
RETURNS (str): Equivalent text to be safely used within HTML.
|
||||||
"""
|
"""
|
||||||
text = text.replace("&", "&amp;")
|
text = text.replace("&", "&amp;")
|
||||||
text = text.replace("<", "&lt;")
|
text = text.replace("<", "&lt;")
|
||||||
|
|
|
@ -57,7 +57,7 @@ cdef class Vectors:
|
||||||
shape (tuple): Size of the table, as (# entries, # columns)
|
shape (tuple): Size of the table, as (# entries, # columns)
|
||||||
data (numpy.ndarray): The vector data.
|
data (numpy.ndarray): The vector data.
|
||||||
keys (iterable): A sequence of keys, aligned with the data.
|
keys (iterable): A sequence of keys, aligned with the data.
|
||||||
name (unicode): A name to identify the vectors table.
|
name (str): A name to identify the vectors table.
|
||||||
RETURNS (Vectors): The newly created object.
|
RETURNS (Vectors): The newly created object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors#init
|
DOCS: https://spacy.io/api/vectors#init
|
||||||
|
@ -237,7 +237,7 @@ cdef class Vectors:
|
||||||
def find(self, *, key=None, keys=None, row=None, rows=None):
|
def find(self, *, key=None, keys=None, row=None, rows=None):
|
||||||
"""Look up one or more keys by row, or vice versa.
|
"""Look up one or more keys by row, or vice versa.
|
||||||
|
|
||||||
key (unicode / int): Find the row that the given key points to.
|
key (str / int): Find the row that the given key points to.
|
||||||
Returns int, -1 if missing.
|
Returns int, -1 if missing.
|
||||||
keys (iterable): Find rows that the keys point to.
|
keys (iterable): Find rows that the keys point to.
|
||||||
Returns ndarray.
|
Returns ndarray.
|
||||||
|
@ -352,7 +352,7 @@ cdef class Vectors:
|
||||||
def to_disk(self, path, **kwargs):
|
def to_disk(self, path, **kwargs):
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
path (unicode / Path): A path to a directory, which will be created if
|
path (str / Path): A path to a directory, which will be created if
|
||||||
it doesn't exist.
|
it doesn't exist.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors#to_disk
|
DOCS: https://spacy.io/api/vectors#to_disk
|
||||||
|
@ -372,7 +372,7 @@ cdef class Vectors:
|
||||||
"""Loads state from a directory. Modifies the object in place and
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
returns it.
|
returns it.
|
||||||
|
|
||||||
path (unicode / Path): Directory path, string or Path-like object.
|
path (str / Path): Directory path, string or Path-like object.
|
||||||
RETURNS (Vectors): The modified object.
|
RETURNS (Vectors): The modified object.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors#from_disk
|
DOCS: https://spacy.io/api/vectors#from_disk
|
||||||
|
|
|
@ -41,7 +41,7 @@ cdef class Vocab:
|
||||||
strings (StringStore): StringStore that maps strings to integers, and
|
strings (StringStore): StringStore that maps strings to integers, and
|
||||||
vice versa.
|
vice versa.
|
||||||
lookups (Lookups): Container for large lookup tables and dictionaries.
|
lookups (Lookups): Container for large lookup tables and dictionaries.
|
||||||
name (unicode): Optional name to identify the vectors table.
|
name (str): Optional name to identify the vectors table.
|
||||||
RETURNS (Vocab): The newly constructed object.
|
RETURNS (Vocab): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
||||||
|
@ -97,7 +97,7 @@ cdef class Vocab:
|
||||||
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
|
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
|
||||||
`Token.check_flag`.
|
`Token.check_flag`.
|
||||||
|
|
||||||
flag_getter (callable): A function `f(unicode) -> bool`, to get the
|
flag_getter (callable): A function `f(str) -> bool`, to get the
|
||||||
flag value.
|
flag value.
|
||||||
flag_id (int): An integer between 1 and 63 (inclusive), specifying
|
flag_id (int): An integer between 1 and 63 (inclusive), specifying
|
||||||
the bit at which the flag will be stored. If -1, the lowest
|
the bit at which the flag will be stored. If -1, the lowest
|
||||||
|
@ -187,7 +187,7 @@ cdef class Vocab:
|
||||||
def __contains__(self, key):
|
def __contains__(self, key):
|
||||||
"""Check whether the string or int key has an entry in the vocabulary.
|
"""Check whether the string or int key has an entry in the vocabulary.
|
||||||
|
|
||||||
string (unicode): The ID string.
|
string (str): The ID string.
|
||||||
RETURNS (bool): Whether the string has an entry in the vocabulary.
|
RETURNS (bool): Whether the string has an entry in the vocabulary.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vocab#contains
|
DOCS: https://spacy.io/api/vocab#contains
|
||||||
|
|
|
@ -125,21 +125,21 @@ The L2 norm of the lexeme's vector representation.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `vocab` | `Vocab` | The lexeme's vocabulary. |
|
| `vocab` | `Vocab` | The lexeme's vocabulary. |
|
||||||
| `text` | unicode | Verbatim text content. |
|
| `text` | str | Verbatim text content. |
|
||||||
| `orth` | int | ID of the verbatim text content. |
|
| `orth` | int | ID of the verbatim text content. |
|
||||||
| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
|
| `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
|
||||||
| `rank` | int | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. |
|
| `rank` | int | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. |
|
||||||
| `flags` | int | Container of the lexeme's binary flags. |
|
| `flags` | int | Container of the lexeme's binary flags. |
|
||||||
| `norm` | int | The lexeme's norm, i.e. a normalized form of the lexeme text. |
|
| `norm` | int | The lexeme's norm, i.e. a normalized form of the lexeme text. |
|
||||||
| `norm_` | unicode | The lexeme's norm, i.e. a normalized form of the lexeme text. |
|
| `norm_` | str | The lexeme's norm, i.e. a normalized form of the lexeme text. |
|
||||||
| `lower` | int | Lowercase form of the word. |
|
| `lower` | int | Lowercase form of the word. |
|
||||||
| `lower_` | unicode | Lowercase form of the word. |
|
| `lower_` | str | Lowercase form of the word. |
|
||||||
| `shape` | int | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
|
| `shape` | int | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
|
||||||
| `shape_` | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
|
| `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
|
||||||
| `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. |
|
| `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. |
|
||||||
| `prefix_` | unicode | Length-N substring from the start of the word. Defaults to `N=1`. |
|
| `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. |
|
||||||
| `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. |
|
| `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. |
|
||||||
| `suffix_` | unicode | Length-N substring from the end of the word. Defaults to `N=3`. |
|
| `suffix_` | str | Length-N substring from the end of the word. Defaults to `N=3`. |
|
||||||
| `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. |
|
| `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. |
|
||||||
| `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`. |
|
| `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`. |
|
||||||
| `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. |
|
| `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. |
|
||||||
|
@ -159,7 +159,7 @@ The L2 norm of the lexeme's vector representation.
|
||||||
| `is_oov` | bool | Is the lexeme out-of-vocabulary? |
|
| `is_oov` | bool | Is the lexeme out-of-vocabulary? |
|
||||||
| `is_stop` | bool | Is the lexeme part of a "stop list"? |
|
| `is_stop` | bool | Is the lexeme part of a "stop list"? |
|
||||||
| `lang` | int | Language of the parent vocabulary. |
|
| `lang` | int | Language of the parent vocabulary. |
|
||||||
| `lang_` | unicode | Language of the parent vocabulary. |
|
| `lang_` | str | Language of the parent vocabulary. |
|
||||||
| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). |
|
| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). |
|
||||||
| `cluster` | int | Brown cluster ID. |
|
| `cluster` | int | Brown cluster ID. |
|
||||||
| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. |
|
| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. |
|
||||||
|
|
|
@ -27,7 +27,7 @@ Create the vocabulary.
|
||||||
| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
|
| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
|
||||||
| `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
|
| `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
|
||||||
| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
|
| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
|
||||||
| `vectors_name` <Tag variant="new">2.2</Tag> | unicode | A name to identify the vectors table. |
|
| `vectors_name` <Tag variant="new">2.2</Tag> | str | A name to identify the vectors table. |
|
||||||
| **RETURNS** | `Vocab` | The newly constructed object. |
|
| **RETURNS** | `Vocab` | The newly constructed object. |
|
||||||
|
|
||||||
## Vocab.\_\_len\_\_ {#len tag="method"}
|
## Vocab.\_\_len\_\_ {#len tag="method"}
|
||||||
|
@ -92,8 +92,8 @@ given string, you need to look it up in
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------- | -------------------------------------------------- |
|
| ----------- | ---- | -------------------------------------------------- |
|
||||||
| `string` | unicode | The ID string. |
|
| `string` | str | The ID string. |
|
||||||
| **RETURNS** | bool | Whether the string has an entry in the vocabulary. |
|
| **RETURNS** | bool | Whether the string has an entry in the vocabulary. |
|
||||||
|
|
||||||
## Vocab.add_flag {#add_flag tag="method"}
|
## Vocab.add_flag {#add_flag tag="method"}
|
||||||
|
@ -117,7 +117,7 @@ using `token.check_flag(flag_id)`.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `flag_getter` | callable | A function `f(unicode) -> bool`, to get the flag value. |
|
| `flag_getter` | callable | A function `f(str) -> bool`, to get the flag value. |
|
||||||
| `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. |
|
| `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. |
|
||||||
| **RETURNS** | int | The integer ID by which the flag value can be checked. |
|
| **RETURNS** | int | The integer ID by which the flag value can be checked. |
|
||||||
|
|
||||||
|
@ -228,8 +228,8 @@ Save the current state to a directory.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
|
|
||||||
## Vocab.from_disk {#from_disk tag="method" new="2"}
|
## Vocab.from_disk {#from_disk tag="method" new="2"}
|
||||||
|
@ -244,8 +244,8 @@ Loads state from a directory. Modifies the object in place and returns it.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
| ----------- | ------------ | -------------------------------------------------------------------------- |
|
||||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||||
| **RETURNS** | `Vocab` | The modified `Vocab` object. |
|
| **RETURNS** | `Vocab` | The modified `Vocab` object. |
|
||||||
|
|
||||||
|
|
|
@ -158,17 +158,17 @@ The available token pattern keys correspond to a number of
|
||||||
rule-based matching are:
|
rule-based matching are:
|
||||||
|
|
||||||
| Attribute | Type | Description |
|
| Attribute | Type | Description |
|
||||||
| -------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------ |
|
| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
|
||||||
| `ORTH` | unicode | The exact verbatim text of a token. |
|
| `ORTH` | str | The exact verbatim text of a token. |
|
||||||
| `TEXT` <Tag variant="new">2.1</Tag> | unicode | The exact verbatim text of a token. |
|
| `TEXT` <Tag variant="new">2.1</Tag> | str | The exact verbatim text of a token. |
|
||||||
| `LOWER` | unicode | The lowercase form of the token text. |
|
| `LOWER` | str | The lowercase form of the token text. |
|
||||||
| `LENGTH` | int | The length of the token text. |
|
| `LENGTH` | int | The length of the token text. |
|
||||||
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|
||||||
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|
||||||
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|
||||||
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|
||||||
| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
|
| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
|
||||||
| `ENT_TYPE` | unicode | The token's entity label. |
|
| `ENT_TYPE` | str | The token's entity label. |
|
||||||
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
|
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
|
||||||
|
|
||||||
<Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
|
<Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
|
||||||
|
@ -1101,21 +1101,28 @@ powerful model packages with binary weights _and_ rules included!
|
||||||
|
|
||||||
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
|
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
|
||||||
|
|
||||||
When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**,
|
When using a large number of **phrase patterns** (roughly > 10000) it's useful
|
||||||
the EntityRuler calls the nlp object to construct a doc object. This happens in case you try
|
to understand how the `add_patterns` function of the EntityRuler works. For each
|
||||||
to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to
|
**phrase pattern**, the EntityRuler calls the nlp object to construct a doc
|
||||||
extract matches based on the pattern's POS signature.
|
object. This happens in case you try to add the EntityRuler at the end of an
|
||||||
|
existing pipeline with, for example, a POS tagger and want to extract matches
|
||||||
|
based on the pattern's POS signature.
|
||||||
|
|
||||||
In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler.
|
In this case you would pass a config value of `phrase_matcher_attr="POS"` for
|
||||||
|
the EntityRuler.
|
||||||
|
|
||||||
Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns.
|
Running the full language pipeline across every pattern in a large list scales
|
||||||
|
linearly and can therefore take a long time with large numbers of phrase patterns.
|
||||||
|
|
||||||
As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively.
|
As of spaCy 2.2.4 the `add_patterns` function has been refactored to use
|
||||||
|
`nlp.pipe` on all phrase patterns, resulting in about a 10x-20x speedup with
|
||||||
|
5,000-100,000 phrase patterns respectively.
|
||||||
|
|
||||||
Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time.
|
Even with this speedup (but especially if you're using an older version) the
|
||||||
|
`add_patterns` function can still take a long time.
|
||||||
|
|
||||||
An easy workaround to make this function run faster is disabling the other language pipes
|
An easy workaround to make this function run faster is disabling the other
|
||||||
while adding the phrase patterns.
|
language pipes while adding the phrase patterns.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
entityruler = EntityRuler(nlp)
|
entityruler = EntityRuler(nlp)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user