Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -173,13 +173,13 @@ class Language(object):
             flat_list.append(pipe)
         self.pipeline = flat_list

-    def __call__(self, text, **disabled):
+    def __call__(self, text, disable=[]):
         """Apply the pipeline to some text. The text can span multiple sentences,
         and can contain arbitrary whitespace. Alignment into the original string
         is preserved.

         text (unicode): The text to be processed.
-        **disabled: Elements of the pipeline that should not be run.
+        disable (list): Names of the pipeline components to disable.
         RETURNS (Doc): A container for accessing the annotations.

         EXAMPLE:
@@ -190,7 +190,7 @@ class Language(object):
         doc = self.make_doc(text)
         for proc in self.pipeline:
             name = getattr(proc, 'name', None)
-            if name in disabled and not disabled[name]:
+            if name in disable:
                 continue
             proc(doc)
         return doc
@@ -322,7 +322,7 @@ class Language(object):
         except StopIteration:
             pass

-    def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
+    def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
         """Process texts as a stream, and yield `Doc` objects in order. Supports
         GIL-free multi-threading.

@@ -330,7 +330,7 @@ class Language(object):
         n_threads (int): The number of worker threads to use. If -1, OpenMP will
             decide how many to use at run time. Default is 2.
         batch_size (int): The number of texts to buffer.
-        **disabled: Pipeline components to exclude.
+        disable (list): Names of the pipeline components to disable.
         YIELDS (Doc): Documents in the order of the original text.

         EXAMPLE:
@@ -342,7 +342,7 @@ class Language(object):
         docs = texts
         for proc in self.pipeline:
             name = getattr(proc, 'name', None)
-            if name in disabled and not disabled[name]:
+            if name in disable:
                 continue
             if hasattr(proc, 'pipe'):
                 docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
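At the call site, the reworked keyword reads as follows; a minimal sketch, assuming an installed English model and that the pipeline components expose the names 'tagger', 'parser' and 'ner':

```python
import spacy

nlp = spacy.load('en')  # assumes the English model is installed

# Run the whole pipeline...
doc = nlp(u'This is a sentence.')

# ...or skip components by name with the new `disable` list, instead of
# the old keyword-argument style (**disabled).
doc = nlp(u'This is a sentence.', disable=['parser', 'ner'])

# The same keyword applies to the streaming API.
texts = [u'One document.', u'...and another.']
for doc in nlp.pipe(texts, batch_size=50, disable=['ner']):
    print(doc[0].tag_)  # tagged, but no entities were set
```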
@@ -352,12 +352,14 @@ class Language(object):
         for doc in docs:
             yield doc

-    def to_disk(self, path, **exclude):
-        """Save the current state to a directory.
+    def to_disk(self, path, disable=[]):
+        """Save the current state to a directory. If a model is loaded, this
+        will include the model.

         path (unicode or Path): A path to a directory, which will be created if
             it doesn't exist. Paths may be either strings or `Path`-like objects.
-        **exclude: Named attributes to prevent from being saved.
+        disable (list): Names of pipeline components to disable and prevent
+            from being saved.

         EXAMPLE:
             >>> nlp.to_disk('/path/to/models')
@@ -369,7 +371,7 @@ class Language(object):
             raise IOError("Output path must be a directory")
         props = {}
         for name, value in self.__dict__.items():
-            if name in exclude:
+            if name in disable:
                 continue
             if hasattr(value, 'to_disk'):
                 value.to_disk(path / name)
@@ -378,13 +380,14 @@ class Language(object):
         with (path / 'props.pickle').open('wb') as file_:
             dill.dump(props, file_)

-    def from_disk(self, path, **exclude):
+    def from_disk(self, path, disable=[]):
         """Loads state from a directory. Modifies the object in place and
-        returns it.
+        returns it. If the saved `Language` object contains a model, the
+        model will be loaded.

         path (unicode or Path): A path to a directory. Paths may be either
             strings or `Path`-like objects.
-        **exclude: Named attributes to prevent from being loaded.
+        disable (list): Names of the pipeline components to disable.
         RETURNS (Language): The modified `Language` object.

         EXAMPLE:
@@ -393,35 +396,36 @@ class Language(object):
         """
         path = util.ensure_path(path)
         for name in path.iterdir():
-            if name not in exclude and hasattr(self, str(name)):
+            if name not in disable and hasattr(self, str(name)):
                 getattr(self, name).from_disk(path / name)
         with (path / 'props.pickle').open('rb') as file_:
             bytes_data = file_.read()
-        self.from_bytes(bytes_data, **exclude)
+        self.from_bytes(bytes_data, disable)
         return self

-    def to_bytes(self, **exclude):
+    def to_bytes(self, disable=[]):
         """Serialize the current state to a binary string.

-        **exclude: Named attributes to prevent from being serialized.
+        disable (list): Names of pipeline components to disable and prevent
+            from being serialized.
         RETURNS (bytes): The serialized form of the `Language` object.
         """
         props = dict(self.__dict__)
-        for key in exclude:
+        for key in disable:
             if key in props:
                 props.pop(key)
         return dill.dumps(props, -1)

-    def from_bytes(self, bytes_data, **exclude):
+    def from_bytes(self, bytes_data, disable=[]):
         """Load state from a binary string.

         bytes_data (bytes): The data to load from.
-        **exclude: Named attributes to prevent from being loaded.
+        disable (list): Names of the pipeline components to disable.
         RETURNS (Language): The `Language` object.
         """
         props = dill.loads(bytes_data)
         for key, value in props.items():
-            if key not in exclude:
+            if key not in disable:
                 setattr(self, key, value)
         return self
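Taken together, the four methods above aim at round trips like the following; a sketch assuming a loaded `nlp` object, and note that whether a bare `Language()` can restore every component depends on code outside this diff:

```python
from spacy.language import Language

# Disk round trip: `disable` keeps the named components out of the saved state.
nlp.to_disk('/path/to/model', disable=['ner'])
nlp2 = Language().from_disk('/path/to/model')

# Bytes round trip, e.g. for shipping a pipeline between processes.
data = nlp.to_bytes(disable=['ner'])
nlp3 = Language().from_bytes(data)
```

Two details worth flagging in the implementation: `to_bytes()` pickles `self.__dict__` with dill, so any unpicklable attribute will fail at serialization time, and the mutable `disable=[]` default is only safe as long as the argument is never mutated.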
@@ -174,6 +174,7 @@ def get_async(stream, numpy_array):
     array.set(numpy_array, stream=stream)
     return array


 def itershuffle(iterable, bufsize=1000):
     """Shuffle an iterator. This works by holding `bufsize` items back
     and yielding them sometime later. Obviously, this is not unbiased --
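The docstring is truncated above, but it describes a buffered, and therefore biased, shuffle. A minimal self-contained sketch of that technique, not necessarily the implementation in this commit:

```python
import random

def itershuffle(iterable, bufsize=1000):
    """Approximately shuffle an iterator by holding up to `bufsize` items
    back and releasing them in random order. Biased: an item can only move
    a bounded distance away from its original position."""
    buf = []
    for item in iterable:
        buf.append(item)
        if len(buf) >= bufsize:
            # Buffer full: emit one random element to make room.
            yield buf.pop(random.randrange(len(buf)))
    # Input exhausted: flush the remainder, shuffled.
    random.shuffle(buf)
    for item in buf:
        yield item
```

For example, `list(itershuffle(range(10), bufsize=4))` yields all ten numbers in a roughly shuffled order without ever buffering more than four.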
@@ -37,7 +37,8 @@ mixin svg(file, name, width, height)
     size - [integer] icon width and height (default: 20)

 mixin icon(name, size)
-    +svg("icons", name, size || 20).o-icon&attributes(attributes)
+    - var size = size || 20
+    +svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes)


 //- Pro/Con/Neutral icon

@@ -185,3 +186,14 @@ mixin landing-header()
 mixin landing-badge(url, graphic, alt, size)
     +a(url)(aria-label=alt title=alt).c-landing__badge
         +svg("graphics", graphic, size || 225)
+
+
+//- Under construction (temporary)
+    Marks sections that still need to be completed for the v2.0 release.
+
+mixin under-construction()
+    +infobox("🚧 Under construction")
+        | This section is still being written and will be updated for the v2.0
+        | release. Is there anything that you think should definitely be mentioned
+        | or explained here? Any examples you'd like to see? #[strong Let us know]
+        | on the #[+a(gh("spacy") + "/issues") v2.0 alpha thread] on GitHub!
@@ -178,7 +178,7 @@ mixin label()
 //- Tag

 mixin tag()
-    span.u-text-tag.u-text-tag--spaced(aria-hidden="true")
+    span.u-text-tag.u-text-tag--spaced(aria-hidden="true")&attributes(attributes)
         block


@@ -192,6 +192,17 @@ mixin tag-model(...capabs)
     +help(intro + ext + ".").u-color-theme


+//- "New" tag to label features new in a specific version
+    By using a separate mixin with a version ID, it becomes easy to quickly
+    enable/disable tags without having to modify the markup in the docs.
+    version - [string or integer] version number, without "v" prefix
+
+mixin tag-new(version)
+    - var version = (typeof version == 'number') ? version.toFixed(1) : version
+    +tag(data-tooltip="This feature is new and was introduced in spaCy v#{version}.")
+        | v#{version}
+
+
 //- List
     type - [string] "numbers", "letters", "roman" (bulleted list if none set)
     start - [integer] start number
@@ -6,9 +6,17 @@ include _sidebar

 main.o-main.o-main--sidebar.o-main--aside
     article.o-content
-        +h(1)=title
-        if tag
-            +tag=tag
+        +grid.o-no-block
+            +grid-col(source ? "two-thirds" : "full")
+                +h(1)=title
+                if tag
+                    +tag=tag
+
+            if source
+                +grid-col("third").u-text-right
+                    .o-inline-list
+                        +button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)]

         if ALPHA
             +infobox("⚠️ You are viewing the spaCy v2.0 alpha docs")
@@ -1,128 +1,128 @@
-<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="736">
+<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
     <style>
-        .text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
-        .text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
-        .text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
+        .svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
+        .svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
+        .svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
+        .svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
     </style>
     <ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
-    <text class="text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
+    <text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
     <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 432v242.8" stroke-dasharray="2 2"/>
     <path fill="#82b366" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 680.8l-4-8 4 2 4-2z"/>
     <rect fill="#fff" transform="translate(324 535.5)" width="37" height="18"/>
-    <text class="text-small" dy="1em" style="fill: #82b366" transform="translate(324 535.5)" width="37" height="18">MAKES</text>
+    <text class="svg__architecture__text-small" dy="1em" style="fill: #82b366" transform="translate(324 535.5)" width="37" height="18">MAKES</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M457 434l100.5 80"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M562.3 517.6l-8.8-1.8 4-2 1-4.3z"/>
     <rect fill="#f6f6f6" transform="translate(424.5 462.5)" width="158" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(424.5 462.5)" width="158" height="18">nlp.vocab.morphology</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(424.5 462.5)" width="158" height="18">nlp.vocab.morphology</text>
     <ellipse cx="404" cy="399" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="74.8" ry="49.8"/>
-    <text class="text-large" dy="1em" transform="translate(377.5 386.5)" width="51" height="22">Vocab</text>
+    <text class="svg__architecture__text-large" dy="1em" transform="translate(377.5 386.5)" width="51" height="22">Vocab</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 253v87.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 346.8l-4-8 4 2 4-2z"/>
     <rect fill="#f6f6f6" transform="translate(364.5 285.5)" width="79" height="18" />
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(364.5 285.5)" width="79" height="18">nlp.vocab</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(364.5 285.5)" width="79" height="18">nlp.vocab</text>
     <ellipse cx="743" cy="399" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
-    <text class="text-large" transform="translate(694.5 386.5)" dy="1em" width="95" height="22">StringStore</text>
+    <text class="svg__architecture__text-large" transform="translate(694.5 386.5)" dy="1em" width="95" height="22">StringStore</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 399h181.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M665.8 399l-8 4 2-4-2-4z"/>
     <rect fill="#f6f6f6" transform="translate(498.5 388.5)" width="137" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(498.5 388.5)" width="137" height="18">nlp.vocab.strings</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(498.5 388.5)" width="137" height="18">nlp.vocab.strings</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M108 244l235.6 115.4"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M349 362h-9l3.6-2.6V355z"/>
     <rect fill="#f6f6f6" transform="translate(141.5 284.5)" width="151" height="18" />
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(141.5 284.5)" width="151" height="18">nlp.tokenizer.vocab</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(141.5 284.5)" width="151" height="18">nlp.tokenizer.vocab</text>
     <path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M38.7 159.3H104l33 43.6-32.8 43.5H38.7L6 203z"/>
-    <text class="text-large" transform="translate(30.5 190.5)" dy="1em" width="80" height="22">Tokenizer</text>
+    <text class="svg__architecture__text-large" transform="translate(30.5 190.5)" dy="1em" width="80" height="22">Tokenizer</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 203v-1H145.2"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M139.2 202l8-4-2 4 2 4z"/>
     <rect fill="#f6f6f6" transform="translate(188.5 191.5)" width="115" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(188.5 191.5)" width="115" height="18">nlp.make_doc()</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(188.5 191.5)" width="115" height="18">nlp.make_doc()</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 203h83v-4h105.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M672.8 199l-8 4 2-4-2-4z"/>
     <rect fill="#f6f6f6" transform="translate(512.5 191.5)" width="101" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(512.5 191.5)" width="101" height="18">nlp.pipeline</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(512.5 191.5)" width="101" height="18">nlp.pipeline</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M709 242.8L464.4 359.4"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M459 362l5.5-7v4.4l3.5 2.8z"/>
     <rect fill="#f6f6f6" transform="translate(505.5 297.5)" width="166" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(505.5 297.5)" width="166" height="18">nlp.pipeline[i].vocab</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(505.5 297.5)" width="166" height="18">nlp.pipeline[i].vocab</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M275.3 34.6L288.6 1h54L329 34.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(301.5 9.5)" width="12" height="14">pt</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(301.5 9.5)" width="12" height="14">pt</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M60.8 34.6L74.3 1h54l-13.6 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(86.5 9.5)" width="14" height="14">en</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(86.5 9.5)" width="14" height="14">en</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M114.4 34.6L128 1h53.8l-13.5 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(140.5 9.5)" width="14" height="14">de</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(140.5 9.5)" width="14" height="14">de</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M168 34.6L181.5 1h54l-13.6 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(196.5 9.5)" width="8" height="14">fr</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(196.5 9.5)" width="8" height="14">fr</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M221.6 34.6L235 1h54l-13.5 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(248.5 9.5)" width="12" height="14">es</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(248.5 9.5)" width="12" height="14">es</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M47 68.3l13.6-33.6h53.8L101 68.3z"/>
-    <text class="text-small" dy="0.85em" transform="translate(75.5 43.5)" width="8" height="14">it</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(75.5 43.5)" width="8" height="14">it</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M100.7 68.3l13.5-33.6H168l-13.4 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(127.5 43.5)" width="12" height="14">nl</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(127.5 43.5)" width="12" height="14">nl</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M154.3 68.3l13.5-33.6h53.8l-13.4 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(180.5 43.5)" width="12" height="14">sv</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(180.5 43.5)" width="12" height="14">sv</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M208 68.3l13.4-33.6h53.8L262 68.3z"/>
-    <text class="text-small" dy="0.85em" transform="translate(236.5 43.5)" width="8" height="14">fi</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(236.5 43.5)" width="8" height="14">fi</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M261.5 68.3L275 34.7h54l-13.6 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(286.5 43.5)" width="16" height="14">nb</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(286.5 43.5)" width="16" height="14">nb</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M33.4 102L47 68.2h53.7L87.3 102z"/>
-    <text class="text-small" dy="0.85em" transform="translate(58.5 77.5)" width="16" height="14">hu</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(58.5 77.5)" width="16" height="14">hu</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M87 102l13.5-33.7h53.8L141 102z"/>
-    <text class="text-small" dy="0.85em" transform="translate(112.5 77.5)" width="14" height="14">he</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(112.5 77.5)" width="14" height="14">he</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M140.6 102L154 68.2h54L194.4 102z"/>
-    <text class="text-small" dy="0.85em" transform="translate(165.5 77.5)" width="16" height="14">bn</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(165.5 77.5)" width="16" height="14">bn</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M194.2 102l13.5-33.7h53.8L248 102z"/>
-    <text transform="translate(221.5 77.5)" class="text-small" dy="0.85em" width="10" height="14">ja</text>
+    <text transform="translate(221.5 77.5)" class="svg__architecture__text-small" dy="0.85em" width="10" height="14">ja</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M247.8 102l13.5-33.7H315L301.8 102z"/>
-    <text class="text-small" dy="0.85em" transform="translate(273.5 77.5)" width="14" height="14">zh</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(273.5 77.5)" width="14" height="14">zh</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 51h75v93.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 150.8l-4-8 4 2 4-2z"/>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M213 480l109.3-76.3"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 400.3L323 408l-.8-4.3-4-2z"/>
     <rect fill="#f6f6f6" transform="translate(226.5 431.5)" width="79" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(226.5 431.5)" width="79" height="18">doc.vocab</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(226.5 431.5)" width="79" height="18">doc.vocab</text>
     <path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M39.6 555.5l.4 121.3" stroke-dasharray="2 2"/>
     <path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M40 682.8l-4-8 4 2 4-2z"/>
     <rect fill="#fff" transform="translate(23.5 604.5)" width="37" height="18"/>
-    <text class="text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(23.5 604.5)" width="37" height="18">MAKES</text>
+    <text class="svg__architecture__text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(23.5 604.5)" width="37" height="18">MAKES</text>
     <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M1 479.5h283v74.8H1z"/>
-    <text class="text-large" dy="1em" transform="translate(125.5 504.5)" width="32" height="22">Doc</text>
+    <text class="svg__architecture__text-large" dy="1em" transform="translate(125.5 504.5)" width="32" height="22">Doc</text>
     <path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 246v117h1v108.8" stroke-dasharray="2 2"/>
     <path fill="#c00" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M72 477.8l-4-8 4 2 4-2z"/>
     <rect fill="#fff" transform="translate(54.5 355.5)" width="37" height="18"/>
-    <text class="text-small" style="fill: #cc0000" dy="1em" dx="-0.5em" transform="translate(54.5 355.5)" width="37" height="18">MAKES</text>
+    <text class="svg__architecture__text-small" style="fill: #cc0000" dy="1em" dx="-0.5em" transform="translate(54.5 355.5)" width="37" height="18">MAKES</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104 685l.4-121.2"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104.5 557.8l4 8-4-2-4 2z"/>
     <rect fill="#f6f6f6" transform="translate(62.5 632.5)" width="79" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(62.5 632.5)" width="79" height="18">token.doc</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(62.5 632.5)" width="79" height="18">token.doc</text>
     <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M7.2 685h129.6v50H7.2z"/>
-    <text class="text-medium" dy="1em" transform="translate(49.5 700.5)" width="43" height="18">Token</text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(49.5 700.5)" width="43" height="18">Token</text>
     <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M148 685h129.7v50H148z"/>
-    <text class="text-medium" dy="1em" transform="translate(193.5 700.5)" width="37" height="18">Span</text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(193.5 700.5)" width="37" height="18">Span</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 686V456.6"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 450.6l4 8-4-2-4 2z"/>
     <rect fill="#f6f6f6" transform="translate(356.5 584.5)" width="101" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(356.5 584.5)" width="101" height="18">lexeme.vocab</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(356.5 584.5)" width="101" height="18">lexeme.vocab</text>
     <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M296.7 685h155.8v50H296.7z"/>
-    <text class="text-medium" dy="1em" transform="translate(346.5 700.5)" width="55" height="18">Lexeme</text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(346.5 700.5)" width="55" height="18">Lexeme</text>
     <path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M180.5 559.3l.5 117.5" stroke-dasharray="2 2"/>
     <path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M181 682.8l-4-8 4 2 4-2z"/>
     <rect fill="#fff" transform="translate(164.5 606.5)" width="37" height="18" />
-    <text class="text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(164.5 606.5)" width="37" height="18">MAKES</text>
+    <text class="svg__architecture__text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(164.5 606.5)" width="37" height="18">MAKES</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 685V564"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 557.8l4 8-4-2-4 2z"/>
     <rect fill="#f6f6f6" transform="translate(211.5 633.5)" width="72" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(211.5 633.5)" width="72" height="18">span.doc</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(211.5 633.5)" width="72" height="18">span.doc</text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 112H872l32.8 43.5L872 199h-65.4L774 155.6z"/>
-    <text class="text-medium" dy="1em" transform="translate(794.5 135.5)" width="88" height="38">Dependency <tspan dy="1.25em" dx="-4.1em">Parser</tspan></text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(794.5 135.5)" width="88" height="38">Dependency <tspan dy="1.25em" dx="-4.1em">Parser</tspan></text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 199H872l32.8 43.8-32.8 43.6h-65.4L774 242.8z"/>
-    <text class="text-medium" dy="1em" dx="1.1em" transform="translate(799.5 222.5)" width="78" height="38">Entity <tspan dy="1.25em" dx="-3.75em">Recognizer</tspan></text>
+    <text class="svg__architecture__text-medium" dy="1em" dx="1.1em" transform="translate(799.5 222.5)" width="78" height="38">Entity <tspan dy="1.25em" dx="-3.75em">Recognizer</tspan></text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708 155.5h65.6l32.7 43.6-32.7 43.8H708L675.5 199z"/>
-    <text class="text-medium" dy="1em" transform="translate(715.5 189.5)" width="48" height="18">Tagger</text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(715.5 189.5)" width="48" height="18">Tagger</text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.8 24.5h65.5L905 68 872.3 112h-65.5L774 68z"/>
-    <text class="text-medium" dy="1em" transform="translate(809.5 58.5)" width="58" height="18">Matcher</text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(809.5 58.5)" width="58" height="18">Matcher</text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708.6 68H774l32.8 43.5L774 155h-65.4L676 111.6z"/>
-    <text class="text-medium" dy="1em" transform="translate(698.5 101.5)" width="84" height="18">Lemmatizer</text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(698.5 101.5)" width="84" height="18">Lemmatizer</text>
     <ellipse cx="617" cy="555" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
-    <text class="text-large" dy="1em" transform="translate(565.5 542.5)" width="101" height="22">Morphology</text>
+    <text class="svg__architecture__text-large" dy="1em" transform="translate(565.5 542.5)" width="101" height="22">Morphology</text>
 </svg>
@@ -1,13 +1,13 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
     <style>
-        .text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
-        .text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
+        .svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
+        .svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
+        .svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
     </style>
     <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
     <path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M642.7 361.3H708l33 43.6-33 43.5H643L610 405z"/>
-    <text class="text-large" transform="translate(634.5 410)" width="80" height="22">Tokenizer</text>
+    <text class="svg__langdata__text-large" transform="translate(634.5 410)" width="80" height="22">Tokenizer</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H621v-56.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M621 240.2l4 8-4-2-4 2z"/>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M855 253v-20.8"/>
@@ -17,7 +17,7 @@
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H504v-56.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M504 240.2l4 8-4-2-4 2z"/>
     <ellipse cx="855" cy="303" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
-    <text class="text-large" transform="translate(815 308)" width="119" height="46">Base data</text>
+    <text class="svg__langdata__text-large" transform="translate(815 308)" width="119" height="46">Base data</text>
     <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100l.4 39.8"/>
     <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389.5 145.8l-4-8 4 2 4-2z"/>
     <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17h232v22.8"/>
@@ -33,50 +33,50 @@
     <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17H46v22.8"/>
     <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M46 145.8l-4-8 4 2 4-2z"/>
     <ellipse cx="389" cy="50" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
-    <text class="text-large" transform="translate(350 42)" width="81" height="46">Language <tspan dy="1.45em" dx="-3.25em">data</tspan></text>
+    <text class="svg__langdata__text-large" transform="translate(350 42)" width="81" height="46">Language <tspan dy="1.45em" dx="-3.25em">data</tspan></text>
     <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M435 193h15.8"/>
     <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M456.8 193l-8 4 2-4-2-4z"/>
     <ellipse cx="390" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(375 187.5)" width="39" height="30">stop <tspan dx="-2.5em" dy="1.25em">words</tspan></text>
+    <text class="svg__langdata__text-small" transform="translate(375 187.5)" width="39" height="30">stop <tspan dx="-2.5em" dy="1.25em">words</tspan></text>
     <path fill="none" stroke="#9673a6" stroke-width="3" stroke-miterlimit="10" d="M472 225l-1.5 133.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M470.4 364.8l-4-8 4 2 4-2z"/>
     <ellipse cx="504" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(481 187.5)" width="85" height="30">lexical <tspan dx="-3.75em" dy="1.25em">attributes</tspan></text>
+    <text class="svg__langdata__text-small" transform="translate(481 187.5)" width="85" height="30">lexical <tspan dx="-3.75em" dy="1.25em">attributes</tspan></text>
     <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M653 225l5.6 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M659 358.8l-4.5-8 4 2 4-2.2z"/>
     <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M576 193h-18.8"/>
     <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M551.2 193l8-4-2 4 2 4z"/>
     <ellipse cx="621" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(588 187.5)" width="85" height="30">tokenizer <tspan dx="-4.6em" dy="1.25em">exceptions</tspan></text>
+    <text class="svg__langdata__text-small" transform="translate(588 187.5)" width="85" height="30">tokenizer <tspan dx="-4.6em" dy="1.25em">exceptions</tspan></text>
     <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M690 193h-15.8"/>
     <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M668.2 193l8-4-2 4 2 4z"/>
     <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M703 225l-10.3 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M692.2 358.8l-3.4-8.3 4 2.3 4-1.7z"/>
     <ellipse cx="735" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(705 182)" width="53" height="46">prefixes, <tspan dy="1.25em" dx="-3.9em">suffixes,</tspan> <tspan dy="1.25em" dx="-3.6em">infixes</tspan>
+    <text class="svg__langdata__text-small" transform="translate(705 182)" width="53" height="46">prefixes, <tspan dy="1.25em" dx="-3.9em">suffixes,</tspan> <tspan dy="1.25em" dx="-3.6em">infixes</tspan>
     </text>
     <path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M280 238v114.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M280 358.8l-4-8 4 2 4-2z"/>
     <ellipse cx="280" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(254 187.5)" width="71" height="30">lemma <tspan dy="1.25em" dx="-2.7em">data</tspan></text>
+    <text class="svg__langdata__text-small" transform="translate(254 187.5)" width="71" height="30">lemma <tspan dy="1.25em" dx="-2.7em">data</tspan></text>
     <path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M346 404h53.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M405.8 404l-8 4 2-4-2-4z"/>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M247.7 361.3H313l33 43.6-33 43.5h-65.3L215 405z"/>
-    <text class="text-large" transform="translate(232 410)" width="100" height="22">Lemmatizer</text>
+    <text class="svg__langdata__text-large" transform="translate(232 410)" width="100" height="22">Lemmatizer</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M823 193h-34.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M782.2 193l8-4-2 4 2 4z"/>
     <ellipse cx="855" cy="193" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="31.5" ry="31.5"/>
-    <text class="text-tiny" transform="translate(838 189)" width="50" height="30">char <tspan dy="1.1em" dx="-2.75em">classes</tspan></text>
+    <text class="svg__langdata__text-tiny" transform="translate(838 189)" width="50" height="30">char <tspan dy="1.1em" dx="-2.75em">classes</tspan></text>
     <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M408 367h124v74H408z"/>
-    <text class="text-large" transform="translate(443.5 410)" width="51" height="22">Token</text>
+    <text class="svg__langdata__text-large" transform="translate(443.5 410)" width="51" height="22">Token</text>
     <path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M131 225l-21 122.2" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M109 353l-2.5-8.5 3.6 2.7 4.4-1.3z"/>
     <ellipse cx="163" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(139 187.5)" width="45" height="30">morph <tspan dy="1.25em" dx="-2.8em">rules</tspan></text>
+    <text class="svg__langdata__text-small" transform="translate(139 187.5)" width="45" height="30">morph <tspan dy="1.25em" dx="-2.8em">rules</tspan></text>
     <path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M78 225l15.4 122" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M94.2 353l-5-7.5 4.2 1.5 3.7-2.5z"/>
     <ellipse cx="46" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(33 187.5)" width="27" height="30">tag <tspan dy="1.25em" dx="-1.8em">map</tspan></text>
+    <text class="svg__langdata__text-small" transform="translate(33 187.5)" width="27" height="30">tag <tspan dy="1.25em" dx="-1.8em">map</tspan></text>
     <ellipse cx="101" cy="405" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.5" ry="49.5"/>
-    <text class="text-large" transform="translate(49.5 410)" width="100" height="22">Morphology</text>
+    <text class="svg__langdata__text-large" transform="translate(49.5 410)" width="100" height="22">Morphology</text>
 </svg>
@@ -1,30 +1,30 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
     <style>
-        .text { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
-        .text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
+        .svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro" }
+        .svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
+        .svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
     </style>
     <rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
     <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
-    <text class="text" dy="0.75em" width="28" height="19" transform="translate(846.5 75.5)">Doc</text>
+    <text class="svg__pipeline__text" dy="0.75em" width="28" height="19" transform="translate(846.5 75.5)">Doc</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M121.2 84.7h29.4"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M156.6 84.7l-8 4 2-4-2-4z"/>
     <path fill="#f5f5f5" stroke="#999" stroke-width="2" d="M1 55h120v60H1z"/>
-    <text class="text" dy="0.85em" width="34" height="22" transform="translate(43.5 73.5)">Text</text>
+    <text class="svg__pipeline__text" dy="0.85em" width="34" height="22" transform="translate(43.5 73.5)">Text</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M760 84.7h33"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M799 84.7l-8 4 2-4-2-4z"/>
     <rect width="75" height="39" x="422" y="1" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="5.8" ry="5.8"/>
-    <text class="text-code" dy="0.8em" dx="0.1em" width="29" height="17" transform="translate(444.5 11.5)">nlp</text>
+    <text class="svg__pipeline__text-code" dy="0.8em" dx="0.1em" width="29" height="17" transform="translate(444.5 11.5)">nlp</text>
     <path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M176 58h103.3L296 88l-16.8 30H176l16.8-30z"/>
-    <text class="text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text>
+    <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M314 58h103.3L434 88l-16.8 30H314l16.8-30z"/>
-    <text class="text-small" dy="0.75em" dx="-0.25em" width="62" height="14" transform="translate(342.5 80.5)">vectorizer</text>
+    <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="62" height="14" transform="translate(342.5 80.5)">vectorizer</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M296.5 88.2h24.7"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 88.2l-8 4 2-4-2-4z"/>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M416 58h103.3L536 88l-16.8 30H416l16.8-30z"/>
-    <text class="text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(455.5 80.5)">tagger</text>
+    <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(455.5 80.5)">tagger</text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M519 58h103.3L639 88l-16.8 30H519l16.8-30z"/>
-    <text class="text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(558.5 80.5)">parser</text>
+    <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(558.5 80.5)">parser</text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M622 58h103.3L742 88l-16.8 30H622l16.8-30z"/>
-    <text class="text-small" dy="0.75em" dx="-0.25em" width="20" height="14" transform="translate(671.5 80.5)">ner</text>
+    <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="20" height="14" transform="translate(671.5 80.5)">ner</text>
 </svg>
website/assets/img/docs/vocab_stringstore.svg (new file, 77 lines)
@@ -0,0 +1,77 @@
+<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
+    <style>
+        .svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro" }
+        .svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro"; text-transform: uppercase }
+        .svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro" }
+        .svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
+    </style>
+    <rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
+    <text class="svg__vocab__text" dy="1em" transform="translate(477.5 174.5)" width="31" height="17">3572</text>
+    <rect width="52" height="20" x="468" y="152" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(471.5 155.5)">Lexeme</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 164h100v40H76z"/>
+    <text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(113.5 174.5)">508</text>
+    <rect width="52" height="20" x="100" y="152" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(103.5 155.5)">Lexeme</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 164h100v40H263z"/>
+    <text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(300.5 174.5)">949</text>
+    <rect width="52" height="20" x="287" y="152" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(290.5 155.5)">Lexeme</text>
+    <rect width="570" height="88" x="1" y="246" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="13.2" ry="13.2"/>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 275h100v40H444z"/>
+    <text class="svg__vocab__text" dy="1em" width="55" height="17" transform="translate(465.5 285.5)">"coffee"</text>
+    <rect width="52" height="20" x="468" y="263" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="28" height="12" transform="translate(479.5 266.5)">3672</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 275h100v40H76z"/>
+    <text class="svg__vocab__text" dy="1em" width="17" height="17" transform="translate(116.5 285.5)">"I"</text>
+    <rect width="52" height="20" x="100" y="263" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(114.5 266.5)">508</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 275h100v40H263z"/>
+    <text class="svg__vocab__text" dy="1em" width="41" height="17" transform="translate(291.5 285.5)">"love"</text>
+    <rect width="52" height="20" x="287" y="263" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(301.5 266.5)">949</text>
+    <rect width="570" height="110" x="1" y="1" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" rx="16.5" ry="16.5"/>
+    <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M263 60h-78.8"/>
+    <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M178.2 60l8-4-2 4 2 4z"/>
+    <rect fill="#E1D5E7" width="50" height="12" transform="translate(202.5 53.5)"/>
+    <text class="svg__vocab__text-code" dx="0.5em" dy="1em" width="50" height="12" transform="translate(202.5 53.5)">nsubj</text>
+    <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M363 60h72.8"/>
+    <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M441.8 60l-8 4 2-4-2-4z"/>
+    <rect fill="#E1D5E7" width="43" height="12" transform="translate(375.5 54.5)"/>
+    <text class="svg__vocab__text-code" dx="0.5em" dy="1em" width="43" height="12" transform="translate(375.5 54.5)">dobj</text>
+    <rect width="50" height="88" x="1" y="246" fill="#666" stroke="#666" stroke-width="2" rx="7.5" ry="7.5"/>
+    <text class="svg__vocab__text-large" dx="-0.25em" dy="1em" width="53" height="36" transform="rotate(-90 162 155)">String</text>
+    <text class="svg__vocab__text-large" dy="2em" width="53" height="36" transform="rotate(-90 162 155)">Store</text>
+    <rect width="50" height="88" x="1" y="135" fill="#82b366" stroke="#82b366" stroke-width="2" rx="7.5" ry="7.5"/>
+    <text class="svg__vocab__text-large" dx="-0.25em" dy="0.9em" width="47" height="17" transform="rotate(-90 109.5 93)">Vocab</text>
+    <rect width="50" height="110" x="1" y="1" fill="#9673a6" stroke="#9673a6" stroke-width="2" rx="7.5" ry="7.5"/>
+    <text class="svg__vocab__text-large" dx="-0.25em" dy="0.9em" width="31" height="17" transform="rotate(-90 44 27.5)">Doc</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 27h100v66H263z"/>
+    <text class="svg__vocab__text" dy="1em" width="31" height="33" transform="translate(296.5 42.5)">love</text>
+    <text class="svg__vocab__text-code" dy="2.8em" width="31" height="33" transform="translate(296.5 42.5)">VERB</text>
+    <rect width="50" height="20" x="288" y="16" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" transform="translate(294.5 19.5)">Token</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 27h100v66H76z"/>
+    <text class="svg__vocab__text" dx="0.8em" dy="1em" width="29" height="33" transform="translate(110.5 42.5)">I</text>
+    <text class="svg__vocab__text-code" dy="2.8em" width="29" height="33" transform="translate(110.5 42.5)">PRON</text>
+    <rect width="50" height="20" x="105" y="17" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="36" height="12" transform="translate(111.5 20.5)">Token</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 27h100v66H444z"/>
+    <text class="svg__vocab__text" dy="1em" width="45" height="33" transform="translate(470.5 42.5)">coffee</text>
+    <text class="svg__vocab__text-code" dx="0.6em" dy="2.8em" width="45" height="33" transform="translate(470.5 42.5)">NOUN</text>
+    <rect width="50" height="20" x="469" y="16" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="36" height="12" transform="translate(475.5 19.5)">Token</text>
+    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 141.8v-38.6"/>
+    <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 149.8l-2.7-8h5.4zM126 95.2l2.7 8h-5.4z"/>
+    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 214.2v38.6"/>
+    <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 206.2l2.7 8h-5.4zM126 260.8l-2.7-8h5.4z"/>
+    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 103.2v38.6"/>
+    <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 95.2l2.7 8h-5.4zM313 149.8l-2.7-8h5.4z"/>
+    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 214.2v38.6"/>
+    <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 206.2l2.7 8h-5.4zM313 260.8l-2.7-8h5.4z"/>
+    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 214.2v38.6"/>
+    <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 206.2l2.7 8h-5.4zM494 260.8l-2.7-8h5.4z"/>
+    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 103.2v38.6"/>
+    <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 95.2l2.7 8h-5.4zM494 149.8l-2.7-8h5.4z"/>
+</svg>
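The new graphic encodes the relationship the docs describe: tokens and lexemes carry integer IDs, while the shared StringStore maps IDs back to text. Roughly, in code (the concrete IDs in the graphic are illustrative; real values differ per model):

```python
import spacy

nlp = spacy.load('en')
doc = nlp(u'I love coffee')

coffee_id = nlp.vocab.strings[u'coffee']          # text -> integer ID
assert nlp.vocab.strings[coffee_id] == u'coffee'  # integer ID -> text
assert doc[2].orth == coffee_id                   # the token stores the ID...
assert doc[2].orth_ == u'coffee'                  # ...and exposes the text view
```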
@@ -24,7 +24,8 @@
         "Vocab": "vocab",
         "StringStore": "stringstore",
         "GoldParse": "goldparse",
-        "GoldCorpus": "goldcorpus"
+        "GoldCorpus": "goldcorpus",
+        "Binder": "binder"
     },

     "Other": {
         "Annotation Specs": "annotation"
@@ -47,62 +48,74 @@

     "spacy": {
         "title": "spaCy top-level functions",
         "source": "spacy/__init__.py",
         "next": "displacy"
     },

     "displacy": {
         "title": "displaCy",
         "tag": "module",
         "source": "spacy/displacy",
         "next": "util"
     },

     "util": {
         "title": "Utility Functions",
         "source": "spacy/util.py",
         "next": "cli"
     },

     "cli": {
-        "title": "Command Line Interface"
+        "title": "Command Line Interface",
+        "source": "spacy/cli"
     },

     "language": {
         "title": "Language",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/language.py"
     },

     "doc": {
         "title": "Doc",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/tokens/doc.pyx"
     },

     "token": {
         "title": "Token",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/tokens/token.pyx"
     },

     "span": {
         "title": "Span",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/tokens/span.pyx"
     },

     "lexeme": {
         "title": "Lexeme",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/lexeme.pyx"
     },

     "vocab": {
         "title": "Vocab",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/vocab.pyx"
     },

     "stringstore": {
         "title": "StringStore",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/strings.pyx"
     },

     "matcher": {
         "title": "Matcher",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/matcher.pyx"
     },

     "dependenyparser": {

@@ -122,7 +135,8 @@

     "tokenizer": {
         "title": "Tokenizer",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/tokenizer.pyx"
     },

     "tagger": {

@@ -132,11 +146,18 @@

     "goldparse": {
         "title": "GoldParse",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/gold.pyx"
     },

     "goldcorpus": {
         "title": "GoldCorpus",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/gold.pyx"
     },
+
+    "binder": {
+        "title": "Binder",
+        "tag": "class"
+    },
website/docs/api/binder.jade (new file, 5 lines)
@@ -0,0 +1,5 @@
+//- 💫 DOCS > API > BINDER
+
+include ../../_includes/_mixins
+
++under-construction
@@ -166,7 +166,7 @@ p
     | #[+a("/docs/api/annotation#json-input") JSON format].

 +code(false, "bash").
-    python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner]
+    python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]

 +table(["Argument", "Type", "Description"])
     +row
@@ -192,18 +192,13 @@ p
     +row
         +cell #[code --n-iter], #[code -n]
         +cell option
-        +cell Number of iterations (default: #[code 15]).
+        +cell Number of iterations (default: #[code 20]).

     +row
-        +cell #[code --n_sents], #[code -ns]
+        +cell #[code --n-sents], #[code -ns]
         +cell option
         +cell Number of sentences (default: #[code 0]).

-    +row
-        +cell #[code --parser-L1], #[code -L]
-        +cell option
-        +cell L1 regularization penalty for parser (default: #[code 0.0]).
-
     +row
         +cell #[code --use-gpu], #[code -G]
         +cell flag
@@ -220,7 +215,7 @@ p
         +cell Don't train parser.

     +row
-        +cell #[code --no-ner], #[code -N]
+        +cell #[code --no-entities], #[code -N]
         +cell flag
         +cell Don't train NER.
@ -229,6 +224,106 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+h(3, "train-hyperparams") Environment variables for hyperparameters
|
||||
|
||||
p
|
||||
| spaCy lets you set hyperparameters for training via environment variables.
|
||||
| This is useful, because it keeps the command simple and allows you to
|
||||
| #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") create an alias]
|
||||
| for your custom #[code train] command while still being able to easily
|
||||
| tweak the hyperparameters. For example:
|
||||
|
||||
+code(false, "bash").
|
||||
parser_hidden_depth=2 parser_maxout_pieces=1 train-parser
|
||||
|
||||
+under-construction
|
||||
|
||||
+table(["Name", "Description", "Default"])
|
||||
+row
|
||||
+cell #[code dropout_from]
|
||||
+cell
|
||||
+cell #[code 0.2]
|
||||
|
||||
+row
|
||||
+cell #[code dropout_to]
|
||||
+cell
|
||||
+cell #[code 0.2]
|
||||
|
||||
+row
|
||||
+cell #[code dropout_decay]
|
||||
+cell
|
||||
+cell #[code 0.0]
|
||||
|
||||
+row
|
||||
+cell #[code batch_from]
|
||||
+cell
|
||||
+cell #[code 1]
|
||||
|
||||
+row
|
||||
+cell #[code batch_to]
|
||||
+cell
|
||||
+cell #[code 64]
|
||||
|
||||
+row
|
||||
+cell #[code batch_compound]
|
||||
+cell
|
||||
+cell #[code 1.001]
|
||||
|
||||
+row
|
||||
+cell #[code token_vector_width]
|
||||
+cell
|
||||
+cell #[code 128]
|
||||
|
||||
+row
|
||||
+cell #[code embed_size]
|
||||
+cell
|
||||
+cell #[code 7500]
|
||||
|
||||
+row
|
||||
+cell #[code parser_maxout_pieces]
|
||||
+cell
|
||||
+cell #[code 2]
|
||||
|
||||
+row
|
||||
+cell #[code parser_hidden_depth]
|
||||
+cell
|
||||
+cell #[code 1]
|
||||
|
||||
+row
|
||||
+cell #[code hidden_width]
|
||||
+cell
|
||||
+cell #[code 128]
|
||||
|
||||
+row
|
||||
+cell #[code learn_rate]
|
||||
+cell
|
||||
+cell #[code 0.001]
|
||||
|
||||
+row
|
||||
+cell #[code optimizer_B1]
|
||||
+cell
|
||||
+cell #[code 0.9]
|
||||
|
||||
+row
|
||||
+cell #[code optimizer_B2]
|
||||
+cell
|
||||
+cell #[code 0.999]
|
||||
|
||||
+row
|
||||
+cell #[code optimizer_eps]
|
||||
+cell
|
||||
+cell #[code 1e-08]
|
||||
|
||||
+row
|
||||
+cell #[code L2_penalty]
|
||||
+cell
|
||||
+cell #[code 1e-06]
|
||||
|
||||
+row
|
||||
+cell #[code grad_norm_clip]
|
||||
+cell
|
||||
+cell #[code 1.0]
|
||||
|
||||
+h(2, "package") Package
|
||||
|
||||
p
|
||||
|
|
|
@ -10,6 +10,7 @@ p
|
|||
|
||||
+h(2, "serve") displacy.serve
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Serve a dependency parse tree or named entity visualization to view it
|
||||
|
@ -71,6 +72,7 @@ p
|
|||
|
||||
+h(2, "render") displacy.render
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Render a dependency parse tree or named entity visualization.
|
||||
|
||||
|
|
|
@ -255,6 +255,7 @@ p
|
|||
|
||||
+h(2, "to_disk") Doc.to_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Save the current state to a directory.
|
||||
|
||||
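p
    | A minimal usage sketch (the path is illustrative):

+aside-code("Example").
    doc.to_disk('/path/to/doc')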
|
@ -271,12 +272,14 @@ p Save the current state to a directory.
|
|||
|
||||
+h(2, "from_disk") Doc.from_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Doc
|
||||
doc = Doc().from_disk('/path/to/doc')
|
||||
from spacy.vocab import Vocab
|
||||
doc = Doc(Vocab()).from_disk('/path/to/doc')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
|
|
@ -8,6 +8,7 @@ p
|
|||
|
||||
+h(2, "init") GoldCorpus.__init__
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Create a #[code GoldCorpus].
|
||||
|
||||
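p
    | A minimal sketch of creating a corpus from training and development
    | data in spaCy's JSON format. The paths are illustrative, and the
    | two-argument form is an assumption based on the CLI's
    | #[code train_data]/#[code dev_data] pair:

+aside-code("Example").
    from spacy.gold import GoldCorpus
    corpus = GoldCorpus('/path/to/train.json', '/path/to/dev.json')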
|
|
|
@ -73,15 +73,26 @@ p
|
|||
+cell The text to be processed.
|
||||
|
||||
+row
|
||||
+cell #[code **disabled]
|
||||
+cell -
|
||||
+cell Elements of the pipeline that should not be run.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell A container for accessing the annotations.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
.o-block
|
||||
| Pipeline components to prevent from being loaded can now be added as
|
||||
| a list to #[code disable], instead of specifying one keyword argument
|
||||
| per component.
|
||||
|
||||
+code-new doc = nlp(u"I don't want parsed", disable=['parser'])
|
||||
+code-old doc = nlp(u"I don't want parsed", parse=False)
|
||||
|
||||
+h(2, "pipe") Language.pipe
|
||||
+tag method
|
||||
|
||||
|
@ -112,6 +123,13 @@ p
|
|||
+cell int
|
||||
+cell The number of texts to buffer.
|
||||
|
||||
+row
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
||||
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
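p
    | A minimal sketch of streaming texts with one component disabled.
    | The texts and settings are illustrative:

+aside-code("Example").
    texts = [u'One document.', u'Another document.']
    for doc in nlp.pipe(texts, batch_size=50, n_threads=4, disable=['parser']):
        assert not doc.is_parsed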
|
@ -227,8 +245,11 @@ p
|
|||
|
||||
+h(2, "to_disk") Language.to_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Save the current state to a directory.
|
||||
p
|
||||
| Save the current state to a directory. If a model is loaded, this will
|
||||
| #[strong include the model].
|
||||
|
||||
+aside-code("Example").
|
||||
nlp.to_disk('/path/to/models')
|
||||
|
@ -242,14 +263,21 @@ p Save the current state to a directory.
|
|||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being saved.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
|
||||
| and prevent from being saved.
|
||||
|
||||
+h(2, "from_disk") Language.from_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
p
|
||||
| Loads state from a directory. Modifies the object in place and returns
|
||||
| it. If the saved #[code Language] object contains a model, the
|
||||
| #[strong model will be loaded].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.language import Language
|
||||
|
@ -264,15 +292,28 @@ p Loads state from a directory. Modifies the object in place and returns it.
|
|||
| #[code Path]-like objects.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell The modified #[code Language] object.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
.o-block
|
||||
| As of spaCy v2.0, the #[code save_to_directory] method has been
|
||||
| renamed to #[code to_disk], to improve consistency across classes.
|
||||
| Pipeline components to prevent from being loaded can now be added as
|
||||
| a list to #[code disable], instead of specifying one keyword argument
|
||||
| per component.
|
||||
|
||||
+code-new nlp = English().from_disk('/model', disable=['tagger', 'ner'])
|
||||
+code-old nlp = spacy.load('en', tagger=False, entity=False)
|
||||
|
||||
+h(2, "to_bytes") Language.to_bytes
|
||||
+tag method
|
||||
|
||||
|
@ -283,9 +324,12 @@ p Serialize the current state to a binary string.
|
|||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
|
||||
| and prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
|
@ -310,15 +354,26 @@ p Load state from a binary string.
|
|||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell The #[code Language] object.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
.o-block
|
||||
| Pipeline components to prevent from being loaded can now be added as
|
||||
| a list to #[code disable], instead of specifying one keyword argument
|
||||
| per component.
|
||||
|
||||
+code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner'])
|
||||
+code-old nlp = English().from_bytes(bytes, tagger=False, entity=False)
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
@ -327,6 +382,11 @@ p Load state from a binary string.
|
|||
+cell #[code Vocab]
|
||||
+cell A container for the lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code tokenizer]
|
||||
+cell #[code Tokenizer]
|
||||
+cell The tokenizer.
|
||||
|
||||
+row
|
||||
+cell #[code make_doc]
|
||||
+cell #[code lambda text: Doc]
|
||||
|
|
|
@ -212,62 +212,74 @@ p The L2 norm of the lexeme's vector representation.
|
|||
+row
|
||||
+cell #[code is_alpha]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isalpha()].
|
||||
+cell
|
||||
| Does the lexeme consist of alphabetic characters? Equivalent to
|
||||
| #[code lexeme.text.isalpha()].
|
||||
|
||||
+row
|
||||
+cell #[code is_ascii]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
|
||||
+cell
|
||||
| Does the lexeme consist of ASCII characters? Equivalent to
|
||||
| #[code all(ord(c) < 128 for c in lexeme.text)].
|
||||
|
||||
+row
|
||||
+cell #[code is_digit]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isdigit()].
|
||||
+cell
|
||||
| Does the lexeme consist of digits? Equivalent to
|
||||
| #[code lexeme.text.isdigit()].
|
||||
|
||||
+row
|
||||
+cell #[code is_lower]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.islower()].
|
||||
+cell
|
||||
| Is the lexeme in lowercase? Equivalent to
|
||||
| #[code lexeme.text.islower()].
|
||||
|
||||
+row
|
||||
+cell #[code is_title]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.istitle()].
|
||||
+cell
|
||||
| Is the lexeme in titlecase? Equivalent to
|
||||
| #[code lexeme.text.istitle()].
|
||||
|
||||
+row
|
||||
+cell #[code is_punct]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.ispunct()].
|
||||
+cell Is the lexeme punctuation?
|
||||
|
||||
+row
|
||||
+cell #[code is_space]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isspace()].
|
||||
+cell
|
||||
| Does the lexeme consist of whitespace characters? Equivalent to
|
||||
| #[code lexeme.text.isspace()].
|
||||
|
||||
+row
|
||||
+cell #[code like_url]
|
||||
+cell bool
|
||||
+cell Does the word resemble a URL?
|
||||
+cell Does the lexeme resemble a URL?
|
||||
|
||||
+row
|
||||
+cell #[code like_num]
|
||||
+cell bool
|
||||
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
|
||||
+cell Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc.
|
||||
|
||||
+row
|
||||
+cell #[code like_email]
|
||||
+cell bool
|
||||
+cell Does the word resemble an email address?
|
||||
+cell Does the lexeme resemble an email address?
|
||||
|
||||
+row
|
||||
+cell #[code is_oov]
|
||||
+cell bool
|
||||
+cell Is the word out-of-vocabulary?
|
||||
+cell Is the lexeme out-of-vocabulary?
|
||||
|
||||
+row
|
||||
+cell #[code is_stop]
|
||||
+cell bool
|
||||
+cell Is the word part of a "stop list"?
|
||||
+cell Is the lexeme part of a "stop list"?
|
||||
|
||||
+row
|
||||
+cell #[code lang]
|
||||
|
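p
    | A quick sketch of checking a few of these flags on a lexeme. The
    | word is illustrative:

+aside-code("Example").
    apple = nlp.vocab[u'apple']
    assert apple.is_alpha and apple.is_lower
    assert not apple.is_digit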
|
|
@ -5,13 +5,14 @@ include ../../_includes/_mixins
|
|||
p Match sequences of tokens, based on pattern rules.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
||||
| are deprecated and have been replaced with a simpler
|
||||
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
||||
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
|
||||
| is now called #[+api("matcher#get") #[code matcher.get]].
|
||||
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
|
||||
| and #[code Matcher.has_entity] (now redundant) have been removed.
|
||||
.o-block
|
||||
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
||||
| are deprecated and have been replaced with a simpler
|
||||
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
||||
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
|
||||
| is now called #[+api("matcher#get") #[code matcher.get]].
|
||||
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
|
||||
| and #[code Matcher.has_entity] (now redundant) have been removed.
|
||||
|
||||
+h(2, "init") Matcher.__init__
|
||||
+tag method
|
||||
|
@ -56,17 +57,6 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
|
|||
doc = nlp(u'hello world!')
|
||||
matches = matcher(doc)
|
||||
|
||||
+infobox("Important note")
|
||||
| By default, the matcher #[strong does not perform any action] on matches,
|
||||
| like tagging matched phrases with entity types. Instead, actions need to
|
||||
| be specified when #[strong adding patterns or entities], by
|
||||
| passing in a callback function as the #[code on_match] argument on
|
||||
| #[+api("matcher#add") #[code add]]. This allows you to define custom
|
||||
| actions per pattern within the same matcher. For example, you might only
|
||||
| want to merge some entity types, and set custom flags for other matched
|
||||
| patterns. For more details and examples, see the usage workflow on
|
||||
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
|
@ -81,6 +71,17 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
|
|||
| matches. A match tuple describes a span #[code doc[start:end]].
|
||||
| The #[code match_id] is the ID of the added match pattern.
|
||||
|
||||
+infobox("Important note")
|
||||
| By default, the matcher #[strong does not perform any action] on matches,
|
||||
| like tagging matched phrases with entity types. Instead, actions need to
|
||||
| be specified when #[strong adding patterns or entities], by
|
||||
| passing in a callback function as the #[code on_match] argument on
|
||||
| #[+api("matcher#add") #[code add]]. This allows you to define custom
|
||||
| actions per pattern within the same matcher. For example, you might only
|
||||
| want to merge some entity types, and set custom flags for other matched
|
||||
| patterns. For more details and examples, see the usage workflow on
|
||||
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
|
||||
|
||||
+h(2, "pipe") Matcher.pipe
|
||||
+tag method
|
||||
|
||||
|
@ -118,6 +119,7 @@ p Match a stream of documents, yielding them in turn.
|
|||
|
||||
+h(2, "len") Matcher.__len__
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Get the number of rules added to the matcher. Note that this only returns
|
||||
|
@ -138,6 +140,7 @@ p
|
|||
|
||||
+h(2, "contains") Matcher.__contains__
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Check whether the matcher contains rules for a match ID.
|
||||
|
||||
|
@ -159,6 +162,7 @@ p Check whether the matcher contains rules for a match ID.
|
|||
|
||||
+h(2, "add") Matcher.add
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and
|
||||
|
@ -198,8 +202,23 @@ p
|
|||
| Match pattern. A pattern consists of a list of dicts, where each
|
||||
| dict describes a token.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
.o-block
|
||||
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
||||
| are deprecated and have been replaced with a simpler
|
||||
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
||||
| patterns and a callback for a given match ID.
|
||||
|
||||
+code-new.
|
||||
matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
|
||||
|
||||
+code-old.
|
||||
matcher.add_entity('GoogleNow', on_match=merge_phrases)
|
||||
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
|
||||
|
||||
+h(2, "remove") Matcher.remove
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Remove a rule from the matcher. A #[code KeyError] is raised if the match
|
||||
|
@ -219,6 +238,7 @@ p
|
|||
|
||||
+h(2, "get") Matcher.get
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Retrieve the pattern stored for a key. Returns the rule as an
|
||||
|
|
|
@ -20,12 +20,7 @@ p
|
|||
nlp = spacy.load('/path/to/en') # unicode path
|
||||
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||
| will also raise an error if no model could be loaded and never just
|
||||
| return an empty #[code Language] object. If you need a blank language,
|
||||
| you need to import it explicitly (#[code from spacy.lang.en import English])
|
||||
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
|
||||
nlp = spacy.load('en', disable=['parser', 'tagger'])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -34,15 +29,28 @@ p
|
|||
+cell Model to load, i.e. shortcut link, package name or path.
|
||||
|
||||
+row
|
||||
+cell #[code **overrides]
|
||||
+cell -
|
||||
+cell Override or disable components.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell A #[code Language] object with the loaded model.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
.o-block
|
||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||
| will also raise an error if no model could be loaded and never just
|
||||
| return an empty #[code Language] object. If you need a blank language,
|
||||
| you need to import it explicitly (#[code from spacy.lang.en import English])
|
||||
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
|
||||
|
||||
+code-new nlp = spacy.load('/model')
|
||||
+code-old nlp = spacy.load('en', path='/model')
|
||||
|
||||
+h(2, "info") spacy.info
|
||||
+tag function
|
||||
|
||||
|
@ -98,3 +106,37 @@ p
|
|||
+cell returns
|
||||
+cell unicode
|
||||
+cell The explanation, or #[code None] if not found in the glossary.
|
||||
|
||||
+h(2, "set_factory") spacy.set_factory
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Set a factory that returns a custom
|
||||
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline]
|
||||
| component. Factories are useful for creating stateful components,
| especially ones which depend on shared data.
|
||||
|
||||
+aside-code("Example").
|
||||
def my_factory(vocab):
|
||||
def my_component(doc):
|
||||
return doc
|
||||
return my_component
|
||||
|
||||
spacy.set_factory('my_factory', my_factory)
|
||||
nlp = Language(pipeline=['my_factory'])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code factory_id]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Unique name of factory. If added to a new pipeline, spaCy will
|
||||
| look up the factory for this ID and use it to create the
|
||||
| component.
|
||||
|
||||
+row
|
||||
+cell #[code factory]
|
||||
+cell callable
|
||||
+cell
|
||||
| Callable that takes a #[code Vocab] object and returns a pipeline
|
||||
| component.
|
||||
|
|
|
@ -104,6 +104,7 @@ p
|
|||
|
||||
+h(2, "to_disk") StringStore.to_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Save the current state to a directory.
|
||||
|
||||
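p
    | A minimal usage sketch (the path is illustrative):

+aside-code("Example").
    stringstore.to_disk('/path/to/strings')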
|
@ -118,8 +119,9 @@ p Save the current state to a directory.
|
|||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+h(2, "from_disk") Tokenizer.from_disk
|
||||
+h(2, "from_disk") StringStore.from_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
|
@ -137,10 +139,10 @@ p Loads state from a directory. Modifies the object in place and returns it.
|
|||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The modified #[code Tokenizer] object.
|
||||
+cell #[code StringStore]
|
||||
+cell The modified #[code StringStore] object.
|
||||
|
||||
+h(2, "to_bytes") Tokenizer.to_bytes
|
||||
+h(2, "to_bytes") StringStore.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
@ -157,9 +159,9 @@ p Serialize the current state to a binary string.
|
|||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code Tokenizer] object.
|
||||
+cell The serialized form of the #[code StringStore] object.
|
||||
|
||||
+h(2, "from_bytes") Tokenizer.from_bytes
|
||||
+h(2, "from_bytes") StringStore.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
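p
    | A minimal round-trip sketch. The variable names are illustrative:

+aside-code("Example").
    from spacy.strings import StringStore
    store_bytes = stringstore.to_bytes()
    new_store = StringStore().from_bytes(store_bytes)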
|
|
|
@ -370,116 +370,131 @@ p The L2 norm of the token's vector representation.
|
|||
+cell #[code lemma]
|
||||
+cell int
|
||||
+cell
|
||||
| Base form of the word, with no inflectional suffixes.
|
||||
| Base form of the token, with no inflectional suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code lemma_]
|
||||
+cell unicode
|
||||
+cell Base form of the word, with no inflectional suffixes.
|
||||
+cell Base form of the token, with no inflectional suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code lower]
|
||||
+cell int
|
||||
+cell Lower-case form of the word.
|
||||
+cell Lower-case form of the token.
|
||||
|
||||
+row
|
||||
+cell #[code lower_]
|
||||
+cell unicode
|
||||
+cell Lower-case form of the word.
|
||||
+cell Lower-case form of the token.
|
||||
|
||||
+row
|
||||
+cell #[code shape]
|
||||
+cell int
|
||||
+cell Transform of the word's string, to show orthographic features.
|
||||
+cell
|
||||
| Transform of the token's string, to show orthographic features.
|
||||
| For example, "Xxxx" or "dd".
|
||||
|
||||
+row
|
||||
+cell #[code shape_]
|
||||
+cell unicode
|
||||
+cell A transform of the word's string, to show orthographic features.
|
||||
| Transform of the token's string, to show orthographic features.
|
||||
| For example, "Xxxx" or "dd".
|
||||
|
||||
+row
|
||||
+cell #[code prefix]
|
||||
+cell int
|
||||
+cell Integer ID of a length-N substring from the start of the
|
||||
| word. Defaults to #[code N=1].
|
||||
| token. Defaults to #[code N=1].
|
||||
|
||||
+row
|
||||
+cell #[code prefix_]
|
||||
+cell unicode
|
||||
+cell
|
||||
| A length-N substring from the start of the word. Defaults to
|
||||
| A length-N substring from the start of the token. Defaults to
|
||||
| #[code N=1].
|
||||
|
||||
+row
|
||||
+cell #[code suffix]
|
||||
+cell int
|
||||
+cell
|
||||
| Length-N substring from the end of the word. Defaults to #[code N=3].
|
||||
| Length-N substring from the end of the token. Defaults to #[code N=3].
|
||||
|
||||
+row
|
||||
+cell #[code suffix_]
|
||||
+cell unicode
|
||||
+cell Length-N substring from the end of the word. Defaults to #[code N=3].
|
||||
+cell Length-N substring from the end of the token. Defaults to #[code N=3].
|
||||
|
||||
+row
|
||||
+cell #[code is_alpha]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isalpha()].
|
||||
+cell
|
||||
| Does the token consist of alphabetic characters? Equivalent to
|
||||
| #[code token.text.isalpha()].
|
||||
|
||||
+row
|
||||
+cell #[code is_ascii]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
|
||||
+cell
|
||||
| Does the token consist of ASCII characters? Equivalent to
|
||||
| #[code all(ord(c) < 128 for c in token.text)].
|
||||
|
||||
+row
|
||||
+cell #[code is_digit]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isdigit()].
|
||||
+cell
|
||||
| Does the token consist of digits? Equivalent to
|
||||
| #[code token.text.isdigit()].
|
||||
|
||||
+row
|
||||
+cell #[code is_lower]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.islower()].
|
||||
+cell
|
||||
| Is the token in lowercase? Equivalent to
|
||||
| #[code token.text.islower()].
|
||||
|
||||
+row
|
||||
+cell #[code is_title]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.istitle()].
|
||||
+cell
|
||||
| Is the token in titlecase? Equivalent to
|
||||
| #[code token.text.istitle()].
|
||||
|
||||
+row
|
||||
+cell #[code is_punct]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.ispunct()].
|
||||
+cell Is the token punctuation?
|
||||
|
||||
+row
|
||||
+cell #[code is_space]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isspace()].
|
||||
+cell
|
||||
| Does the token consist of whitespace characters? Equivalent to
|
||||
| #[code token.text.isspace()].
|
||||
|
||||
+row
|
||||
+cell #[code like_url]
|
||||
+cell bool
|
||||
+cell Does the word resemble a URL?
|
||||
+cell Does the token resemble a URL?
|
||||
|
||||
+row
|
||||
+cell #[code like_num]
|
||||
+cell bool
|
||||
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
|
||||
+cell Does the token represent a number? e.g. "10.9", "10", "ten", etc.
|
||||
|
||||
+row
|
||||
+cell #[code like_email]
|
||||
+cell bool
|
||||
+cell Does the word resemble an email address?
|
||||
+cell Does the token resemble an email address?
|
||||
|
||||
+row
|
||||
+cell #[code is_oov]
|
||||
+cell bool
|
||||
+cell Is the word out-of-vocabulary?
|
||||
+cell Is the token out-of-vocabulary?
|
||||
|
||||
+row
|
||||
+cell #[code is_stop]
|
||||
+cell bool
|
||||
+cell Is the word part of a "stop list"?
|
||||
+cell Is the token part of a "stop list"?
|
||||
|
||||
+row
|
||||
+cell #[code pos]
|
||||
|
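p
    | A short sketch of a few of these attributes in action. The example
    | text is illustrative:

+aside-code("Example").
    doc = nlp(u'Give it back')
    give = doc[0]
    assert give.prefix_ == u'G'
    assert give.suffix_ == u'ive'
    assert give.is_title and give.is_alpha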
|
|
@ -198,91 +198,6 @@ p
|
|||
| attributes. The #[code ORTH] fields of the attributes must
|
||||
| exactly match the string when they are concatenated.
|
||||
|
||||
+h(2, "to_disk") Tokenizer.to_disk
|
||||
+tag method
|
||||
|
||||
p Save the current state to a directory.
|
||||
|
||||
+aside-code("Example").
|
||||
tokenizer.to_disk('/path/to/tokenizer')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+h(2, "from_disk") Tokenizer.from_disk
|
||||
+tag method
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokenizer import Tokenizer
|
||||
tokenizer = Tokenizer(nlp.vocab)
|
||||
tokenizer = tokenizer.from_disk('/path/to/tokenizer')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory. Paths may be either strings or
|
||||
| #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The modified #[code Tokenizer] object.
|
||||
|
||||
+h(2, "to_bytes") Tokenizer.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
tokenizer_bytes = tokenizer.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code Tokenizer] object.
|
||||
|
||||
+h(2, "from_bytes") Tokenizer.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokenizer import Tokenizer
|
||||
tokenizer_bytes = tokenizer.to_bytes()
|
||||
new_tokenizer = Tokenizer(nlp.vocab)
|
||||
new_tokenizer.from_bytes(tokenizer_bytes)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code bytes_data]
|
||||
+cell bytes
|
||||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The #[code Tokenizer] object.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
|
|
@ -76,6 +76,7 @@ p
|
|||
|
||||
+h(2, "resolve_model_path") util.resolve_model_path
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p Resolve a model name or string to a model path.
|
||||
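p
    | For instance, assuming #[code 'en'] is an installed shortcut link
    | or package, as described above:

+aside-code("Example").
    from spacy import util
    model_path = util.resolve_model_path('en')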
|
||||
|
@ -169,6 +170,7 @@ p
|
|||
|
||||
+h(2, "is_in_jupyter") util.is_in_jupyter
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter]
|
||||
|
@ -221,6 +223,7 @@ p
|
|||
|
||||
+h(2, "prints") util.prints
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Print a formatted, text-wrapped message with optional title. If a text
|
||||
|
|
|
@ -159,6 +159,7 @@ p
|
|||
|
||||
+h(2, "to_disk") Vocab.to_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Save the current state to a directory.
|
||||
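p
    | A minimal usage sketch (the path is illustrative):

+aside-code("Example").
    nlp.vocab.to_disk('/path/to/vocab')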
|
||||
|
@ -175,6 +176,7 @@ p Save the current state to a directory.
|
|||
|
||||
+h(2, "from_disk") Vocab.from_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
|
|
|
@ -80,7 +80,7 @@
|
|||
},
|
||||
|
||||
"customizing-tokenizer": {
|
||||
"title": "Customizing the tokenizer",
|
||||
"title": "Customising the tokenizer",
|
||||
"next": "rule-based-matching"
|
||||
},
|
||||
|
||||
|
|
|
@ -48,3 +48,13 @@ p
|
|||
+cell ner
|
||||
+cell #[+api("entityrecognizer") #[code EntityRecognizer]]
|
||||
+cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]
|
||||
|
||||
p
|
||||
| The processing pipeline always #[strong depends on the statistical model]
|
||||
| and its capabilities. For example, a pipeline can only include an entity
|
||||
| recognizer component if the model includes data to make predictions of
|
||||
| entity labels. This is why each model will specify the pipeline to use
|
||||
| in its meta data, as a simple list containing the component names:
|
||||
|
||||
+code(false, "json").
|
||||
"pipeline": ["vectorizer", "tagger", "parser", "ner"]
|
||||
|
|
|
@ -22,10 +22,10 @@ p
|
|||
| untrusted sources.
|
||||
|
||||
p
|
||||
| All container classes and pipeline components, i.e.
|
||||
for cls in ["Doc", "Language", "Tokenizer", "Tagger", "DependencyParser", "EntityRecognizer", "Vocab", "StringStore"]
|
||||
| #[+api(cls.toLowerCase()) #[code=cls]],
|
||||
| have the following methods available:
|
||||
| All container classes, i.e. #[+api("language") #[code Language]],
|
||||
| #[+api("doc") #[code Doc]], #[+api("vocab") #[code Vocab]] and
|
||||
| #[+api("stringstore") #[code StringStore]] have the following methods
|
||||
| available:
|
||||
|
||||
+table(["Method", "Returns", "Example"])
|
||||
- style = [1, 0, 1]
|
||||
|
@ -34,7 +34,35 @@ p
|
|||
+annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style)
|
||||
+annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style)
|
||||
|
||||
p
|
||||
| For example, if you've processed a very large document, you can use
|
||||
| #[+api("doc#to_disk") #[code Doc.to_disk]] to save it to a file on your
|
||||
| local machine. This will save the document and its tokens, as well as
|
||||
| the vocabulary associated with the #[code Doc].
|
||||
|
||||
+aside("Why saving the vocab?")
|
||||
| Saving the vocabulary with the #[code Doc] is important, because the
|
||||
| #[code Vocab] holds the context-independent information about the words,
|
||||
| tags and labels, and their #[strong integer IDs]. If the #[code Vocab]
|
||||
| wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve
|
||||
| those IDs – for example, the word text or the dependency labels. You
|
||||
| might be saving #[code 446] for "whale", but in a different vocabulary,
|
||||
| this ID could map to "VERB". Similarly, if your document was processed by
|
||||
| a German model, its vocab will include the specific
|
||||
| #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels].
|
||||
|
||||
+code.
|
||||
moby_dick = open('moby_dick.txt', 'r').read() # read a large document
|
||||
doc = nlp(moby_dick) # process it
|
||||
doc.to_disk('/moby_dick.bin') # save the processed Doc
|
||||
|
||||
p
|
||||
| If you need it again later, you can load it back into an empty #[code Doc]
|
||||
| with an empty #[code Vocab] by calling
|
||||
| #[+api("doc#from_disk") #[code from_disk()]]:
|
||||
|
||||
+code.
|
||||
from spacy.tokens import Doc # to create empty Doc
|
||||
from spacy.vocab import Vocab # to create empty Vocab
|
||||
|
||||
doc = Doc(Vocab()).from_disk('/moby_dick.bin') # load processed Doc
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
//- 💫 DOCS > USAGE > SPACY 101 > TRAINING
|
||||
|
||||
p
|
||||
+under-construction
|
||||
|
|
92
website/docs/usage/_spacy-101/_vocab-stringstore.jade
Normal file
|
@ -0,0 +1,92 @@
|
|||
//- 💫 DOCS > USAGE > SPACY 101 > VOCAB & STRINGSTORE
|
||||
|
||||
p
|
||||
| Whenever possible, spaCy tries to store data in a vocabulary, the
|
||||
| #[+api("vocab") #[code Vocab]], that will be
|
||||
| #[strong shared by multiple documents]. To save memory, spaCy also
|
||||
| encodes all strings to #[strong integer IDs] – in this case for example,
|
||||
| "coffee" has the ID #[code 3672]. Entity labels like "ORG" and
|
||||
| part-of-speech tags like "VERB" are also encoded. Internally, spaCy
|
||||
| only "speaks" in integer IDs.
|
||||
|
||||
+aside
|
||||
| #[strong Token]: A word, punctuation mark etc. #[em in context], including
|
||||
| its attributes, tags and dependencies.#[br]
|
||||
| #[strong Lexeme]: A "word type" with no context. Includes the word shape
|
||||
| and flags, e.g. if it's lowercase, a digit or punctuation.#[br]
|
||||
| #[strong Doc]: A processed container of tokens in context.#[br]
|
||||
| #[strong Vocab]: The collection of lexemes.#[br]
|
||||
| #[strong StringStore]: The dictionary mapping integer IDs to strings, for
|
||||
| example #[code 3572] → "coffee".
|
||||
|
||||
+image
|
||||
include ../../../assets/img/docs/vocab_stringstore.svg
|
||||
.u-text-right
|
||||
+button("/assets/img/docs/vocab_stringstore.svg", false, "secondary").u-text-tag View large graphic
|
||||
|
||||
p
|
||||
| If you process lots of documents containing the word "coffee" in all
|
||||
| kinds of different contexts, storing the exact string "coffee" every time
|
||||
| would take up way too much space. So instead, spaCy assigns it an ID
|
||||
| and stores it in the #[+api("stringstore") #[code StringStore]]. You can
|
||||
| think of the #[code StringStore] as a
|
||||
| #[strong lookup table that works in both directions] – you can look up a
|
||||
| string to get its ID, or an ID to get its string:
|
||||
|
||||
+code.
|
||||
doc = nlp(u'I like coffee')
|
||||
assert doc.vocab.strings[u'coffee'] == 3572
|
||||
assert doc.vocab.strings[3572] == u'coffee'
|
||||
|
||||
p
|
||||
| Now that all strings are encoded, the entries in the vocabulary
|
||||
| #[strong don't need to include the word text] themselves. Instead,
|
||||
| they can look it up in the #[code StringStore] via its integer ID. Each
|
||||
| entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]],
|
||||
| contains the #[strong context-independent] information about a word.
|
||||
| For example, no matter if "love" is used as a verb or a noun in some
|
||||
| context, its spelling and whether it consists of alphabetic characters
|
||||
| won't ever change.
|
||||
|
||||
+code.
|
||||
for word in doc:
|
||||
lexeme = doc.vocab[word.text]
|
||||
print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
|
||||
lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)
|
||||
|
||||
+aside
|
||||
| #[strong Text]: The original text of the lexeme.#[br]
|
||||
| #[strong Orth]: The integer ID of the lexeme.#[br]
|
||||
| #[strong Shape]: The abstract word shape of the lexeme.#[br]
|
||||
| #[strong Prefix]: By default, the first letter of the word string.#[br]
|
||||
| #[strong Suffix]: By default, the last three letters of the word string.#[br]
|
||||
| #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br]
|
||||
| #[strong is digit]: Does the lexeme consist of digits?#[br]
|
||||
| #[strong is title]: Is the lexeme in titlecase?#[br]
|
||||
| #[strong Lang]: The language of the parent vocabulary.
|
||||
|
||||
+table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit", "is_title", "lang"])
|
||||
- var style = [0, 1, 1, 0, 0, 1, 1, 1, 0]
|
||||
+annotation-row(["I", 508, "X", "I", "I", true, false, true, "en"], style)
|
||||
+annotation-row(["love", 949, "xxxx", "l", "ove", true, false, false, "en"], style)
|
||||
+annotation-row(["coffee", 3572, "xxxx", "c", "ffe", true, false, false, "en"], style)
|
||||
|
||||
p
|
||||
| The specific entries in the vocabulary and their IDs don't really matter –
|
||||
| #[strong as long as they match]. That's why you always need to make sure
|
||||
| all objects you create have access to the same vocabulary. If they don't,
|
||||
| the IDs won't match and spaCy will either produce very confusing results,
|
||||
| or fail altogether.
|
||||
|
||||
+code.
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
doc = nlp(u'I like coffee') # original Doc
|
||||
new_doc = Doc(Vocab(), words=['I', 'like', 'coffee']) # new Doc with empty Vocab
|
||||
assert doc.vocab.strings[u'coffee'] == 3572 # ID in vocab of Doc
|
||||
assert new_doc.vocab.strings[u'coffee'] == 446 # ID in vocab of new Doc
|
||||
|
||||
p
|
||||
| Even though both #[code Doc] objects contain the same words, the internal
|
||||
| integer IDs are very different.
|
|
@ -107,7 +107,6 @@ p
|
|||
.u-text-right
|
||||
+button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
|
||||
|
||||
|
||||
+table(["File name", "Variables", "Description"])
|
||||
+row
|
||||
+cell #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py]
|
||||
|
@ -439,7 +438,7 @@ p
|
|||
|
||||
+h(3, "morph-rules") Morph rules
|
||||
|
||||
//- TODO: write morph rules section
|
||||
+under-construction
|
||||
|
||||
+h(2, "testing") Testing the new language tokenizer
|
||||
|
||||
|
@ -631,7 +630,7 @@ p
|
|||
| trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
|
||||
| The #[code vectors.bin] file should consist of one word and vector per line.
|
||||
|
||||
+aside-code("your_data_directory", "yaml").
|
||||
//-+aside-code("your_data_directory", "yaml").
|
||||
├── vocab/
|
||||
| ├── lexemes.bin
|
||||
| ├── strings.json
|
||||
|
@ -662,4 +661,4 @@ p
|
|||
| model use the using spaCy's #[+api("cli#train") #[code train]] command:
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
|
||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
|
||||
|
|
|
@ -17,6 +17,8 @@ p
|
|||
| #[+a("http://deeplearning.net/software/theano/") Theano] is also
|
||||
| supported.
|
||||
|
||||
+under-construction
|
||||
|
||||
+code("Runtime usage").
|
||||
def count_entity_sentiment(nlp, texts):
|
||||
'''Compute the net document sentiment for each entity in the texts.'''
|
||||
|
@ -153,7 +155,9 @@ p
|
|||
| adding another LSTM layer, using attention mechanism, using character
|
||||
| features, etc.
|
||||
|
||||
+h(2, "attribute-hooks") Attribute hooks (experimental)
|
||||
+h(2, "attribute-hooks") Attribute hooks
|
||||
|
||||
+under-construction
|
||||
|
||||
p
|
||||
| Earlier, we saw how to store data in the new generic #[code user_data]
|
||||
|
|
|
@ -322,8 +322,9 @@ p
|
|||
| If you don't need a particular component of the pipeline – for
|
||||
| example, the tagger or the parser, you can disable loading it. This can
|
||||
| sometimes make a big difference and improve loading speed. Disabled
|
||||
| component names can be provided to #[code spacy.load], #[code from_disk]
|
||||
| or the #[code nlp] object itself as a list:
|
||||
| component names can be provided to #[+api("spacy#load") #[code spacy.load]],
|
||||
| #[+api("language#from_disk") #[code Language.from_disk]] or the
|
||||
| #[code nlp] object itself as a list:
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('en', disable=['parser', 'tagger'])
|
||||
|
|
|
@ -35,7 +35,7 @@ p
|
|||
assert doc[0].text == u'Peach'
|
||||
assert doc[1].text == u'emoji'
|
||||
assert doc[-1].text == u'🍑'
|
||||
assert doc[17:19] == u'outranking eggplant'
|
||||
assert doc[17:19].text == u'outranking eggplant'
|
||||
assert list(doc.noun_chunks)[0].text == u'Peach emoji'
|
||||
|
||||
sentences = list(doc.sents)
|
||||
|
|
|
@ -2,16 +2,18 @@
|
|||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
+under-construction
|
||||
|
||||
+h(2, "multithreading") Multi-threading with #[code .pipe()]
|
||||
|
||||
p
|
||||
| If you have a sequence of documents to process, you should use the
|
||||
| #[+api("language#pipe") #[code .pipe()]] method. The method takes an
|
||||
| iterator of texts, and accumulates an internal buffer,
|
||||
| #[+api("language#pipe") #[code Language.pipe()]] method. The method takes
|
||||
| an iterator of texts, and accumulates an internal buffer,
|
||||
| which it works on in parallel. It then yields the documents in order,
|
||||
| one-by-one. After a long and bitter struggle, the global interpreter
|
||||
| lock was freed around spaCy's main parsing loop in v0.100.3. This means
|
||||
| that the #[code .pipe()] method will be significantly faster in most
|
||||
| that #[code .pipe()] will be significantly faster in most
|
||||
| practical situations, because it allows shared memory parallelism.
|
||||
|
||||
+code.
|
||||
|
@ -20,23 +22,27 @@ p
|
|||
|
||||
p
|
||||
| To make full use of the #[code .pipe()] function, you might want to
|
||||
| brush up on Python generators. Here are a few quick hints:
|
||||
| brush up on #[strong Python generators]. Here are a few quick hints:
|
||||
|
||||
+list
|
||||
+item
|
||||
| Generator comprehensions can be written
|
||||
| (#[code item for item in sequence])
|
||||
| Generator comprehensions can be written as
|
||||
| #[code (item for item in sequence)].
|
||||
|
||||
+item
|
||||
| The #[code itertools] built-in library and the #[code cytoolz]
|
||||
| package provide a lot of handy generator tools
|
||||
| The
|
||||
| #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library]
|
||||
| and the
|
||||
| #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package]
|
||||
| provide a lot of handy #[strong generator tools].
|
||||
|
||||
+item
|
||||
| Often you'll have an input stream that pairs text with some
|
||||
| important metadata, e.g. a JSON document. To pair up the metadata
|
||||
| with the processed #[code Doc] object, you should use the tee
|
||||
| function to split the generator in two, and then #[code izip] the
|
||||
| extra stream to the document stream.
|
||||
| important meta data, e.g. a JSON document. To
|
||||
| #[strong pair up the meta data] with the processed #[code Doc]
|
||||
| object, you should use the #[code itertools.tee] function to split
|
||||
| the generator in two, and then #[code izip] the extra stream to the
|
||||
| document stream, as shown in the sketch below.
|
||||
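p
    | A minimal sketch of that pattern, assuming Python 3, where
    | #[code zip] is lazy. On Python 2, use #[code itertools.izip]
    | instead:

+code.
    import itertools

    data = [(u'First text', {'id': 1}), (u'Second text', {'id': 2})]
    stream1, stream2 = itertools.tee(data)       # split the stream in two
    texts = (text for text, meta in stream1)     # raw texts only
    metadata = (meta for text, meta in stream2)  # meta data only
    for doc, meta in zip(nlp.pipe(texts), metadata):
        print(doc.text, meta['id'])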
|
||||
+h(2, "own-annotations") Bringing your own annotations
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@ include ../../_includes/_mixins
|
|||
|
||||
+h(2, "features") Features
|
||||
|
||||
+under-construction
|
||||
|
||||
+aside
|
||||
| If one of spaCy's functionalities #[strong needs a model], it means that
|
||||
| you need to have one of the available
|
||||
|
@ -91,17 +93,35 @@ p
|
|||
|
||||
include _spacy-101/_tokenization
|
||||
|
||||
+infobox
|
||||
| To learn more about how spaCy's tokenizer and its rules work in detail,
|
||||
| how to #[strong customise] it and how to #[strong add your own tokenizer]
|
||||
| to a processing pipeline, see the usage guide on
|
||||
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
|
||||
|
||||
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
|
||||
+tag-model("dependency parse")
|
||||
|
||||
include _spacy-101/_pos-deps
|
||||
|
||||
+infobox
|
||||
| To learn more about #[strong part-of-speech tagging] and rule-based
|
||||
| morphology, and how to #[strong navigate and use the parse tree]
|
||||
| effectively, see the usage guides on
|
||||
| #[+a("/docs/usage/pos-tagging") part-of-speech tagging] and
|
||||
| #[+a("/docs/usage/dependency-parse") using the dependency parse].
|
||||
|
||||
+h(3, "annotations-ner") Named Entities
|
||||
+tag-model("named entities")
|
||||
|
||||
include _spacy-101/_named-entities
|
||||
|
||||
+infobox
|
||||
| To learn more about entity recognition in spaCy, how to
|
||||
| #[strong add your own entities] to a document and how to train and update
|
||||
| the entity predictions of a model, see the usage guide on
|
||||
| #[+a("/docs/usage/entity-recognition") named entity recognition].
|
||||
|
||||
+h(2, "vectors-similarity") Word vectors and similarity
|
||||
+tag-model("vectors")
|
||||
|
||||
|
@ -109,20 +129,43 @@ include _spacy-101/_similarity
|
|||
|
||||
include _spacy-101/_word-vectors
|
||||
|
||||
+infobox
|
||||
| To learn more about word vectors, how to #[strong customise them] and
|
||||
| how to load #[strong your own vectors] into spaCy, see the usage
|
||||
| guide on
|
||||
| #[+a("/docs/usage/word-vectors-similarities") using word vectors and semantic similarities].
|
||||
|
||||
+h(2, "pipelines") Pipelines
|
||||
|
||||
include _spacy-101/_pipelines
|
||||
|
||||
+infobox
|
||||
| To learn more about #[strong how processing pipelines work] in detail,
|
||||
| how to enable and disable their components, and how to
|
||||
| #[strong create your own], see the usage guide on
|
||||
| #[+a("/docs/usage/language-processing-pipeline") language processing pipelines].
|
||||
|
||||
+h(2, "vocab-stringstore") Vocab, lexemes and the string store
|
||||
|
||||
include _spacy-101/_vocab-stringstore
|
||||
|
||||
+h(2, "serialization") Serialization
|
||||
|
||||
include _spacy-101/_serialization
|
||||
|
||||
+infobox
|
||||
| To learn more about #[strong serialization] and how to
|
||||
| #[strong save and load your own models], see the usage guide on
|
||||
| #[+a("/docs/usage/saving-loading") saving, loading and data serialization].
|
||||
|
||||
+h(2, "training") Training
|
||||
|
||||
include _spacy-101/_training
|
||||
|
||||
+h(2, "architecture") Architecture
|
||||
|
||||
+under-construction
|
||||
|
||||
+image
|
||||
include ../../assets/img/docs/architecture.svg
|
||||
.u-text-right
|
||||
|
|
|
@ -64,44 +64,10 @@ p
|
|||
| predicts the new category with minimal difference from the previous
|
||||
| output.
|
||||
|
||||
+h(2, "saving-loading") Saving and loading
|
||||
|
||||
p
|
||||
| After training our model, you'll usually want to save its state, and load
|
||||
| it back later. You can do this with the #[code Language.save_to_directory()]
|
||||
| method:
|
||||
|
||||
+code.
|
||||
nlp.save_to_directory('/home/me/data/en_technology')
|
||||
|
||||
p
|
||||
| To make the model more convenient to deploy, we recommend wrapping it as
|
||||
| a Python package, so that you can install it via pip and load it as a
|
||||
| module. spaCy comes with a handy #[+api("cli#package") #[code package]]
|
||||
| CLI command to create all required files and directories.
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy package /home/me/data/en_technology /home/me/my_models
|
||||
|
||||
p
|
||||
| To build the package and create a #[code .tar.gz] archive, run
|
||||
| #[code python setup.py sdist] from within its directory.
|
||||
|
||||
+infobox("Saving and loading models")
|
||||
| For more information and a detailed guide on how to package your model,
|
||||
| see the documentation on
|
||||
| #[+a("/docs/usage/saving-loading") saving and loading models].
|
||||
|
||||
p
|
||||
| After you've generated and installed the package, you'll be able to
|
||||
| load the model as follows:
|
||||
|
||||
+code.
|
||||
import en_technology
|
||||
nlp = en_technology.load()
|
||||
|
||||
+h(2, "example") Example: Adding and training an #[code ANIMAL] entity
|
||||
|
||||
+under-construction
|
||||
|
||||
p
|
||||
| This script shows how to add a new entity type to an existing pre-trained
|
||||
| NER model. To keep the example short and simple, only four sentences are
|
||||
|
@ -170,5 +136,33 @@ p
|
|||
|
||||
p
|
||||
| After training your model, you can
|
||||
| #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping
|
||||
| models as Python packages, for ease of deployment.
|
||||
| #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend
|
||||
| wrapping models as Python packages, for ease of deployment.
|
||||
|
||||
+h(2, "saving-loading") Saving and loading
|
||||
|
||||
p
|
||||
| After training our model, you'll usually want to save its state, and load
|
||||
| it back later. You can do this with the
|
||||
| #[+api("language#to_disk") #[code Language.to_disk()]] method:
|
||||
|
||||
+code.
|
||||
nlp.to_disk('/home/me/data/en_technology')
|
||||
|
||||
p
|
||||
| To make the model more convenient to deploy, we recommend wrapping it as
|
||||
| a Python package, so that you can install it via pip and load it as a
|
||||
| module. spaCy comes with a handy #[+api("cli#package") #[code package]]
|
||||
| CLI command to create all required files and directories.
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy package /home/me/data/en_technology /home/me/my_models
|
||||
|
||||
p
|
||||
| To build the package and create a #[code .tar.gz] archive, run
|
||||
| #[code python setup.py sdist] from within its directory.
|
||||
|
||||
+infobox("Saving and loading models")
|
||||
| For more information and a detailed guide on how to package your model,
|
||||
| see the documentation on
|
||||
| #[+a("/docs/usage/saving-loading#models") saving and loading models].
|
||||
|
|
|
@ -81,59 +81,3 @@ p.o-inline-list
|
|||
|
||||
p
|
||||
+button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example
|
||||
|
||||
+h(2, "feature-templates") Customizing the feature extraction
|
||||
|
||||
p
|
||||
| spaCy currently uses linear models for the tagger, parser and entity
|
||||
| recognizer, with weights learned using the
|
||||
| #[+a("https://explosion.ai/blog/part-of-speech-pos-tagger-in-python") Averaged Perceptron algorithm].
|
||||
|
||||
+aside("Linear Model Feature Scheme")
|
||||
| For a list of the available feature atoms, see the #[+a("/docs/api/features") Linear Model Feature Scheme].
|
||||
|
||||
p
|
||||
| Because it's a linear model, it's important for accuracy to build
|
||||
| conjunction features out of the atomic predictors. Let's say you have
|
||||
| two atomic predictors asking, "What is the part-of-speech of the
|
||||
| previous token?", and "What is the part-of-speech of the previous
|
||||
| previous token?". These predictors will introduce a number of features,
|
||||
| e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
|
||||
| template introduces features such as #[code Prev-pos=NN&Prev-prev-pos=VBZ].
|
||||
|
||||
p
|
||||
| The feature extraction proceeds in two passes. In the first pass, we
|
||||
| fill an array with the values of all of the atomic predictors. In the
|
||||
| second pass, we iterate over the feature templates, and fill a small
|
||||
| temporary array with the predictors that will be combined into a
|
||||
| conjunction feature. Finally, we hash this array into a 64-bit integer,
|
||||
| using the MurmurHash algorithm. You can see this at work in the
|
||||
| #[+a(gh("thinc", "thinc/linear/features.pyx", "94dbe06fd3c8f24d86ab0f5c7984e52dbfcdc6cb")) #[code thinc.linear.features]] module.
|
||||
|
||||
p
|
||||
| It's very easy to change the feature templates, to create novel
|
||||
| combinations of the existing atomic predictors. There's currently no API
|
||||
| available to add new atomic predictors, though. You'll have to create a
|
||||
| subclass of the model, and write your own #[code set_featuresC] method.
|
||||
|
||||
p
|
||||
| The feature templates are passed in using the #[code features] keyword
|
||||
| argument to the constructors of the #[+api("tagger") #[code Tagger]],
|
||||
| #[+api("dependencyparser") #[code DependencyParser]] and
|
||||
| #[+api("entityrecognizer") #[code EntityRecognizer]]:
|
||||
|
||||
+code.
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.pipeline import Tagger
|
||||
from spacy.tagger import P2_orth, P1_orth
|
||||
from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth
|
||||
|
||||
vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
|
||||
tagger = Tagger(vocab, features=[(P2_orth, P2_cluster), (P1_orth, P1_cluster),
|
||||
(P2_orth,), (P1_orth,), (W_orth,),
|
||||
(N1_orth,), (N2_orth,)])
|
||||
|
||||
p
|
||||
| Custom feature templates can be passed to the #[code DependencyParser]
|
||||
| and #[code EntityRecognizer] as well, also using the #[code features]
|
||||
| keyword argument of the constructor.
|
||||
|
|
|
@ -50,9 +50,10 @@ p
|
|||
|
||||
p
|
||||
| spaCy's serialization API has been made consistent across classes and
|
||||
| objects. All container classes and pipeline components now have a
|
||||
| #[code to_bytes()], #[code from_bytes()], #[code to_disk()] and
|
||||
| #[code from_disk()] method that supports the Pickle protocol.
|
||||
| objects. All container classes, i.e. #[code Language], #[code Doc],
|
||||
| #[code Vocab] and #[code StringStore] now have a #[code to_bytes()],
|
||||
| #[code from_bytes()], #[code to_disk()] and #[code from_disk()] method
|
||||
| that supports the Pickle protocol.
|
||||
|
||||
p
|
||||
| The improved #[code spacy.load] makes loading models easier and more
|
||||
|
|
|
@ -334,7 +334,7 @@ p
|
|||
| token #[code <script src="malicious-code.js"><script>].
|
||||
| Instead of relying on the server to render and sanitize HTML, you
|
||||
| can do this on the client in JavaScript. displaCy.js creates
|
||||
| the SVG markup as DOM nodes and will never insert raw HTML.
|
||||
| the markup as DOM nodes and will never insert raw HTML.
|
||||
|
||||
p
|
||||
| The #[code parse_deps] function takes a #[code Doc] object and returns
|
||||
|
|
|
@ -23,41 +23,20 @@ p
|
|||
include _spacy-101/_similarity
|
||||
include _spacy-101/_word-vectors
|
||||
|
||||
|
||||
+h(2, "custom") Customising word vectors
|
||||
|
||||
+under-construction
|
||||
|
||||
p
|
||||
| By default, #[+api("token#vector") #[code Token.vector]] returns the
|
||||
| vector for its underlying #[+api("lexeme") #[code Lexeme]], while
|
||||
| #[+api("doc#vector") #[code Doc.vector]] and
|
||||
| #[+api("span#vector") #[code Span.vector]] return an average of the
|
||||
| vectors of their tokens.
|
||||
|
||||
p
|
||||
| You can customize these
|
||||
| vectors of their tokens. You can customize these
|
||||
| behaviours by modifying the #[code doc.user_hooks],
|
||||
| #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
|
||||
| dictionaries.
|
||||
|
||||
+code("Example").
|
||||
# TODO
|
||||
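p
    | As a minimal sketch of the idea: the #[code 'vector'] hook name is
    | real, but the max-pooling strategy is an illustrative choice, not a
    | library default.

+code.
    import numpy

    def doc_vector_max(doc):
        # hypothetical hook: pool token vectors with an element-wise max
        return numpy.max([token.vector for token in doc], axis=0)

    doc = nlp(u'The quick brown fox')
    doc.user_hooks['vector'] = doc_vector_max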
|
||||
p
|
||||
| You can load new word vectors from a file-like buffer using the
|
||||
| #[code vocab.load_vectors()] method. The file should be a
|
||||
| whitespace-delimited text file, where the word is in the first column,
|
||||
| and subsequent columns provide the vector data. For faster loading, you
|
||||
| can use the #[code vocab.vectors_from_bin_loc()] method, which accepts a
|
||||
| path to a binary file written by #[code vocab.dump_vectors()].
|
||||
|
||||
+code("Example").
|
||||
# TODO
|
||||
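p
    | A hedged sketch, assuming a whitespace-delimited vectors file as
    | described above. The path is illustrative:

+code.
    with open('/path/to/vectors.txt', 'r') as file_:
        nlp.vocab.load_vectors(file_)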
|
||||
p
|
||||
| You can also load vectors from memory by writing to the
|
||||
| #[+api("lexeme#vector") #[code Lexeme.vector]] property. If the vectors
|
||||
| you are writing are of different dimensionality
|
||||
| from the ones currently loaded, you should first call
|
||||
| #[code vocab.resize_vectors(new_size)].
|
||||
|
||||
+h(2, "similarity") Similarity
|
||||
|
||||
+under-construction
|
||||
|
|