Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-05-26 11:31:41 -05:00
commit 8af3100143
40 changed files with 856 additions and 444 deletions

View File

@ -173,13 +173,13 @@ class Language(object):
flat_list.append(pipe) flat_list.append(pipe)
self.pipeline = flat_list self.pipeline = flat_list
def __call__(self, text, **disabled): def __call__(self, text, disable=[]):
"""'Apply the pipeline to some text. The text can span multiple sentences, """'Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string and can contain arbtrary whitespace. Alignment into the original string
is preserved. is preserved.
text (unicode): The text to be processed. text (unicode): The text to be processed.
**disabled: Elements of the pipeline that should not be run. disable (list): Names of the pipeline components to disable.
RETURNS (Doc): A container for accessing the annotations. RETURNS (Doc): A container for accessing the annotations.
EXAMPLE: EXAMPLE:
@ -190,7 +190,7 @@ class Language(object):
doc = self.make_doc(text) doc = self.make_doc(text)
for proc in self.pipeline: for proc in self.pipeline:
name = getattr(proc, 'name', None) name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]: if name in disable:
continue continue
proc(doc) proc(doc)
return doc return doc
@ -322,7 +322,7 @@ class Language(object):
except StopIteration: except StopIteration:
pass pass
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled): def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports """Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading. GIL-free multi-threading.
@ -330,7 +330,7 @@ class Language(object):
n_threads (int): The number of worker threads to use. If -1, OpenMP will n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2. decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer. batch_size (int): The number of texts to buffer.
**disabled: Pipeline components to exclude. disable (list): Names of the pipeline components to disable.
YIELDS (Doc): Documents in the order of the original text. YIELDS (Doc): Documents in the order of the original text.
EXAMPLE: EXAMPLE:
@ -342,7 +342,7 @@ class Language(object):
docs = texts docs = texts
for proc in self.pipeline: for proc in self.pipeline:
name = getattr(proc, 'name', None) name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]: if name in disable:
continue continue
if hasattr(proc, 'pipe'): if hasattr(proc, 'pipe'):
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size) docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
@ -352,12 +352,14 @@ class Language(object):
for doc in docs: for doc in docs:
yield doc yield doc
def to_disk(self, path, **exclude): def to_disk(self, path, disable=[]):
"""Save the current state to a directory. """Save the current state to a directory. If a model is loaded, this
will include the model.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects. it doesn't exist. Paths may be either strings or `Path`-like objects.
**exclude: Named attributes to prevent from being saved. disable (list): Nameds of pipeline components to disable and prevent
from being saved.
EXAMPLE: EXAMPLE:
>>> nlp.to_disk('/path/to/models') >>> nlp.to_disk('/path/to/models')
@ -369,7 +371,7 @@ class Language(object):
raise IOError("Output path must be a directory") raise IOError("Output path must be a directory")
props = {} props = {}
for name, value in self.__dict__.items(): for name, value in self.__dict__.items():
if name in exclude: if name in disable:
continue continue
if hasattr(value, 'to_disk'): if hasattr(value, 'to_disk'):
value.to_disk(path / name) value.to_disk(path / name)
@ -378,13 +380,14 @@ class Language(object):
with (path / 'props.pickle').open('wb') as file_: with (path / 'props.pickle').open('wb') as file_:
dill.dump(props, file_) dill.dump(props, file_)
def from_disk(self, path, **exclude): def from_disk(self, path, disable=[]):
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. returns it. If the saved `Language` object contains a model, the
model will be loaded.
path (unicode or Path): A path to a directory. Paths may be either path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects. strings or `Path`-like objects.
**exclude: Named attributes to prevent from being loaded. disable (list): Names of the pipeline components to disable.
RETURNS (Language): The modified `Language` object. RETURNS (Language): The modified `Language` object.
EXAMPLE: EXAMPLE:
@ -393,35 +396,36 @@ class Language(object):
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
for name in path.iterdir(): for name in path.iterdir():
if name not in exclude and hasattr(self, str(name)): if name not in disable and hasattr(self, str(name)):
getattr(self, name).from_disk(path / name) getattr(self, name).from_disk(path / name)
with (path / 'props.pickle').open('rb') as file_: with (path / 'props.pickle').open('rb') as file_:
bytes_data = file_.read() bytes_data = file_.read()
self.from_bytes(bytes_data, **exclude) self.from_bytes(bytes_data, disable)
return self return self
def to_bytes(self, **exclude): def to_bytes(self, disable=[]):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized. disable (list): Nameds of pipeline components to disable and prevent
from being serialized.
RETURNS (bytes): The serialized form of the `Language` object. RETURNS (bytes): The serialized form of the `Language` object.
""" """
props = dict(self.__dict__) props = dict(self.__dict__)
for key in exclude: for key in disable:
if key in props: if key in props:
props.pop(key) props.pop(key)
return dill.dumps(props, -1) return dill.dumps(props, -1)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, disable=[]):
"""Load state from a binary string. """Load state from a binary string.
bytes_data (bytes): The data to load from. bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded. disable (list): Names of the pipeline components to disable.
RETURNS (Language): The `Language` object. RETURNS (Language): The `Language` object.
""" """
props = dill.loads(bytes_data) props = dill.loads(bytes_data)
for key, value in props.items(): for key, value in props.items():
if key not in exclude: if key not in disable:
setattr(self, key, value) setattr(self, key, value)
return self return self

View File

@ -174,6 +174,7 @@ def get_async(stream, numpy_array):
array.set(numpy_array, stream=stream) array.set(numpy_array, stream=stream)
return array return array
def itershuffle(iterable, bufsize=1000): def itershuffle(iterable, bufsize=1000):
"""Shuffle an iterator. This works by holding `bufsize` items back """Shuffle an iterator. This works by holding `bufsize` items back
and yielding them sometime later. Obviously, this is not unbiased -- and yielding them sometime later. Obviously, this is not unbiased --

View File

@ -37,7 +37,8 @@ mixin svg(file, name, width, height)
size - [integer] icon width and height (default: 20) size - [integer] icon width and height (default: 20)
mixin icon(name, size) mixin icon(name, size)
+svg("icons", name, size || 20).o-icon&attributes(attributes) - var size = size || 20
+svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes)
//- Pro/Con/Neutral icon //- Pro/Con/Neutral icon
@ -185,3 +186,14 @@ mixin landing-header()
mixin landing-badge(url, graphic, alt, size) mixin landing-badge(url, graphic, alt, size)
+a(url)(aria-label=alt title=alt).c-landing__badge +a(url)(aria-label=alt title=alt).c-landing__badge
+svg("graphics", graphic, size || 225) +svg("graphics", graphic, size || 225)
//- Under construction (temporary)
Marks sections that still need to be completed for the v2.0 release.
mixin under-construction()
+infobox("🚧 Under construction")
| This section is still being written and will be updated for the v2.0
| release. Is there anything that you think should definitely mentioned or
| explained here? Any examples you'd like to see? #[strong Let us know]
| on the #[+a(gh("spacy") + "/issues") v2.0 alpha thread] on GitHub!

View File

@ -178,7 +178,7 @@ mixin label()
//- Tag //- Tag
mixin tag() mixin tag()
span.u-text-tag.u-text-tag--spaced(aria-hidden="true") span.u-text-tag.u-text-tag--spaced(aria-hidden="true")&attributes(attributes)
block block
@ -192,6 +192,17 @@ mixin tag-model(...capabs)
+help(intro + ext + ".").u-color-theme +help(intro + ext + ".").u-color-theme
//- "New" tag to label features new in a specific version
By using a separate mixin with a version ID, it becomes easy to quickly
enable/disable tags without having to modify the markup in the docs.
version - [string or integer] version number, without "v" prefix
mixin tag-new(version)
- var version = (typeof version == 'number') ? version.toFixed(1) : version
+tag(data-tooltip="This feature is new and was introduced in spaCy v#{version}.")
| v#{version}
//- List //- List
type - [string] "numbers", "letters", "roman" (bulleted list if none set) type - [string] "numbers", "letters", "roman" (bulleted list if none set)
start - [integer] start number start - [integer] start number

View File

@ -6,10 +6,18 @@ include _sidebar
main.o-main.o-main--sidebar.o-main--aside main.o-main.o-main--sidebar.o-main--aside
article.o-content article.o-content
+grid.o-no-block
+grid-col(source ? "two-thirds" : "full")
+h(1)=title +h(1)=title
if tag if tag
+tag=tag +tag=tag
if source
+grid-col("third").u-text-right
.o-inline-list
+button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)]
if ALPHA if ALPHA
+infobox("⚠️ You are viewing the spaCy v2.0 alpha docs") +infobox("⚠️ You are viewing the spaCy v2.0 alpha docs")
| This page is part of the alpha documentation for spaCy v2.0 | This page is part of the alpha documentation for spaCy v2.0

View File

@ -1,128 +1,128 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="736"> <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
<style> <style>
.text-large { fill: #1a1e23; font: 20px "Source Sans Pro" } .svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" } .svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
.text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" } .svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
.text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" } .svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
</style> </style>
<ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/> <ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text> <text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 432v242.8" stroke-dasharray="2 2"/> <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 432v242.8" stroke-dasharray="2 2"/>
<path fill="#82b366" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 680.8l-4-8 4 2 4-2z"/> <path fill="#82b366" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 680.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(324 535.5)" width="37" height="18"/> <rect fill="#fff" transform="translate(324 535.5)" width="37" height="18"/>
<text class="text-small" dy="1em" style="fill: #82b366" transform="translate(324 535.5)" width="37" height="18">MAKES</text> <text class="svg__architecture__text-small" dy="1em" style="fill: #82b366" transform="translate(324 535.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M457 434l100.5 80"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M457 434l100.5 80"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M562.3 517.6l-8.8-1.8 4-2 1-4.3z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M562.3 517.6l-8.8-1.8 4-2 1-4.3z"/>
<rect fill="#f6f6f6" transform="translate(424.5 462.5)" width="158" height="18"/> <rect fill="#f6f6f6" transform="translate(424.5 462.5)" width="158" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(424.5 462.5)" width="158" height="18">nlp.vocab.morphology</text> <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(424.5 462.5)" width="158" height="18">nlp.vocab.morphology</text>
<ellipse cx="404" cy="399" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="74.8" ry="49.8"/> <ellipse cx="404" cy="399" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" dy="1em" transform="translate(377.5 386.5)" width="51" height="22">Vocab</text> <text class="svg__architecture__text-large" dy="1em" transform="translate(377.5 386.5)" width="51" height="22">Vocab</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 253v87.8"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 253v87.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 346.8l-4-8 4 2 4-2z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 346.8l-4-8 4 2 4-2z"/>
<rect fill="#f6f6f6" transform="translate(364.5 285.5)" width="79" height="18" /> <rect fill="#f6f6f6" transform="translate(364.5 285.5)" width="79" height="18" />
<text class="text-code" dy="1em" dx="0.5em" transform="translate(364.5 285.5)" width="79" height="18">nlp.vocab</text> <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(364.5 285.5)" width="79" height="18">nlp.vocab</text>
<ellipse cx="743" cy="399" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/> <ellipse cx="743" cy="399" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" transform="translate(694.5 386.5)" dy="1em" width="95" height="22">StringStore</text> <text class="svg__architecture__text-large" transform="translate(694.5 386.5)" dy="1em" width="95" height="22">StringStore</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 399h181.8"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 399h181.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M665.8 399l-8 4 2-4-2-4z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M665.8 399l-8 4 2-4-2-4z"/>
<rect fill="#f6f6f6" transform="translate(498.5 388.5)" width="137" height="18"/> <rect fill="#f6f6f6" transform="translate(498.5 388.5)" width="137" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(498.5 388.5)" width="137" height="18">nlp.vocab.strings</text> <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(498.5 388.5)" width="137" height="18">nlp.vocab.strings</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M108 244l235.6 115.4"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M108 244l235.6 115.4"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M349 362h-9l3.6-2.6V355z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M349 362h-9l3.6-2.6V355z"/>
<rect fill="#f6f6f6" transform="translate(141.5 284.5)" width="151" height="18" /> <rect fill="#f6f6f6" transform="translate(141.5 284.5)" width="151" height="18" />
<text class="text-code" dy="1em" dx="0.5em" transform="translate(141.5 284.5)" width="151" height="18">nlp.tokenizer.vocab</text> <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(141.5 284.5)" width="151" height="18">nlp.tokenizer.vocab</text>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M38.7 159.3H104l33 43.6-32.8 43.5H38.7L6 203z"/> <path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M38.7 159.3H104l33 43.6-32.8 43.5H38.7L6 203z"/>
<text class="text-large" transform="translate(30.5 190.5)" dy="1em" width="80" height="22">Tokenizer</text> <text class="svg__architecture__text-large" transform="translate(30.5 190.5)" dy="1em" width="80" height="22">Tokenizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 203v-1H145.2"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 203v-1H145.2"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M139.2 202l8-4-2 4 2 4z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M139.2 202l8-4-2 4 2 4z"/>
<rect fill="#f6f6f6" transform="translate(188.5 191.5)" width="115" height="18"/> <rect fill="#f6f6f6" transform="translate(188.5 191.5)" width="115" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(188.5 191.5)" width="115" height="18">nlp.make_doc()</text> <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(188.5 191.5)" width="115" height="18">nlp.make_doc()</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 203h83v-4h105.8"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 203h83v-4h105.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M672.8 199l-8 4 2-4-2-4z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M672.8 199l-8 4 2-4-2-4z"/>
<rect fill="#f6f6f6" transform="translate(512.5 191.5)" width="101" height="18"/> <rect fill="#f6f6f6" transform="translate(512.5 191.5)" width="101" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(512.5 191.5)" width="101" height="18">nlp.pipeline</text> <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(512.5 191.5)" width="101" height="18">nlp.pipeline</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M709 242.8L464.4 359.4"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M709 242.8L464.4 359.4"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M459 362l5.5-7v4.4l3.5 2.8z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M459 362l5.5-7v4.4l3.5 2.8z"/>
<rect fill="#f6f6f6" transform="translate(505.5 297.5)" width="166" height="18"/> <rect fill="#f6f6f6" transform="translate(505.5 297.5)" width="166" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(505.5 297.5)" width="166" height="18">nlp.pipeline[i].vocab</text> <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(505.5 297.5)" width="166" height="18">nlp.pipeline[i].vocab</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M275.3 34.6L288.6 1h54L329 34.6z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M275.3 34.6L288.6 1h54L329 34.6z"/>
<text class="text-small" dy="0.85em" transform="translate(301.5 9.5)" width="12" height="14">pt</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(301.5 9.5)" width="12" height="14">pt</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M60.8 34.6L74.3 1h54l-13.6 33.6z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M60.8 34.6L74.3 1h54l-13.6 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(86.5 9.5)" width="14" height="14">en</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(86.5 9.5)" width="14" height="14">en</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M114.4 34.6L128 1h53.8l-13.5 33.6z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M114.4 34.6L128 1h53.8l-13.5 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(140.5 9.5)" width="14" height="14">de</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(140.5 9.5)" width="14" height="14">de</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M168 34.6L181.5 1h54l-13.6 33.6z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M168 34.6L181.5 1h54l-13.6 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(196.5 9.5)" width="8" height="14">fr</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(196.5 9.5)" width="8" height="14">fr</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M221.6 34.6L235 1h54l-13.5 33.6z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M221.6 34.6L235 1h54l-13.5 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(248.5 9.5)" width="12" height="14">es</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(248.5 9.5)" width="12" height="14">es</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M47 68.3l13.6-33.6h53.8L101 68.3z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M47 68.3l13.6-33.6h53.8L101 68.3z"/>
<text class="text-small" dy="0.85em" transform="translate(75.5 43.5)" width="8" height="14">it</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(75.5 43.5)" width="8" height="14">it</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M100.7 68.3l13.5-33.6H168l-13.4 33.6z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M100.7 68.3l13.5-33.6H168l-13.4 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(127.5 43.5)" width="12" height="14">nl</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(127.5 43.5)" width="12" height="14">nl</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M154.3 68.3l13.5-33.6h53.8l-13.4 33.6z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M154.3 68.3l13.5-33.6h53.8l-13.4 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(180.5 43.5)" width="12" height="14">sv</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(180.5 43.5)" width="12" height="14">sv</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M208 68.3l13.4-33.6h53.8L262 68.3z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M208 68.3l13.4-33.6h53.8L262 68.3z"/>
<text class="text-small" dy="0.85em" transform="translate(236.5 43.5)" width="8" height="14">fi</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(236.5 43.5)" width="8" height="14">fi</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M261.5 68.3L275 34.7h54l-13.6 33.6z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M261.5 68.3L275 34.7h54l-13.6 33.6z"/>
<text class="text-small" dy="0.85em" transform="translate(286.5 43.5)" width="16" height="14">nb</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(286.5 43.5)" width="16" height="14">nb</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M33.4 102L47 68.2h53.7L87.3 102z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M33.4 102L47 68.2h53.7L87.3 102z"/>
<text class="text-small" dy="0.85em" transform="translate(58.5 77.5)" width="16" height="14">hu</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(58.5 77.5)" width="16" height="14">hu</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M87 102l13.5-33.7h53.8L141 102z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M87 102l13.5-33.7h53.8L141 102z"/>
<text class="text-small" dy="0.85em" transform="translate(112.5 77.5)" width="14" height="14">he</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(112.5 77.5)" width="14" height="14">he</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M140.6 102L154 68.2h54L194.4 102z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M140.6 102L154 68.2h54L194.4 102z"/>
<text class="text-small" dy="0.85em" transform="translate(165.5 77.5)" width="16" height="14">bn</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(165.5 77.5)" width="16" height="14">bn</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M194.2 102l13.5-33.7h53.8L248 102z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M194.2 102l13.5-33.7h53.8L248 102z"/>
<text transform="translate(221.5 77.5)" class="text-small" dy="0.85em" width="10" height="14">ja</text> <text transform="translate(221.5 77.5)" class="svg__architecture__text-small" dy="0.85em" width="10" height="14">ja</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M247.8 102l13.5-33.7H315L301.8 102z"/> <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M247.8 102l13.5-33.7H315L301.8 102z"/>
<text class="text-small" dy="0.85em" transform="translate(273.5 77.5)" width="14" height="14">zh</text> <text class="svg__architecture__text-small" dy="0.85em" transform="translate(273.5 77.5)" width="14" height="14">zh</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 51h75v93.8"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 51h75v93.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 150.8l-4-8 4 2 4-2z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 150.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M213 480l109.3-76.3"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M213 480l109.3-76.3"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 400.3L323 408l-.8-4.3-4-2z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 400.3L323 408l-.8-4.3-4-2z"/>
<rect fill="#f6f6f6" transform="translate(226.5 431.5)" width="79" height="18"/> <rect fill="#f6f6f6" transform="translate(226.5 431.5)" width="79" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(226.5 431.5)" width="79" height="18">doc.vocab</text> <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(226.5 431.5)" width="79" height="18">doc.vocab</text>
<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M39.6 555.5l.4 121.3" stroke-dasharray="2 2"/> <path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M39.6 555.5l.4 121.3" stroke-dasharray="2 2"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M40 682.8l-4-8 4 2 4-2z"/> <path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M40 682.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(23.5 604.5)" width="37" height="18"/> <rect fill="#fff" transform="translate(23.5 604.5)" width="37" height="18"/>
<text class="text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(23.5 604.5)" width="37" height="18">MAKES</text> <text class="svg__architecture__text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(23.5 604.5)" width="37" height="18">MAKES</text>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M1 479.5h283v74.8H1z"/> <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M1 479.5h283v74.8H1z"/>
<text class="text-large" dy="1em" transform="translate(125.5 504.5)" width="32" height="22">Doc</text> <text class="svg__architecture__text-large" dy="1em" transform="translate(125.5 504.5)" width="32" height="22">Doc</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 246v117h1v108.8" stroke-dasharray="2 2"/> <path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 246v117h1v108.8" stroke-dasharray="2 2"/>
<path fill="#c00" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M72 477.8l-4-8 4 2 4-2z"/> <path fill="#c00" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M72 477.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(54.5 355.5)" width="37" height="18"/> <rect fill="#fff" transform="translate(54.5 355.5)" width="37" height="18"/>
<text class="text-small" style="fill: #cc0000" dy="1em" dx="-0.5em" transform="translate(54.5 355.5)" width="37" height="18">MAKES</text> <text class="svg__architecture__text-small" style="fill: #cc0000" dy="1em" dx="-0.5em" transform="translate(54.5 355.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104 685l.4-121.2"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104 685l.4-121.2"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104.5 557.8l4 8-4-2-4 2z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104.5 557.8l4 8-4-2-4 2z"/>
<rect fill="#f6f6f6" transform="translate(62.5 632.5)" width="79" height="18"/> <rect fill="#f6f6f6" transform="translate(62.5 632.5)" width="79" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(62.5 632.5)" width="79" height="18">token.doc</text> <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(62.5 632.5)" width="79" height="18">token.doc</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M7.2 685h129.6v50H7.2z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M7.2 685h129.6v50H7.2z"/>
<text class="text-medium" dy="1em" transform="translate(49.5 700.5)" width="43" height="18">Token</text> <text class="svg__architecture__text-medium" dy="1em" transform="translate(49.5 700.5)" width="43" height="18">Token</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M148 685h129.7v50H148z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M148 685h129.7v50H148z"/>
<text class="text-medium" dy="1em" transform="translate(193.5 700.5)" width="37" height="18">Span</text> <text class="svg__architecture__text-medium" dy="1em" transform="translate(193.5 700.5)" width="37" height="18">Span</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 686V456.6"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 686V456.6"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 450.6l4 8-4-2-4 2z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 450.6l4 8-4-2-4 2z"/>
<rect fill="#f6f6f6" transform="translate(356.5 584.5)" width="101" height="18"/> <rect fill="#f6f6f6" transform="translate(356.5 584.5)" width="101" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(356.5 584.5)" width="101" height="18">lexeme.vocab</text> <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(356.5 584.5)" width="101" height="18">lexeme.vocab</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M296.7 685h155.8v50H296.7z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M296.7 685h155.8v50H296.7z"/>
<text class="text-medium" dy="1em" transform="translate(346.5 700.5)" width="55" height="18">Lexeme</text> <text class="svg__architecture__text-medium" dy="1em" transform="translate(346.5 700.5)" width="55" height="18">Lexeme</text>
<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M180.5 559.3l.5 117.5" stroke-dasharray="2 2"/> <path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M180.5 559.3l.5 117.5" stroke-dasharray="2 2"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M181 682.8l-4-8 4 2 4-2z"/> <path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M181 682.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(164.5 606.5)" width="37" height="18" /> <rect fill="#fff" transform="translate(164.5 606.5)" width="37" height="18" />
<text class="text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(164.5 606.5)" width="37" height="18">MAKES</text> <text class="svg__architecture__text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(164.5 606.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 685V564"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 685V564"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 557.8l4 8-4-2-4 2z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 557.8l4 8-4-2-4 2z"/>
<rect fill="#f6f6f6" transform="translate(211.5 633.5)" width="72" height="18"/> <rect fill="#f6f6f6" transform="translate(211.5 633.5)" width="72" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(211.5 633.5)" width="72" height="18">span.doc</text> <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(211.5 633.5)" width="72" height="18">span.doc</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 112H872l32.8 43.5L872 199h-65.4L774 155.6z"/> <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 112H872l32.8 43.5L872 199h-65.4L774 155.6z"/>
<text class="text-medium" dy="1em" transform="translate(794.5 135.5)" width="88" height="38">Dependency <tspan dy="1.25em" dx="-4.1em">Parser</tspan></text> <text class="svg__architecture__text-medium" dy="1em" transform="translate(794.5 135.5)" width="88" height="38">Dependency <tspan dy="1.25em" dx="-4.1em">Parser</tspan></text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 199H872l32.8 43.8-32.8 43.6h-65.4L774 242.8z"/> <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 199H872l32.8 43.8-32.8 43.6h-65.4L774 242.8z"/>
<text class="text-medium" dy="1em" dx="1.1em" transform="translate(799.5 222.5)" width="78" height="38">Entity <tspan dy="1.25em" dx="-3.75em">Recognizer</tspan></text> <text class="svg__architecture__text-medium" dy="1em" dx="1.1em" transform="translate(799.5 222.5)" width="78" height="38">Entity <tspan dy="1.25em" dx="-3.75em">Recognizer</tspan></text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708 155.5h65.6l32.7 43.6-32.7 43.8H708L675.5 199z"/> <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708 155.5h65.6l32.7 43.6-32.7 43.8H708L675.5 199z"/>
<text class="text-medium" dy="1em" transform="translate(715.5 189.5)" width="48" height="18">Tagger</text> <text class="svg__architecture__text-medium" dy="1em" transform="translate(715.5 189.5)" width="48" height="18">Tagger</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.8 24.5h65.5L905 68 872.3 112h-65.5L774 68z"/> <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.8 24.5h65.5L905 68 872.3 112h-65.5L774 68z"/>
<text class="text-medium" dy="1em" transform="translate(809.5 58.5)" width="58" height="18">Matcher</text> <text class="svg__architecture__text-medium" dy="1em" transform="translate(809.5 58.5)" width="58" height="18">Matcher</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708.6 68H774l32.8 43.5L774 155h-65.4L676 111.6z"/> <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708.6 68H774l32.8 43.5L774 155h-65.4L676 111.6z"/>
<text class="text-medium" dy="1em" transform="translate(698.5 101.5)" width="84" height="18">Lemmatizer</text> <text class="svg__architecture__text-medium" dy="1em" transform="translate(698.5 101.5)" width="84" height="18">Lemmatizer</text>
<ellipse cx="617" cy="555" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/> <ellipse cx="617" cy="555" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" dy="1em" transform="translate(565.5 542.5)" width="101" height="22">Morphology</text> <text class="svg__architecture__text-large" dy="1em" transform="translate(565.5 542.5)" width="101" height="22">Morphology</text>
</svg> </svg>

Before

Width:  |  Height:  |  Size: 14 KiB

After

Width:  |  Height:  |  Size: 14 KiB

View File

@ -1,13 +1,13 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet"> <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
<style> <style>
.text-large { fill: #1a1e23; font: 20px "Source Sans Pro" } .svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" } .svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
.text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" } .svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
</style> </style>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/> <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/> <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M642.7 361.3H708l33 43.6-33 43.5H643L610 405z"/> <path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M642.7 361.3H708l33 43.6-33 43.5H643L610 405z"/>
<text class="text-large" transform="translate(634.5 410)" width="80" height="22">Tokenizer</text> <text class="svg__langdata__text-large" transform="translate(634.5 410)" width="80" height="22">Tokenizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H621v-56.8"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H621v-56.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M621 240.2l4 8-4-2-4 2z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M621 240.2l4 8-4-2-4 2z"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M855 253v-20.8"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M855 253v-20.8"/>
@ -17,7 +17,7 @@
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H504v-56.8"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H504v-56.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M504 240.2l4 8-4-2-4 2z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M504 240.2l4 8-4-2-4 2z"/>
<ellipse cx="855" cy="303" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/> <ellipse cx="855" cy="303" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" transform="translate(815 308)" width="119" height="46">Base data</text> <text class="svg__langdata__text-large" transform="translate(815 308)" width="119" height="46">Base data</text>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100l.4 39.8"/> <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100l.4 39.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389.5 145.8l-4-8 4 2 4-2z"/> <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389.5 145.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17h232v22.8"/> <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17h232v22.8"/>
@ -33,50 +33,50 @@
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17H46v22.8"/> <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17H46v22.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M46 145.8l-4-8 4 2 4-2z"/> <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M46 145.8l-4-8 4 2 4-2z"/>
<ellipse cx="389" cy="50" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/> <ellipse cx="389" cy="50" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" transform="translate(350 42)" width="81" height="46">Language <tspan dy="1.45em" dx="-3.25em">data</tspan></text> <text class="svg__langdata__text-large" transform="translate(350 42)" width="81" height="46">Language <tspan dy="1.45em" dx="-3.25em">data</tspan></text>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M435 193h15.8"/> <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M435 193h15.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M456.8 193l-8 4 2-4-2-4z"/> <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M456.8 193l-8 4 2-4-2-4z"/>
<ellipse cx="390" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/> <ellipse cx="390" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(375 187.5)" width="39" height="30">stop <tspan dx="-2.5em" dy="1.25em">words</tspan></text> <text class="svg__langdata__text-small" transform="translate(375 187.5)" width="39" height="30">stop <tspan dx="-2.5em" dy="1.25em">words</tspan></text>
<path fill="none" stroke="#9673a6" stroke-width="3" stroke-miterlimit="10" d="M472 225l-1.5 133.8" stroke-dasharray="1 6" stroke-linecap="round"/> <path fill="none" stroke="#9673a6" stroke-width="3" stroke-miterlimit="10" d="M472 225l-1.5 133.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M470.4 364.8l-4-8 4 2 4-2z"/> <path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M470.4 364.8l-4-8 4 2 4-2z"/>
<ellipse cx="504" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/> <ellipse cx="504" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(481 187.5)" width="85" height="30">lexical <tspan dx="-3.75em" dy="1.25em">attributes</tspan></text> <text class="svg__langdata__text-small" transform="translate(481 187.5)" width="85" height="30">lexical <tspan dx="-3.75em" dy="1.25em">attributes</tspan></text>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M653 225l5.6 127.8" stroke-dasharray="1 6" stroke-linecap="round"/> <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M653 225l5.6 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M659 358.8l-4.5-8 4 2 4-2.2z"/> <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M659 358.8l-4.5-8 4 2 4-2.2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M576 193h-18.8"/> <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M576 193h-18.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M551.2 193l8-4-2 4 2 4z"/> <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M551.2 193l8-4-2 4 2 4z"/>
<ellipse cx="621" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/> <ellipse cx="621" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(588 187.5)" width="85" height="30">tokenizer <tspan dx="-4.6em" dy="1.25em">exceptions</tspan></text> <text class="svg__langdata__text-small" transform="translate(588 187.5)" width="85" height="30">tokenizer <tspan dx="-4.6em" dy="1.25em">exceptions</tspan></text>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M690 193h-15.8"/> <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M690 193h-15.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M668.2 193l8-4-2 4 2 4z"/> <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M668.2 193l8-4-2 4 2 4z"/>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M703 225l-10.3 127.8" stroke-dasharray="1 6" stroke-linecap="round"/> <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M703 225l-10.3 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M692.2 358.8l-3.4-8.3 4 2.3 4-1.7z"/> <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M692.2 358.8l-3.4-8.3 4 2.3 4-1.7z"/>
<ellipse cx="735" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/> <ellipse cx="735" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(705 182)" width="53" height="46">prefixes, <tspan dy="1.25em" dx="-3.9em">suffixes,</tspan> <tspan dy="1.25em" dx="-3.6em">infixes</tspan> <text class="svg__langdata__text-small" transform="translate(705 182)" width="53" height="46">prefixes, <tspan dy="1.25em" dx="-3.9em">suffixes,</tspan> <tspan dy="1.25em" dx="-3.6em">infixes</tspan>
</text> </text>
<path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M280 238v114.8" stroke-dasharray="1 6" stroke-linecap="round"/> <path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M280 238v114.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M280 358.8l-4-8 4 2 4-2z"/> <path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M280 358.8l-4-8 4 2 4-2z"/>
<ellipse cx="280" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/> <ellipse cx="280" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(254 187.5)" width="71" height="30">lemma <tspan dy="1.25em" dx="-2.7em">data</tspan></text> <text class="svg__langdata__text-small" transform="translate(254 187.5)" width="71" height="30">lemma <tspan dy="1.25em" dx="-2.7em">data</tspan></text>
<path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M346 404h53.8" stroke-dasharray="1 6" stroke-linecap="round"/> <path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M346 404h53.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M405.8 404l-8 4 2-4-2-4z"/> <path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M405.8 404l-8 4 2-4-2-4z"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M247.7 361.3H313l33 43.6-33 43.5h-65.3L215 405z"/> <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M247.7 361.3H313l33 43.6-33 43.5h-65.3L215 405z"/>
<text class="text-large" transform="translate(232 410)" width="100" height="22">Lemmatizer</text> <text class="svg__langdata__text-large" transform="translate(232 410)" width="100" height="22">Lemmatizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M823 193h-34.8"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M823 193h-34.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M782.2 193l8-4-2 4 2 4z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M782.2 193l8-4-2 4 2 4z"/>
<ellipse cx="855" cy="193" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="31.5" ry="31.5"/> <ellipse cx="855" cy="193" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="31.5" ry="31.5"/>
<text class="text-tiny" transform="translate(838 189)" width="50" height="30">char <tspan dy="1.1em" dx="-2.75em">classes</tspan></text> <text class="svg__langdata__text-tiny" transform="translate(838 189)" width="50" height="30">char <tspan dy="1.1em" dx="-2.75em">classes</tspan></text>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M408 367h124v74H408z"/> <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M408 367h124v74H408z"/>
<text class="text-large" transform="translate(443.5 410)" width="51" height="22">Token</text> <text class="svg__langdata__text-large" transform="translate(443.5 410)" width="51" height="22">Token</text>
<path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M131 225l-21 122.2" stroke-dasharray="1 6" stroke-linecap="round"/> <path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M131 225l-21 122.2" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M109 353l-2.5-8.5 3.6 2.7 4.4-1.3z"/> <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M109 353l-2.5-8.5 3.6 2.7 4.4-1.3z"/>
<ellipse cx="163" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/> <ellipse cx="163" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(139 187.5)" width="45" height="30">morph <tspan dy="1.25em" dx="-2.8em">rules</tspan></text> <text class="svg__langdata__text-small" transform="translate(139 187.5)" width="45" height="30">morph <tspan dy="1.25em" dx="-2.8em">rules</tspan></text>
<path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M78 225l15.4 122" stroke-dasharray="1 6" stroke-linecap="round"/> <path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M78 225l15.4 122" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M94.2 353l-5-7.5 4.2 1.5 3.7-2.5z"/> <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M94.2 353l-5-7.5 4.2 1.5 3.7-2.5z"/>
<ellipse cx="46" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/> <ellipse cx="46" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(33 187.5)" width="27" height="30">tag <tspan dy="1.25em" dx="-1.8em">map</tspan></text> <text class="svg__langdata__text-small" transform="translate(33 187.5)" width="27" height="30">tag <tspan dy="1.25em" dx="-1.8em">map</tspan></text>
<ellipse cx="101" cy="405" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.5" ry="49.5"/> <ellipse cx="101" cy="405" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.5" ry="49.5"/>
<text class="text-large" transform="translate(49.5 410)" width="100" height="22">Morphology</text> <text class="svg__langdata__text-large" transform="translate(49.5 410)" width="100" height="22">Morphology</text>
</svg> </svg>

Before

Width:  |  Height:  |  Size: 8.8 KiB

After

Width:  |  Height:  |  Size: 9.0 KiB

View File

@ -1,30 +1,30 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200"> <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
<style> <style>
.text { fill: #1a1e23; font: 20px "Source Sans Pro" } .svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro" }
.text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" } .svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
.text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" } .svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
</style> </style>
<rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/> <rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/> <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
<text class="text" dy="0.75em" width="28" height="19" transform="translate(846.5 75.5)">Doc</text> <text class="svg__pipeline__text" dy="0.75em" width="28" height="19" transform="translate(846.5 75.5)">Doc</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M121.2 84.7h29.4"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M121.2 84.7h29.4"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M156.6 84.7l-8 4 2-4-2-4z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M156.6 84.7l-8 4 2-4-2-4z"/>
<path fill="#f5f5f5" stroke="#999" stroke-width="2" d="M1 55h120v60H1z"/> <path fill="#f5f5f5" stroke="#999" stroke-width="2" d="M1 55h120v60H1z"/>
<text class="text" dy="0.85em" width="34" height="22" transform="translate(43.5 73.5)">Text</text> <text class="svg__pipeline__text" dy="0.85em" width="34" height="22" transform="translate(43.5 73.5)">Text</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M760 84.7h33"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M760 84.7h33"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M799 84.7l-8 4 2-4-2-4z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M799 84.7l-8 4 2-4-2-4z"/>
<rect width="75" height="39" x="422" y="1" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="5.8" ry="5.8"/> <rect width="75" height="39" x="422" y="1" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="5.8" ry="5.8"/>
<text class="text-code" dy="0.8em" dx="0.1em" width="29" height="17" transform="translate(444.5 11.5)">nlp</text> <text class="svg__pipeline__text-code" dy="0.8em" dx="0.1em" width="29" height="17" transform="translate(444.5 11.5)">nlp</text>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M176 58h103.3L296 88l-16.8 30H176l16.8-30z"/> <path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M176 58h103.3L296 88l-16.8 30H176l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text> <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M314 58h103.3L434 88l-16.8 30H314l16.8-30z"/> <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M314 58h103.3L434 88l-16.8 30H314l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="62" height="14" transform="translate(342.5 80.5)">vectorizer</text> <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="62" height="14" transform="translate(342.5 80.5)">vectorizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M296.5 88.2h24.7"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M296.5 88.2h24.7"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 88.2l-8 4 2-4-2-4z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 88.2l-8 4 2-4-2-4z"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M416 58h103.3L536 88l-16.8 30H416l16.8-30z"/> <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M416 58h103.3L536 88l-16.8 30H416l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(455.5 80.5)">tagger</text> <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(455.5 80.5)">tagger</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M519 58h103.3L639 88l-16.8 30H519l16.8-30z"/> <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M519 58h103.3L639 88l-16.8 30H519l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(558.5 80.5)">parser</text> <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(558.5 80.5)">parser</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M622 58h103.3L742 88l-16.8 30H622l16.8-30z"/> <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M622 58h103.3L742 88l-16.8 30H622l16.8-30z"/>
<text class="text-small" dy="0.75em" dx="-0.25em" width="20" height="14" transform="translate(671.5 80.5)">ner</text> <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="20" height="14" transform="translate(671.5 80.5)">ner</text>
</svg> </svg>

Before

Width:  |  Height:  |  Size: 2.9 KiB

After

Width:  |  Height:  |  Size: 3.1 KiB

View File

@ -0,0 +1,77 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
<style>
.svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro" }
.svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro"; text-transform: uppercase }
.svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro" }
.svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
</style>
<rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
<text class="svg__vocab__text" dy="1em" transform="translate(477.5 174.5)" width="31" height="17">3572</text>
<rect width="52" height="20" x="468" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(471.5 155.5)">Lexeme</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 164h100v40H76z"/>
<text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(113.5 174.5)">508</text>
<rect width="52" height="20" x="100" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(103.5 155.5)">Lexeme</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 164h100v40H263z"/>
<text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(300.5 174.5)">949</text>
<rect width="52" height="20" x="287" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(290.5 155.5)">Lexeme</text>
<rect width="570" height="88" x="1" y="246" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="13.2" ry="13.2"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 275h100v40H444z"/>
<text class="svg__vocab__text" dy="1em" width="55" height="17" transform="translate(465.5 285.5)">&quot;coffee&quot;</text>
<rect width="52" height="20" x="468" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="28" height="12" transform="translate(479.5 266.5)">3672</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 275h100v40H76z"/>
<text class="svg__vocab__text" dy="1em" width="17" height="17" transform="translate(116.5 285.5)">&quot;I&quot;</text>
<rect width="52" height="20" x="100" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(114.5 266.5)">508</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 275h100v40H263z"/>
<text class="svg__vocab__text" dy="1em" width="41" height="17" transform="translate(291.5 285.5)">&quot;love&quot;</text>
<rect width="52" height="20" x="287" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(301.5 266.5)">949</text>
<rect width="570" height="110" x="1" y="1" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" rx="16.5" ry="16.5"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M263 60h-78.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M178.2 60l8-4-2 4 2 4z"/>
<rect fill="#E1D5E7" width="50" height="12" transform="translate(202.5 53.5)"/>
<text class="svg__vocab__text-code" dx="0.5em" dy="1em" width="50" height="12" transform="translate(202.5 53.5)">nsubj</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M363 60h72.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M441.8 60l-8 4 2-4-2-4z"/>
<rect fill="#E1D5E7" width="43" height="12" transform="translate(375.5 54.5)"/>
<text class="svg__vocab__text-code" dx="0.5em" dy="1em" width="43" height="12" transform="translate(375.5 54.5)">dobj</text>
<rect width="50" height="88" x="1" y="246" fill="#666" stroke="#666" stroke-width="2" rx="7.5" ry="7.5"/>
<text class="svg__vocab__text-large" dx="-0.25em" dy="1em" width="53" height="36" transform="rotate(-90 162 155)">String</text>
<text class="svg__vocab__text-large" dy="2em" width="53" height="36" transform="rotate(-90 162 155)">Store</text>
<rect width="50" height="88" x="1" y="135" fill="#82b366" stroke="#82b366" stroke-width="2" rx="7.5" ry="7.5"/>
<text class="svg__vocab__text-large" dx="-0.25em" dy="0.9em" width="47" height="17" transform="rotate(-90 109.5 93)">Vocab</text>
<rect width="50" height="110" x="1" y="1" fill="#9673a6" stroke="#9673a6" stroke-width="2" rx="7.5" ry="7.5"/>
<text class="svg__vocab__text-large" dx="-0.25em" dy="0.9em" width="31" height="17" transform="rotate(-90 44 27.5)">Doc</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 27h100v66H263z"/>
<text class="svg__vocab__text" dy="1em" width="31" height="33" transform="translate(296.5 42.5)">love</text>
<text class="svg__vocab__text-code" dy="2.8em" width="31" height="33" transform="translate(296.5 42.5)">VERB</text>
<rect width="50" height="20" x="288" y="16" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" transform="translate(294.5 19.5)">Token</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 27h100v66H76z"/>
<text class="svg__vocab__text" dx="0.8em" dy="1em" width="29" height="33" transform="translate(110.5 42.5)">I</text>
<text class="svg__vocab__text-code" dy="2.8em" width="29" height="33" transform="translate(110.5 42.5)">PRON</text>
<rect width="50" height="20" x="105" y="17" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="36" height="12" transform="translate(111.5 20.5)">Token</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 27h100v66H444z"/>
<text class="svg__vocab__text" dy="1em" width="45" height="33" transform="translate(470.5 42.5)">coffee</text>
<text class="svg__vocab__text-code" dx="0.6em" dy="2.8em" width="45" height="33" transform="translate(470.5 42.5)">NOUN</text>
<rect width="50" height="20" x="469" y="16" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="36" height="12" transform="translate(475.5 19.5)">Token</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 141.8v-38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 149.8l-2.7-8h5.4zM126 95.2l2.7 8h-5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 214.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 206.2l2.7 8h-5.4zM126 260.8l-2.7-8h5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 103.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 95.2l2.7 8h-5.4zM313 149.8l-2.7-8h5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 214.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 206.2l2.7 8h-5.4zM313 260.8l-2.7-8h5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 214.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 206.2l2.7 8h-5.4zM494 260.8l-2.7-8h5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 103.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 95.2l2.7 8h-5.4zM494 149.8l-2.7-8h5.4z"/>
</svg>

After

Width:  |  Height:  |  Size: 7.6 KiB

View File

@ -24,7 +24,8 @@
"Vocab": "vocab", "Vocab": "vocab",
"StringStore": "stringstore", "StringStore": "stringstore",
"GoldParse": "goldparse", "GoldParse": "goldparse",
"GoldCorpus": "goldcorpus" "GoldCorpus": "goldcorpus",
"Binder": "binder"
}, },
"Other": { "Other": {
"Annotation Specs": "annotation" "Annotation Specs": "annotation"
@ -47,62 +48,74 @@
"spacy": { "spacy": {
"title": "spaCy top-level functions", "title": "spaCy top-level functions",
"source": "spacy/__init__.py",
"next": "displacy" "next": "displacy"
}, },
"displacy": { "displacy": {
"title": "displaCy", "title": "displaCy",
"tag": "module", "tag": "module",
"source": "spacy/displacy",
"next": "util" "next": "util"
}, },
"util": { "util": {
"title": "Utility Functions", "title": "Utility Functions",
"source": "spacy/util.py",
"next": "cli" "next": "cli"
}, },
"cli": { "cli": {
"title": "Command Line Interface" "title": "Command Line Interface",
"source": "spacy/cli"
}, },
"language": { "language": {
"title": "Language", "title": "Language",
"tag": "class" "tag": "class",
"source": "spacy/language.py"
}, },
"doc": { "doc": {
"title": "Doc", "title": "Doc",
"tag": "class" "tag": "class",
"source": "spacy/tokens/doc.pyx"
}, },
"token": { "token": {
"title": "Token", "title": "Token",
"tag": "class" "tag": "class",
"source": "spacy/tokens/token.pyx"
}, },
"span": { "span": {
"title": "Span", "title": "Span",
"tag": "class" "tag": "class",
"source": "spacy/tokens/span.pyx"
}, },
"lexeme": { "lexeme": {
"title": "Lexeme", "title": "Lexeme",
"tag": "class" "tag": "class",
"source": "spacy/lexeme.pyx"
}, },
"vocab": { "vocab": {
"title": "Vocab", "title": "Vocab",
"tag": "class" "tag": "class",
"source": "spacy/vocab.pyx"
}, },
"stringstore": { "stringstore": {
"title": "StringStore", "title": "StringStore",
"tag": "class" "tag": "class",
"source": "spacy/strings.pyx"
}, },
"matcher": { "matcher": {
"title": "Matcher", "title": "Matcher",
"tag": "class" "tag": "class",
"source": "spacy/matcher.pyx"
}, },
"dependenyparser": { "dependenyparser": {
@ -122,7 +135,8 @@
"tokenizer": { "tokenizer": {
"title": "Tokenizer", "title": "Tokenizer",
"tag": "class" "tag": "class",
"source": "spacy/tokenizer.pyx"
}, },
"tagger": { "tagger": {
@ -132,11 +146,18 @@
"goldparse": { "goldparse": {
"title": "GoldParse", "title": "GoldParse",
"tag": "class" "tag": "class",
"source": "spacy/gold.pyx"
}, },
"goldcorpus": { "goldcorpus": {
"title": "GoldCorpus", "title": "GoldCorpus",
"tag": "class",
"source": "spacy/gold.pyx"
},
"binder": {
"title": "Binder",
"tag": "class" "tag": "class"
}, },

View File

@ -0,0 +1,5 @@
//- 💫 DOCS > API > BINDER
include ../../_includes/_mixins
+under-construction

View File

@ -166,7 +166,7 @@ p
| #[+a("/docs/api/annotation#json-input") JSON format]. | #[+a("/docs/api/annotation#json-input") JSON format].
+code(false, "bash"). +code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner] python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
+row +row
@ -192,18 +192,13 @@ p
+row +row
+cell #[code --n-iter], #[code -n] +cell #[code --n-iter], #[code -n]
+cell option +cell option
+cell Number of iterations (default: #[code 15]). +cell Number of iterations (default: #[code 20]).
+row +row
+cell #[code --n_sents], #[code -ns] +cell #[code --n-sents], #[code -ns]
+cell option +cell option
+cell Number of sentences (default: #[code 0]). +cell Number of sentences (default: #[code 0]).
+row
+cell #[code --parser-L1], #[code -L]
+cell option
+cell L1 regularization penalty for parser (default: #[code 0.0]).
+row +row
+cell #[code --use-gpu], #[code -G] +cell #[code --use-gpu], #[code -G]
+cell flag +cell flag
@ -220,7 +215,7 @@ p
+cell Don't train parser. +cell Don't train parser.
+row +row
+cell #[code --no-ner], #[code -N] +cell #[code --no-entities], #[code -N]
+cell flag +cell flag
+cell Don't train NER. +cell Don't train NER.
@ -229,6 +224,106 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+h(3, "train-hyperparams") Environment variables for hyperparameters
p
| spaCy lets you set hyperparameters for training via environment variables.
| This is useful, because it keeps the command simple and allows you to
| #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") create an alias]
| for your custom #[code train] command while still being able to easily
| tweak the hyperparameters. For example:
+code(false, "bash").
parser_hidden_depth=2 parser_maxout_pieces=1 train-parser
+under-construction
+table(["Name", "Description", "Default"])
+row
+cell #[code dropout_from]
+cell
+cell #[code 0.2]
+row
+cell #[code dropout_to]
+cell
+cell #[code 0.2]
+row
+cell #[code dropout_decay]
+cell
+cell #[code 0.0]
+row
+cell #[code batch_from]
+cell
+cell #[code 1]
+row
+cell #[code batch_to]
+cell
+cell #[code 64]
+row
+cell #[code batch_compound]
+cell
+cell #[code 1.001]
+row
+cell #[code token_vector_width]
+cell
+cell #[code 128]
+row
+cell #[code embed_size]
+cell
+cell #[code 7500]
+row
+cell #[code parser_maxout_pieces]
+cell
+cell #[code 2]
+row
+cell #[code parser_hidden_depth]
+cell
+cell #[code 1]
+row
+cell #[code hidden_width]
+cell
+cell #[code 128]
+row
+cell #[code learn_rate]
+cell
+cell #[code 0.001]
+row
+cell #[code optimizer_B1]
+cell
+cell #[code 0.9]
+row
+cell #[code optimizer_B2]
+cell
+cell #[code 0.999]
+row
+cell #[code optimizer_eps]
+cell
+cell #[code 1e-08]
+row
+cell #[code L2_penalty]
+cell
+cell #[code 1e-06]
+row
+cell #[code grad_norm_clip]
+cell
+cell #[code 1.0]
+h(2, "package") Package +h(2, "package") Package
p p

View File

@ -10,6 +10,7 @@ p
+h(2, "serve") displacy.serve +h(2, "serve") displacy.serve
+tag method +tag method
+tag-new(2)
p p
| Serve a dependency parse tree or named entity visualization to view it | Serve a dependency parse tree or named entity visualization to view it
@ -71,6 +72,7 @@ p
+h(2, "render") displacy.render +h(2, "render") displacy.render
+tag method +tag method
+tag-new(2)
p Render a dependency parse tree or named entity visualization. p Render a dependency parse tree or named entity visualization.

View File

@ -255,6 +255,7 @@ p
+h(2, "to_disk") Doc.to_disk +h(2, "to_disk") Doc.to_disk
+tag method +tag method
+tag-new(2)
p Save the current state to a directory. p Save the current state to a directory.
@ -271,12 +272,14 @@ p Save the current state to a directory.
+h(2, "from_disk") Doc.from_disk +h(2, "from_disk") Doc.from_disk
+tag method +tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it. p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example"). +aside-code("Example").
from spacy.tokens import Doc from spacy.tokens import Doc
doc = Doc().from_disk('/path/to/doc') from spacy.vocab import Vocab
doc = Doc(Vocab()).from_disk('/path/to/doc')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row

View File

@ -8,6 +8,7 @@ p
+h(2, "init") GoldCorpus.__init__ +h(2, "init") GoldCorpus.__init__
+tag method +tag method
+tag-new(2)
p Create a #[code GoldCorpus]. p Create a #[code GoldCorpus].

View File

@ -73,15 +73,26 @@ p
+cell The text to be processed. +cell The text to be processed.
+row +row
+cell #[code **disabled] +cell #[code disable]
+cell - +cell list
+cell Elements of the pipeline that should not be run. +cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow +footrow
+cell returns +cell returns
+cell #[code Doc] +cell #[code Doc]
+cell A container for accessing the annotations. +cell A container for accessing the annotations.
+infobox("⚠️ Deprecation note")
.o-block
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old doc = nlp(u"I don't want parsed", parse=False)
+h(2, "pipe") Language.pipe +h(2, "pipe") Language.pipe
+tag method +tag method
@ -112,6 +123,13 @@ p
+cell int +cell int
+cell The number of texts to buffer. +cell The number of texts to buffer.
+row
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow +footrow
+cell yields +cell yields
+cell #[code Doc] +cell #[code Doc]
@ -227,8 +245,11 @@ p
+h(2, "to_disk") Language.to_disk +h(2, "to_disk") Language.to_disk
+tag method +tag method
+tag-new(2)
p Save the current state to a directory. p
| Save the current state to a directory. If a model is loaded, this will
| #[strong include the model].
+aside-code("Example"). +aside-code("Example").
nlp.to_disk('/path/to/models') nlp.to_disk('/path/to/models')
@ -242,14 +263,21 @@ p Save the current state to a directory.
| Paths may be either strings or #[code Path]-like objects. | Paths may be either strings or #[code Path]-like objects.
+row +row
+cell #[code **exclude] +cell #[code disable]
+cell - +cell list
+cell Named attributes to prevent from being saved. +cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
| and prevent from being saved.
+h(2, "from_disk") Language.from_disk +h(2, "from_disk") Language.from_disk
+tag method +tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it. p
| Loads state from a directory. Modifies the object in place and returns
| it. If the saved #[code Language] object contains a model, the
| #[strong model will be loaded].
+aside-code("Example"). +aside-code("Example").
from spacy.language import Language from spacy.language import Language
@ -264,15 +292,28 @@ p Loads state from a directory. Modifies the object in place and returns it.
| #[code Path]-like objects. | #[code Path]-like objects.
+row +row
+cell #[code **exclude] +cell #[code disable]
+cell - +cell list
+cell Named attributes to prevent from being loaded. +cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow +footrow
+cell returns +cell returns
+cell #[code Language] +cell #[code Language]
+cell The modified #[code Language] object. +cell The modified #[code Language] object.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy v2.0, the #[code save_to_directory] method has been
| renamed to #[code to_disk], to improve consistency across classes.
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new nlp = English().from_disk(disable=['tagger', 'ner'])
+code-old nlp = spacy.load('en', tagger=False, entity=False)
+h(2, "to_bytes") Language.to_bytes +h(2, "to_bytes") Language.to_bytes
+tag method +tag method
@ -283,9 +324,12 @@ p Serialize the current state to a binary string.
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code **exclude] +cell #[code disable]
+cell - +cell list
+cell Named attributes to prevent from being serialized. +cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
| and prevent from being serialized.
+footrow +footrow
+cell returns +cell returns
@ -310,15 +354,26 @@ p Load state from a binary string.
+cell The data to load from. +cell The data to load from.
+row +row
+cell #[code **exclude] +cell #[code disable]
+cell - +cell list
+cell Named attributes to prevent from being loaded. +cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow +footrow
+cell returns +cell returns
+cell #[code Language] +cell #[code Language]
+cell The #[code Language] object. +cell The #[code Language] object.
+infobox("⚠️ Deprecation note")
.o-block
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner'])
+code-old nlp = English().from_bytes('en', tagger=False, entity=False)
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
@ -327,6 +382,11 @@ p Load state from a binary string.
+cell #[code Vocab] +cell #[code Vocab]
+cell A container for the lexical types. +cell A container for the lexical types.
+row
+cell #[code tokenizer]
+cell #[code Tokenizer]
+cell The tokenizer.
+row +row
+cell #[code make_doc] +cell #[code make_doc]
+cell #[code lambda text: Doc] +cell #[code lambda text: Doc]

View File

@ -212,62 +212,74 @@ p The L2 norm of the lexeme's vector representation.
+row +row
+cell #[code is_alpha] +cell #[code is_alpha]
+cell bool +cell bool
+cell Equivalent to #[code word.orth_.isalpha()]. +cell
| Does the lexeme consist of alphabetic characters? Equivalent to
| #[code lexeme.text.isalpha()].
+row +row
+cell #[code is_ascii] +cell #[code is_ascii]
+cell bool +cell bool
+cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]]. +cell
| Does the lexeme consist of ASCII characters? Equivalent to
| #[code all(ord(c) &lt; 128 for c in lexeme.text)].
+row +row
+cell #[code is_digit] +cell #[code is_digit]
+cell bool +cell bool
+cell Equivalent to #[code word.orth_.isdigit()]. +cell
| Does the lexeme consist of digits? Equivalent to
| #[code lexeme.text.isdigit()].
+row +row
+cell #[code is_lower] +cell #[code is_lower]
+cell bool +cell bool
+cell Equivalent to #[code word.orth_.islower()]. +cell
| Is the lexeme in lowercase? Equivalent to
| #[code lexeme.text.islower()].
+row +row
+cell #[code is_title] +cell #[code is_title]
+cell bool +cell bool
+cell Equivalent to #[code word.orth_.istitle()]. +cell
| Is the lexeme in titlecase? Equivalent to
| #[code lexeme.text.istitle()].
+row +row
+cell #[code is_punct] +cell #[code is_punct]
+cell bool +cell bool
+cell Equivalent to #[code word.orth_.ispunct()]. +cell Is the lexeme punctuation?
+row +row
+cell #[code is_space] +cell #[code is_space]
+cell bool +cell bool
+cell Equivalent to #[code word.orth_.isspace()]. +cell
| Does the lexeme consist of whitespace characters? Equivalent to
| #[code lexeme.text.isspace()].
+row +row
+cell #[code like_url] +cell #[code like_url]
+cell bool +cell bool
+cell Does the word resemble a URL? +cell Does the lexeme resemble a URL?
+row +row
+cell #[code like_num] +cell #[code like_num]
+cell bool +cell bool
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc. +cell Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc.
+row +row
+cell #[code like_email] +cell #[code like_email]
+cell bool +cell bool
+cell Does the word resemble an email address? +cell Does the lexeme resemble an email address?
+row +row
+cell #[code is_oov] +cell #[code is_oov]
+cell bool +cell bool
+cell Is the word out-of-vocabulary? +cell Is the lexeme out-of-vocabulary?
+row +row
+cell #[code is_stop] +cell #[code is_stop]
+cell bool +cell bool
+cell Is the word part of a "stop list"? +cell Is the lexeme part of a "stop list"?
+row +row
+cell #[code lang] +cell #[code lang]

View File

@ -5,6 +5,7 @@ include ../../_includes/_mixins
p Match sequences of tokens, based on pattern rules. p Match sequences of tokens, based on pattern rules.
+infobox("⚠️ Deprecation note") +infobox("⚠️ Deprecation note")
.o-block
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler | are deprecated and have been replaced with a simpler
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
@ -56,17 +57,6 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
doc = nlp(u'hello world!') doc = nlp(u'hello world!')
matches = matcher(doc) matches = matcher(doc)
+infobox("Important note")
| By default, the matcher #[strong does not perform any action] on matches,
| like tagging matched phrases with entity types. Instead, actions need to
| be specified when #[strong adding patterns or entities], by
| passing in a callback function as the #[code on_match] argument on
| #[+api("matcher#add") #[code add]]. This allows you to define custom
| actions per pattern within the same matcher. For example, you might only
| want to merge some entity types, and set custom flags for other matched
| patterns. For more details and examples, see the usage workflow on
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code doc] +cell #[code doc]
@ -81,6 +71,17 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
| matches. A match tuple describes a span #[code doc[start:end]]. | matches. A match tuple describes a span #[code doc[start:end]].
| The #[code match_id] is the ID of the added match pattern. | The #[code match_id] is the ID of the added match pattern.
+infobox("Important note")
| By default, the matcher #[strong does not perform any action] on matches,
| like tagging matched phrases with entity types. Instead, actions need to
| be specified when #[strong adding patterns or entities], by
| passing in a callback function as the #[code on_match] argument on
| #[+api("matcher#add") #[code add]]. This allows you to define custom
| actions per pattern within the same matcher. For example, you might only
| want to merge some entity types, and set custom flags for other matched
| patterns. For more details and examples, see the usage workflow on
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
+h(2, "pipe") Matcher.pipe +h(2, "pipe") Matcher.pipe
+tag method +tag method
@ -118,6 +119,7 @@ p Match a stream of documents, yielding them in turn.
+h(2, "len") Matcher.__len__ +h(2, "len") Matcher.__len__
+tag method +tag method
+tag-new(2)
p p
| Get the number of rules added to the matcher. Note that this only returns | Get the number of rules added to the matcher. Note that this only returns
@ -138,6 +140,7 @@ p
+h(2, "contains") Matcher.__contains__ +h(2, "contains") Matcher.__contains__
+tag method +tag method
+tag-new(2)
p Check whether the matcher contains rules for a match ID. p Check whether the matcher contains rules for a match ID.
@ -159,6 +162,7 @@ p Check whether the matcher contains rules for a match ID.
+h(2, "add") Matcher.add +h(2, "add") Matcher.add
+tag method +tag method
+tag-new(2)
p p
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and | Add a rule to the matcher, consisting of an ID key, one or more patterns, and
@ -198,8 +202,23 @@ p
| Match pattern. A pattern consists of a list of dicts, where each | Match pattern. A pattern consists of a list of dicts, where each
| dict describes a token. | dict describes a token.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
| patterns and a callback for a given match ID.
+code-new.
matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
+code-old.
matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
+h(2, "remove") Matcher.remove +h(2, "remove") Matcher.remove
+tag method +tag method
+tag-new(2)
p p
| Remove a rule from the matcher. A #[code KeyError] is raised if the match | Remove a rule from the matcher. A #[code KeyError] is raised if the match
@ -219,6 +238,7 @@ p
+h(2, "get") Matcher.get +h(2, "get") Matcher.get
+tag method +tag method
+tag-new(2)
p p
| Retrieve the pattern stored for a key. Returns the rule as an | Retrieve the pattern stored for a key. Returns the rule as an

View File

@ -20,12 +20,7 @@ p
nlp = spacy.load('/path/to/en') # unicode path nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path nlp = spacy.load(Path('/path/to/en')) # pathlib Path
+infobox("⚠️ Deprecation note") nlp = spacy.load('en', disable=['parser', 'tagger'])
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
| will also raise an error if no model could be loaded and never just
| return an empty #[code Language] object. If you need a blank language,
| you need to import it explicitly (#[code from spacy.lang.en import English])
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -34,15 +29,28 @@ p
+cell Model to load, i.e. shortcut link, package name or path. +cell Model to load, i.e. shortcut link, package name or path.
+row +row
+cell #[code **overrides] +cell #[code disable]
+cell - +cell list
+cell Override or disable components. +cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow +footrow
+cell returns +cell returns
+cell #[code Language] +cell #[code Language]
+cell A #[code Language] object with the loaded model. +cell A #[code Language] object with the loaded model.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
| will also raise an error if no model could be loaded and never just
| return an empty #[code Language] object. If you need a blank language,
| you need to import it explicitly (#[code from spacy.lang.en import English])
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
+code-new nlp = spacy.load('/model')
+code-old nlp = spacy.load('en', path='/model')
+h(2, "info") spacy.info +h(2, "info") spacy.info
+tag function +tag function
@ -98,3 +106,37 @@ p
+cell returns +cell returns
+cell unicode +cell unicode
+cell The explanation, or #[code None] if not found in the glossary. +cell The explanation, or #[code None] if not found in the glossary.
+h(2, "set_factory") spacy.set_factory
+tag function
+tag-new(2)
p
| Set a factory that returns a custom
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline]
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
+aside-code("Example").
def my_factory(vocab):
def my_component(doc):
return doc
return my_component
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory'])
+table(["Name", "Type", "Description"])
+row
+cell #[code factory_id]
+cell unicode
+cell
| Unique name of factory. If added to a new pipeline, spaCy will
| look up the factory for this ID and use it to create the
| component.
+row
+cell #[code factory]
+cell callable
+cell
| Callable that takes a #[code Vocab] object and returns a pipeline
| component.

View File

@ -104,6 +104,7 @@ p
+h(2, "to_disk") StringStore.to_disk +h(2, "to_disk") StringStore.to_disk
+tag method +tag method
+tag-new(2)
p Save the current state to a directory. p Save the current state to a directory.
@ -118,8 +119,9 @@ p Save the current state to a directory.
| A path to a directory, which will be created if it doesn't exist. | A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects. | Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Tokenizer.from_disk +h(2, "from_disk") StringStore.from_disk
+tag method +tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it. p Loads state from a directory. Modifies the object in place and returns it.
@ -137,10 +139,10 @@ p Loads state from a directory. Modifies the object in place and returns it.
+footrow +footrow
+cell returns +cell returns
+cell #[code Tokenizer] +cell #[code StringStore]
+cell The modified #[code Tokenizer] object. +cell The modified #[code StringStore] object.
+h(2, "to_bytes") Tokenizer.to_bytes +h(2, "to_bytes") StringStore.to_bytes
+tag method +tag method
p Serialize the current state to a binary string. p Serialize the current state to a binary string.
@ -157,9 +159,9 @@ p Serialize the current state to a binary string.
+footrow +footrow
+cell returns +cell returns
+cell bytes +cell bytes
+cell The serialized form of the #[code Tokenizer] object. +cell The serialized form of the #[code StringStore] object.
+h(2, "from_bytes") Tokenizer.from_bytes +h(2, "from_bytes") StringStore.from_bytes
+tag method +tag method
p Load state from a binary string. p Load state from a binary string.

View File

@ -370,116 +370,131 @@ p The L2 norm of the token's vector representation.
+cell #[code lemma] +cell #[code lemma]
+cell int +cell int
+cell +cell
| Base form of the word, with no inflectional suffixes. | Base form of the token, with no inflectional suffixes.
+row +row
+cell #[code lemma_] +cell #[code lemma_]
+cell unicode +cell unicode
+cell Base form of the word, with no inflectional suffixes. +cell Base form of the token, with no inflectional suffixes.
+row +row
+cell #[code lower] +cell #[code lower]
+cell int +cell int
+cell Lower-case form of the word. +cell Lower-case form of the token.
+row +row
+cell #[code lower_] +cell #[code lower_]
+cell unicode +cell unicode
+cell Lower-case form of the word. +cell Lower-case form of the token.
+row +row
+cell #[code shape] +cell #[code shape]
+cell int +cell int
+cell Transform of the word's string, to show orthographic features. +cell
| Transform of the token's string, to show orthographic features.
| For example, "Xxxx" or "dd".
+row +row
+cell #[code shape_] +cell #[code shape_]
+cell unicode +cell unicode
+cell A transform of the word's string, to show orthographic features. | Transform of the token's string, to show orthographic features.
| For example, "Xxxx" or "dd".
+row +row
+cell #[code prefix] +cell #[code prefix]
+cell int +cell int
+cell Integer ID of a length-N substring from the start of the +cell Integer ID of a length-N substring from the start of the
| word. Defaults to #[code N=1]. | token. Defaults to #[code N=1].
+row +row
+cell #[code prefix_] +cell #[code prefix_]
+cell unicode +cell unicode
+cell +cell
| A length-N substring from the start of the word. Defaults to | A length-N substring from the start of the token. Defaults to
| #[code N=1]. | #[code N=1].
+row +row
+cell #[code suffix] +cell #[code suffix]
+cell int +cell int
+cell +cell
| Length-N substring from the end of the word. Defaults to #[code N=3]. | Length-N substring from the end of the token. Defaults to #[code N=3].
+row +row
+cell #[code suffix_] +cell #[code suffix_]
+cell unicode +cell unicode
+cell Length-N substring from the end of the word. Defaults to #[code N=3]. +cell Length-N substring from the end of the token. Defaults to #[code N=3].
+row +row
+cell #[code is_alpha] +cell #[code is_alpha]
+cell bool +cell bool
+cell Equivalent to #[code word.orth_.isalpha()]. +cell
| Does the token consist of alphabetic characters? Equivalent to
| #[code token.text.isalpha()].
+row +row
+cell #[code is_ascii] +cell #[code is_ascii]
+cell bool +cell bool
+cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]]. +cell
| Does the token consist of ASCII characters? Equivalent to
| #[code all(ord(c) &lt; 128 for c in token.text)].
+row +row
+cell #[code is_digit] +cell #[code is_digit]
+cell bool +cell bool
+cell Equivalent to #[code word.orth_.isdigit()]. +cell
| Does the token consist of digits? Equivalent to
| #[code token.text.isdigit()].
+row +row
+cell #[code is_lower] +cell #[code is_lower]
+cell bool +cell bool
+cell Equivalent to #[code word.orth_.islower()]. +cell
| Is the token in lowercase? Equivalent to
| #[code token.text.islower()].
+row +row
+cell #[code is_title] +cell #[code is_title]
+cell bool +cell bool
+cell Equivalent to #[code word.orth_.istitle()]. +cell
| Is the token in titlecase? Equivalent to
| #[code token.text.istitle()].
+row +row
+cell #[code is_punct] +cell #[code is_punct]
+cell bool +cell bool
+cell Equivalent to #[code word.orth_.ispunct()]. +cell Is the token punctuation?
+row +row
+cell #[code is_space] +cell #[code is_space]
+cell bool +cell bool
+cell Equivalent to #[code word.orth_.isspace()]. +cell
| Does the token consist of whitespace characters? Equivalent to
| #[code token.text.isspace()].
+row +row
+cell #[code like_url] +cell #[code like_url]
+cell bool +cell bool
+cell Does the word resemble a URL? +cell Does the token resemble a URL?
+row +row
+cell #[code like_num] +cell #[code like_num]
+cell bool +cell bool
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc. +cell Does the token represent a number? e.g. "10.9", "10", "ten", etc.
+row +row
+cell #[code like_email] +cell #[code like_email]
+cell bool +cell bool
+cell Does the word resemble an email address? +cell Does the token resemble an email address?
+row +row
+cell #[code is_oov] +cell #[code is_oov]
+cell bool +cell bool
+cell Is the word out-of-vocabulary? +cell Is the token out-of-vocabulary?
+row +row
+cell #[code is_stop] +cell #[code is_stop]
+cell bool +cell bool
+cell Is the word part of a "stop list"? +cell Is the token part of a "stop list"?
+row +row
+cell #[code pos] +cell #[code pos]

View File

@ -198,91 +198,6 @@ p
| attributes. The #[code ORTH] fields of the attributes must | attributes. The #[code ORTH] fields of the attributes must
| exactly match the string when they are concatenated. | exactly match the string when they are concatenated.
+h(2, "to_disk") Tokenizer.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
tokenizer.to_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Tokenizer.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
tokenizer = tokenizer.from_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The modified #[code Tokenizer] object.
+h(2, "to_bytes") Tokenizer.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
tokenizer_bytes = tokenizer.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Tokenizer] object.
+h(2, "from_bytes") Tokenizer.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer_bytes = tokenizer.to_bytes()
new_tokenizer = Tokenizer(nlp.vocab)
new_tokenizer.from_bytes(tokenizer_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The #[code Tokenizer] object.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])

View File

@ -76,6 +76,7 @@ p
+h(2, "resolve_model_path") util.resolve_model_path +h(2, "resolve_model_path") util.resolve_model_path
+tag function +tag function
+tag-new(2)
p Resolve a model name or string to a model path. p Resolve a model name or string to a model path.
@ -169,6 +170,7 @@ p
+h(2, "is_in_jupyter") util.is_in_jupyter +h(2, "is_in_jupyter") util.is_in_jupyter
+tag function +tag function
+tag-new(2)
p p
| Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter] | Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter]
@ -221,6 +223,7 @@ p
+h(2, "prints") util.prints +h(2, "prints") util.prints
+tag function +tag function
+tag-new(2)
p p
| Print a formatted, text-wrapped message with optional title. If a text | Print a formatted, text-wrapped message with optional title. If a text

View File

@ -159,6 +159,7 @@ p
+h(2, "to_disk") Vocab.to_disk +h(2, "to_disk") Vocab.to_disk
+tag method +tag method
+tag-new(2)
p Save the current state to a directory. p Save the current state to a directory.
@ -175,6 +176,7 @@ p Save the current state to a directory.
+h(2, "from_disk") Vocab.from_disk +h(2, "from_disk") Vocab.from_disk
+tag method +tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it. p Loads state from a directory. Modifies the object in place and returns it.

View File

@ -80,7 +80,7 @@
}, },
"customizing-tokenizer": { "customizing-tokenizer": {
"title": "Customizing the tokenizer", "title": "Customising the tokenizer",
"next": "rule-based-matching" "next": "rule-based-matching"
}, },

View File

@ -48,3 +48,13 @@ p
+cell ner +cell ner
+cell #[+api("entityrecognizer") #[code EntityRecognizer]] +cell #[+api("entityrecognizer") #[code EntityRecognizer]]
+cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type] +cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]
p
| The processing pipeline always #[strong depends on the statistical model]
| and its capabilities. For example, a pipeline can only include an entity
| recognizer component if the model includes data to make predictions of
| entity labels. This is why each model will specify the pipeline to use
| in its meta data, as a simple list containing the component names:
+code(false, "json").
"pipeline": ["vectorizer", "tagger", "parser", "ner"]

View File

@ -22,10 +22,10 @@ p
| untrusted sources. | untrusted sources.
p p
| All container classes and pipeline components, i.e. | All container classes, i.e. #[+api("language") #[code Language]],
for cls in ["Doc", "Language", "Tokenizer", "Tagger", "DependencyParser", "EntityRecognizer", "Vocab", "StringStore"] | #[+api("doc") #[code Doc]], #[+api("vocab") #[code Vocab]] and
| #[+api(cls.toLowerCase()) #[code=cls]], | #[+api("stringstore") #[code StringStore]] have the following methods
| have the following methods available: | available:
+table(["Method", "Returns", "Example"]) +table(["Method", "Returns", "Example"])
- style = [1, 0, 1] - style = [1, 0, 1]
@ -34,7 +34,35 @@ p
+annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style) +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style)
+annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style) +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style)
p
| For example, if you've processed a very large document, you can use
| #[+api("doc#to_disk") #[code Doc.to_disk]] to save it to a file on your
| local machine. This will save the document and its tokens, as well as
| the vocabulary associated with the #[code Doc].
+aside("Why save the vocab?")
| Saving the vocabulary with the #[code Doc] is important, because the
| #[code Vocab] holds the context-independent information about the words,
| tags and labels, and their #[strong integer IDs]. If the #[code Vocab]
| wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve
    | those IDs — for example, the word text or the dependency labels. You
| might be saving #[code 446] for "whale", but in a different vocabulary,
| this ID could map to "VERB". Similarly, if your document was processed by
| a German model, its vocab will include the specific
| #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels].
+code. +code.
moby_dick = open('moby_dick.txt', 'r') # open a large document moby_dick = open('moby_dick.txt', 'r') # open a large document
doc = nlp(moby_dick) # process it doc = nlp(moby_dick) # process it
doc.to_disk('/moby_dick.bin') # save the processed Doc doc.to_disk('/moby_dick.bin') # save the processed Doc
p
| If you need it again later, you can load it back into an empty #[code Doc]
| with an empty #[code Vocab] by calling
| #[+api("doc#from_disk") #[code from_disk()]]:
+code.
from spacy.tokens import Doc # to create empty Doc
from spacy.vocab import Vocab # to create empty Vocab
doc = Doc(Vocab()).from_disk('/moby_dick.bin') # load processed Doc

View File

@ -1,3 +1,3 @@
//- 💫 DOCS > USAGE > SPACY 101 > TRAINING //- 💫 DOCS > USAGE > SPACY 101 > TRAINING
p +under-construction

View File

@ -0,0 +1,92 @@
//- 💫 DOCS > USAGE > SPACY 101 > VOCAB & STRINGSTORE
p
| Whenever possible, spaCy tries to store data in a vocabulary, the
| #[+api("vocab") #[code Vocab]], that will be
| #[strong shared by multiple documents]. To save memory, spaCy also
    | encodes all strings to #[strong integer IDs] — in this case, for example,
    | "coffee" has the ID #[code 3572]. Entity labels like "ORG" and
| part-of-speech tags like "VERB" are also encoded. Internally, spaCy
| only "speaks" in integer IDs.
+aside
| #[strong Token]: A word, punctuation mark etc. #[em in context], including
| its attributes, tags and dependencies.#[br]
| #[strong Lexeme]: A "word type" with no context. Includes the word shape
| and flags, e.g. if it's lowercase, a digit or punctuation.#[br]
| #[strong Doc]: A processed container of tokens in context.#[br]
| #[strong Vocab]: The collection of lexemes.#[br]
| #[strong StringStore]: The dictionary mapping integer IDs to strings, for
    | example #[code 3572] &rarr; "coffee".
+image
include ../../../assets/img/docs/vocab_stringstore.svg
.u-text-right
+button("/assets/img/docs/vocab_stringstore.svg", false, "secondary").u-text-tag View large graphic
p
| If you process lots of documents containing the word "coffee" in all
| kinds of different contexts, storing the exact string "coffee" every time
| would take up way too much space. So instead, spaCy assigns it an ID
| and stores it in the #[+api("stringstore") #[code StringStore]]. You can
| think of the #[code StringStore] as a
    | #[strong lookup table that works in both directions] — you can look up a
| string to get its ID, or an ID to get its string:
+code.
doc = nlp(u'I like coffee')
assert doc.vocab.strings[u'coffee'] == 3572
assert doc.vocab.strings[3572] == u'coffee'
p
| Now that all strings are encoded, the entries in the vocabulary
| #[strong don&apos;t need to include the word text] themselves. Instead,
| they can look it up in the #[code StringStore] via its integer ID. Each
| entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]],
| contains the #[strong context-independent] information about a word.
| For example, no matter if "love" is used as a verb or a noun in some
| context, its spelling and whether it consists of alphabetic characters
| won't ever change.
+code.
for word in doc:
lexeme = doc.vocab[word.text]
print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)
+aside
| #[strong Text]: The original text of the lexeme.#[br]
| #[strong Orth]: The integer ID of the lexeme.#[br]
| #[strong Shape]: The abstract word shape of the lexeme.#[br]
| #[strong Prefix]: By default, the first letter of the word string.#[br]
| #[strong Suffix]: By default, the last three letters of the word string.#[br]
| #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br]
| #[strong is digit]: Does the lexeme consist of digits?#[br]
    | #[strong is title]: Is the lexeme in titlecase?#[br]
| #[strong Lang]: The language of the parent vocabulary.
+table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit", "is_title", "lang"])
- var style = [0, 1, 1, 0, 0, 1, 1, 1, 0]
+annotation-row(["I", 508, "X", "I", "I", true, false, true, "en"], style)
+annotation-row(["love", 949, "xxxx", "l", "ove", true, false, false, "en"], style)
+annotation-row(["coffee", 3572, "xxxx", "c", "ffe", true, false, false, "en"], style)
p
    | The specific entries in the vocabulary and their IDs don't really matter
| #[strong as long as they match]. That's why you always need to make sure
| all objects you create have access to the same vocabulary. If they don't,
| the IDs won't match and spaCy will either produce very confusing results,
    | or fail altogether.
+code.
from spacy.tokens import Doc
from spacy.vocab import Vocab
doc = nlp(u'I like coffee') # original Doc
new_doc = Doc(Vocab(), words=['I', 'like', 'coffee']) # new Doc with empty Vocab
assert doc.vocab.strings[u'coffee'] == 3572 # ID in vocab of Doc
assert new_doc.vocab.strings[u'coffee'] == 446 # ID in vocab of new Doc
p
| Even though both #[code Doc] objects contain the same words, the internal
| integer IDs are very different.

View File

@ -107,7 +107,6 @@ p
.u-text-right .u-text-right
+button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
+table(["File name", "Variables", "Description"]) +table(["File name", "Variables", "Description"])
+row +row
+cell #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py] +cell #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py]
@ -439,7 +438,7 @@ p
+h(3, "morph-rules") Morph rules +h(3, "morph-rules") Morph rules
//- TODO: write morph rules section +under-construction
+h(2, "testing") Testing the new language tokenizer +h(2, "testing") Testing the new language tokenizer
@ -631,7 +630,7 @@ p
| trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim]. | trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
| The #[code vectors.bin] file should consist of one word and vector per line. | The #[code vectors.bin] file should consist of one word and vector per line.
+aside-code("your_data_directory", "yaml"). //-+aside-code("your_data_directory", "yaml").
├── vocab/ ├── vocab/
| ├── lexemes.bin | ├── lexemes.bin
| ├── strings.json | ├── strings.json
@ -662,4 +661,4 @@ p
| model using spaCy's #[+api("cli#train") #[code train]] command: | model using spaCy's #[+api("cli#train") #[code train]] command:
+code(false, "bash"). +code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]

View File

@ -17,6 +17,8 @@ p
| #[+a("http://deeplearning.net/software/theano/") Theano] is also | #[+a("http://deeplearning.net/software/theano/") Theano] is also
| supported. | supported.
+under-construction
+code("Runtime usage"). +code("Runtime usage").
def count_entity_sentiment(nlp, texts): def count_entity_sentiment(nlp, texts):
'''Compute the net document sentiment for each entity in the texts.''' '''Compute the net document sentiment for each entity in the texts.'''
@ -153,7 +155,9 @@ p
| adding another LSTM layer, using attention mechanism, using character | adding another LSTM layer, using attention mechanism, using character
| features, etc. | features, etc.
+h(2, "attribute-hooks") Attribute hooks (experimental) +h(2, "attribute-hooks") Attribute hooks
+under-construction
p p
| Earlier, we saw how to store data in the new generic #[code user_data] | Earlier, we saw how to store data in the new generic #[code user_data]

View File

@ -322,8 +322,9 @@ p
| If you don't need a particular component of the pipeline for | If you don't need a particular component of the pipeline for
| example, the tagger or the parser, you can disable loading it. This can | example, the tagger or the parser, you can disable loading it. This can
| sometimes make a big difference and improve loading speed. Disabled | sometimes make a big difference and improve loading speed. Disabled
| component names can be provided to #[code spacy.load], #[code from_disk] | component names can be provided to #[+api("spacy#load") #[code spacy.load]],
| or the #[code nlp] object itself as a list: | #[+api("language#from_disk") #[code Language.from_disk]] or the
| #[code nlp] object itself as a list:
+code. +code.
nlp = spacy.load('en', disable['parser', 'tagger']) nlp = spacy.load('en', disable['parser', 'tagger'])

View File

@ -35,7 +35,7 @@ p
assert doc[0].text == u'Peach' assert doc[0].text == u'Peach'
assert doc[1].text == u'emoji' assert doc[1].text == u'emoji'
assert doc[-1].text == u'🍑' assert doc[-1].text == u'🍑'
assert doc[17:19] == u'outranking eggplant' assert doc[17:19].text == u'outranking eggplant'
assert doc.noun_chunks[0].text == u'Peach emoji' assert doc.noun_chunks[0].text == u'Peach emoji'
sentences = list(doc.sents) sentences = list(doc.sents)

View File

@ -2,16 +2,18 @@
include ../../_includes/_mixins include ../../_includes/_mixins
+under-construction
+h(2, "multithreading") Multi-threading with #[code .pipe()] +h(2, "multithreading") Multi-threading with #[code .pipe()]
p p
| If you have a sequence of documents to process, you should use the | If you have a sequence of documents to process, you should use the
| #[+api("language#pipe") #[code .pipe()]] method. The method takes an | #[+api("language#pipe") #[code Language.pipe()]] method. The method takes
| iterator of texts, and accumulates an internal buffer, | an iterator of texts, and accumulates an internal buffer,
| which it works on in parallel. It then yields the documents in order, | which it works on in parallel. It then yields the documents in order,
| one-by-one. After a long and bitter struggle, the global interpreter | one-by-one. After a long and bitter struggle, the global interpreter
| lock was freed around spaCy's main parsing loop in v0.100.3. This means | lock was freed around spaCy's main parsing loop in v0.100.3. This means
| that the #[code .pipe()] method will be significantly faster in most | that #[code .pipe()] will be significantly faster in most
| practical situations, because it allows shared memory parallelism. | practical situations, because it allows shared memory parallelism.
+code. +code.
@ -20,23 +22,27 @@ p
p p
| To make full use of the #[code .pipe()] function, you might want to | To make full use of the #[code .pipe()] function, you might want to
| brush up on Python generators. Here are a few quick hints: | brush up on #[strong Python generators]. Here are a few quick hints:
+list +list
+item +item
| Generator comprehensions can be written | Generator comprehensions can be written as
| (#[code item for item in sequence]) | #[code (item for item in sequence)].
+item +item
| The #[code itertools] built-in library and the #[code cytoolz] | The
| package provide a lot of handy generator tools | #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library]
| and the
| #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package]
| provide a lot of handy #[strong generator tools].
+item +item
| Often you'll have an input stream that pairs text with some | Often you'll have an input stream that pairs text with some
| important metadata, e.g. a JSON document. To pair up the metadata | important meta data, e.g. a JSON document. To
| with the processed #[code Doc] object, you should use the tee | #[strong pair up the meta data] with the processed #[code Doc]
| function to split the generator in two, and then #[code izip] the | object, you should use the #[code itertools.tee] function to split
| extra stream to the document stream. | the generator in two, and then #[code izip] the extra stream to the
| document stream.
+h(2, "own-annotations") Bringing your own annotations +h(2, "own-annotations") Bringing your own annotations

View File

@ -4,6 +4,8 @@ include ../../_includes/_mixins
+h(2, "features") Features +h(2, "features") Features
+under-construction
+aside +aside
| If one of spaCy's functionalities #[strong needs a model], it means that | If one of spaCy's functionalities #[strong needs a model], it means that
| you need to have one of the available | you need to have one of the available
@ -91,17 +93,35 @@ p
include _spacy-101/_tokenization include _spacy-101/_tokenization
+infobox
| To learn more about how spaCy's tokenizer and its rules work in detail,
| how to #[strong customise] it and how to #[strong add your own tokenizer]
| to a processing pipeline, see the usage guide on
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies +h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
+tag-model("dependency parse") +tag-model("dependency parse")
include _spacy-101/_pos-deps include _spacy-101/_pos-deps
+infobox
| To learn more about #[strong part-of-speech tagging] and rule-based
| morphology, and how to #[strong navigate and use the parse tree]
| effectively, see the usage guides on
| #[+a("/docs/usage/pos-tagging") part-of-speech tagging] and
| #[+a("/docs/usage/dependency-parse") using the dependency parse].
+h(3, "annotations-ner") Named Entities +h(3, "annotations-ner") Named Entities
+tag-model("named entities") +tag-model("named entities")
include _spacy-101/_named-entities include _spacy-101/_named-entities
+infobox
| To learn more about entity recognition in spaCy, how to
| #[strong add your own entities] to a document and how to train and update
| the entity predictions of a model, see the usage guide on
| #[+a("/docs/usage/entity-recognition") named entity recognition].
+h(2, "vectors-similarity") Word vectors and similarity +h(2, "vectors-similarity") Word vectors and similarity
+tag-model("vectors") +tag-model("vectors")
@ -109,20 +129,43 @@ include _spacy-101/_similarity
include _spacy-101/_word-vectors include _spacy-101/_word-vectors
+infobox
| To learn more about word vectors, how to #[strong customise them] and
| how to load #[strong your own vectors] into spaCy, see the usage
| guide on
| #[+a("/docs/usage/word-vectors-similarities") using word vectors and semantic similarities].
+h(2, "pipelines") Pipelines +h(2, "pipelines") Pipelines
include _spacy-101/_pipelines include _spacy-101/_pipelines
+infobox
| To learn more about #[strong how processing pipelines work] in detail,
| how to enable and disable their components, and how to
| #[strong create your own], see the usage guide on
| #[+a("/docs/usage/language-processing-pipeline") language processing pipelines].
+h(2, "vocab-stringstore") Vocab, lexemes and the string store
include _spacy-101/_vocab-stringstore
+h(2, "serialization") Serialization +h(2, "serialization") Serialization
include _spacy-101/_serialization include _spacy-101/_serialization
+infobox
| To learn more about #[strong serialization] and how to
| #[strong save and load your own models], see the usage guide on
| #[+a("/docs/usage/saving-loading") saving, loading and data serialization].
+h(2, "training") Training +h(2, "training") Training
include _spacy-101/_training include _spacy-101/_training
+h(2, "architecture") Architecture +h(2, "architecture") Architecture
+under-construction
+image +image
include ../../assets/img/docs/architecture.svg include ../../assets/img/docs/architecture.svg
.u-text-right .u-text-right

View File

@ -64,44 +64,10 @@ p
| predicts the new category with minimal difference from the previous | predicts the new category with minimal difference from the previous
| output. | output.
+h(2, "saving-loading") Saving and loading
p
| After training our model, you'll usually want to save its state, and load
| it back later. You can do this with the #[code Language.save_to_directory()]
| method:
+code.
nlp.save_to_directory('/home/me/data/en_technology')
p
| To make the model more convenient to deploy, we recommend wrapping it as
| a Python package, so that you can install it via pip and load it as a
| module. spaCy comes with a handy #[+api("cli#package") #[code package]]
| CLI command to create all required files and directories.
+code(false, "bash").
python -m spacy package /home/me/data/en_technology /home/me/my_models
p
| To build the package and create a #[code .tar.gz] archive, run
| #[code python setup.py sdist] from within its directory.
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading") saving and loading models].
p
| After you've generated and installed the package, you'll be able to
| load the model as follows:
+code.
import en_technology
nlp = en_technology.load()
+h(2, "example") Example: Adding and training an #[code ANIMAL] entity +h(2, "example") Example: Adding and training an #[code ANIMAL] entity
+under-construction
p p
| This script shows how to add a new entity type to an existing pre-trained | This script shows how to add a new entity type to an existing pre-trained
| NER model. To keep the example short and simple, only four sentences are | NER model. To keep the example short and simple, only four sentences are
@ -170,5 +136,33 @@ p
p p
| After training your model, you can | After training your model, you can
| #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend
| models as Python packages, for ease of deployment. | wrapping models as Python packages, for ease of deployment.
+h(2, "saving-loading") Saving and loading
p
| After training our model, you'll usually want to save its state, and load
| it back later. You can do this with the
| #[+api("language#to_disk") #[code Language.to_disk()]] method:
+code.
nlp.to_disk('/home/me/data/en_technology')
p
| To make the model more convenient to deploy, we recommend wrapping it as
| a Python package, so that you can install it via pip and load it as a
| module. spaCy comes with a handy #[+api("cli#package") #[code package]]
| CLI command to create all required files and directories.
+code(false, "bash").
python -m spacy package /home/me/data/en_technology /home/me/my_models
p
| To build the package and create a #[code .tar.gz] archive, run
| #[code python setup.py sdist] from within its directory.
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading#models") saving and loading models].

View File

@ -81,59 +81,3 @@ p.o-inline-list
p p
+button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example
+h(2, "feature-templates") Customizing the feature extraction
p
| spaCy currently uses linear models for the tagger, parser and entity
| recognizer, with weights learned using the
| #[+a("https://explosion.ai/blog/part-of-speech-pos-tagger-in-python") Averaged Perceptron algorithm].
+aside("Linear Model Feature Scheme")
| For a list of the available feature atoms, see the #[+a("/docs/api/features") Linear Model Feature Scheme].
p
| Because it's a linear model, it's important for accuracy to build
| conjunction features out of the atomic predictors. Let's say you have
| two atomic predictors asking, "What is the part-of-speech of the
| previous token?", and "What is the part-of-speech of the previous
| previous token?". These predictors will introduce a number of features,
| e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
| template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
p
| The feature extraction proceeds in two passes. In the first pass, we
| fill an array with the values of all of the atomic predictors. In the
| second pass, we iterate over the feature templates, and fill a small
| temporary array with the predictors that will be combined into a
| conjunction feature. Finally, we hash this array into a 64-bit integer,
| using the MurmurHash algorithm. You can see this at work in the
| #[+a(gh("thinc", "thinc/linear/features.pyx", "94dbe06fd3c8f24d86ab0f5c7984e52dbfcdc6cb")) #[code thinc.linear.features]] module.
p
| It's very easy to change the feature templates, to create novel
| combinations of the existing atomic predictors. There's currently no API
| available to add new atomic predictors, though. You'll have to create a
| subclass of the model, and write your own #[code set_featuresC] method.
p
| The feature templates are passed in using the #[code features] keyword
| argument to the constructors of the #[+api("tagger") #[code Tagger]],
| #[+api("dependencyparser") #[code DependencyParser]] and
| #[+api("entityrecognizer") #[code EntityRecognizer]]:
+code.
from spacy.vocab import Vocab
from spacy.pipeline import Tagger
from spacy.tagger import P2_orth, P1_orth
from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth
vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
tagger = Tagger(vocab, features=[(P2_orth, P2_cluster), (P1_orth, P1_cluster),
(P2_orth,), (P1_orth,), (W_orth,),
(N1_orth,), (N2_orth,)])
p
| Custom feature templates can be passed to the #[code DependencyParser]
| and #[code EntityRecognizer] as well, also using the #[code features]
| keyword argument of the constructor.

View File

@ -50,9 +50,10 @@ p
p p
| spaCy's serialization API has been made consistent across classes and | spaCy's serialization API has been made consistent across classes and
| objects. All container classes and pipeline components now have a | objects. All container classes, i.e. #[code Language], #[code Doc],
| #[code to_bytes()], #[code from_bytes()], #[code to_disk()] and | #[code Vocab] and #[code StringStore] now have a #[code to_bytes()],
| #[code from_disk()] method that supports the Pickle protocol. | #[code from_bytes()], #[code to_disk()] and #[code from_disk()] method
| that supports the Pickle protocol.
p p
| The improved #[code spacy.load] makes loading models easier and more | The improved #[code spacy.load] makes loading models easier and more

View File

@ -334,7 +334,7 @@ p
| token #[code &lt;script src="malicious-code.js"&gt;&lt;script&gt;]. | token #[code &lt;script src="malicious-code.js"&gt;&lt;script&gt;].
| Instead of relying on the server to render and sanitize HTML, you | Instead of relying on the server to render and sanitize HTML, you
| can do this on the client in JavaScript. displaCy.js creates | can do this on the client in JavaScript. displaCy.js creates
| the SVG markup as DOM nodes and will never insert raw HTML. | the markup as DOM nodes and will never insert raw HTML.
p p
| The #[code parse_deps] function takes a #[code Doc] object and returns | The #[code parse_deps] function takes a #[code Doc] object and returns

View File

@ -23,41 +23,20 @@ p
include _spacy-101/_similarity include _spacy-101/_similarity
include _spacy-101/_word-vectors include _spacy-101/_word-vectors
+h(2, "custom") Customising word vectors +h(2, "custom") Customising word vectors
+under-construction
p p
| By default, #[+api("token#vector") #[code Token.vector]] returns the | By default, #[+api("token#vector") #[code Token.vector]] returns the
| vector for its underlying #[+api("lexeme") #[code Lexeme]], while | vector for its underlying #[+api("lexeme") #[code Lexeme]], while
| #[+api("doc#vector") #[code Doc.vector]] and | #[+api("doc#vector") #[code Doc.vector]] and
| #[+api("span#vector") #[code Span.vector]] return an average of the | #[+api("span#vector") #[code Span.vector]] return an average of the
| vectors of their tokens. | vectors of their tokens. You can customize these
p
| You can customize these
| behaviours by modifying the #[code doc.user_hooks], | behaviours by modifying the #[code doc.user_hooks],
| #[code doc.user_span_hooks] and #[code doc.user_token_hooks] | #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
| dictionaries. | dictionaries.
+code("Example").
# TODO
p
| You can load new word vectors from a file-like buffer using the
| #[code vocab.load_vectors()] method. The file should be a
| whitespace-delimited text file, where the word is in the first column,
| and subsequent columns provide the vector data. For faster loading, you
| can use the #[code vocab.vectors_from_bin_loc()] method, which accepts a
| path to a binary file written by #[code vocab.dump_vectors()].
+code("Example").
# TODO
p
| You can also load vectors from memory by writing to the
| #[+api("lexeme#vector") #[code Lexeme.vector]] property. If the vectors
| you are writing are of different dimensionality
| from the ones currently loaded, you should first call
| #[code vocab.resize_vectors(new_size)].
+h(2, "similarity") Similarity +h(2, "similarity") Similarity
+under-construction