Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-06-01 13:03:57 -05:00
commit c650bc481c
8 changed files with 57 additions and 21 deletions

View File

@ -10,6 +10,7 @@ cimport numpy as np
import cytoolz import cytoolz
import util import util
from collections import OrderedDict from collections import OrderedDict
import ujson
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine from thinc.neural import Model, Maxout, Softmax, Affine
@ -33,6 +34,7 @@ from .gold cimport GoldParse
from .morphology cimport Morphology from .morphology cimport Morphology
from .vocab cimport Vocab from .vocab cimport Vocab
from .syntax import nonproj from .syntax import nonproj
from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
@ -317,17 +319,33 @@ class NeuralTagger(object):
return self return self
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
serialize = { serialize = OrderedDict((
'model': lambda p: p.open('wb').write(self.model.to_bytes()), ('vocab', lambda p: self.vocab.to_disk(p)),
'vocab': lambda p: self.vocab.to_disk(p) ('tag_map', lambda p: p.open('w').write(json_dumps(
} self.vocab.morphology.tag_map))),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
))
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude): def from_disk(self, path, **exclude):
deserialize = { def load_model(p):
'model': lambda p: self.model.from_bytes(p.open('rb').read()), if self.model is True:
'vocab': lambda p: self.vocab.from_disk(p) token_vector_width = util.env_opt('token_vector_width', 128)
} self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(p.open('rb').read())
def load_tag_map(p):
with p.open() as file_:
tag_map = ujson.loads(file_.read())
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer)
deserialize = OrderedDict((
('vocab', lambda p: self.vocab.from_disk(p)),
('tag_map', load_tag_map),
('model', load_model),
))
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
return self return self

View File

@ -315,7 +315,6 @@ cdef class Vocab:
getters = OrderedDict(( getters = OrderedDict((
('strings', lambda: self.strings.to_bytes()), ('strings', lambda: self.strings.to_bytes()),
('lexemes', lambda: self.lexemes_to_bytes()), ('lexemes', lambda: self.lexemes_to_bytes()),
('tag_map', lambda: self.morphology.tag_map),
)) ))
return util.to_bytes(getters, exclude) return util.to_bytes(getters, exclude)
@ -326,13 +325,9 @@ cdef class Vocab:
**exclude: Named attributes to prevent from being loaded. **exclude: Named attributes to prevent from being loaded.
RETURNS (Vocab): The `Vocab` object. RETURNS (Vocab): The `Vocab` object.
""" """
def set_tag_map(tag_map):
self.morphology = Morphology(self.strings, tag_map,
self.morphology.lemmatizer)
setters = OrderedDict(( setters = OrderedDict((
('strings', lambda b: self.strings.from_bytes(b)), ('strings', lambda b: self.strings.from_bytes(b)),
('lexemes', lambda b: self.lexemes_from_bytes(b)), ('lexemes', lambda b: self.lexemes_from_bytes(b)),
('tag_map', lambda b: set_tag_map(b))
)) ))
return util.from_bytes(bytes_data, setters, exclude) return util.from_bytes(bytes_data, setters, exclude)

View File

@ -77,7 +77,8 @@
{ "id": "model", "title": "Models", "multiple": true, "options": [ { "id": "model", "title": "Models", "multiple": true, "options": [
{ "id": "en", "title": "English", "meta": "50MB" }, { "id": "en", "title": "English", "meta": "50MB" },
{ "id": "de", "title": "German", "meta": "645MB" }, { "id": "de", "title": "German", "meta": "645MB" },
{ "id": "fr", "title": "French", "meta": "1.33GB" }] { "id": "fr", "title": "French", "meta": "1.33GB" },
{ "id": "es", "title": "Spanish", "meta": "378MB"}]
} }
], ],
@ -85,7 +86,8 @@
{ "id": "lang", "title": "Language", "options": [ { "id": "lang", "title": "Language", "options": [
{ "id": "en", "title": "English", "checked": true }, { "id": "en", "title": "English", "checked": true },
{ "id": "de", "title": "German" }, { "id": "de", "title": "German" },
{ "id": "fr", "title": "French" }] { "id": "fr", "title": "French" },
{ "id": "es", "title": "Spanish" }]
}, },
{ "id": "load", "title": "Loading style", "options": [ { "id": "load", "title": "Loading style", "options": [
{ "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." }, { "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." },
@ -108,9 +110,19 @@
], ],
"fr": [ "fr": [
{ "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" } { "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" }
],
"es": [
{ "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "378 MB", "license": "CC BY-SA"}
] ]
}, },
"EXAMPLE_SENTENCES": {
"en": "This is a sentence.",
"de": "Dies ist ein Satz.",
"fr": "C'est une phrase.",
"es": "Esto es una frase."
},
"ALPHA": true, "ALPHA": true,
"V_CSS": "1.6", "V_CSS": "1.6",
"V_JS": "1.2", "V_JS": "1.2",

View File

@ -107,13 +107,13 @@ mixin button(url, trusted, ...style)
height - [integer] optional height to clip code block to height - [integer] optional height to clip code block to
mixin code(label, language, icon, height) mixin code(label, language, icon, height)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "" style=height ? "height: #{height}px" : "")&attributes(attributes) pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
if label if label
h4.u-text-label.u-text-label--dark=label h4.u-text-label.u-text-label--dark=label
if icon if icon
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'} - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
.c-code-block__icon(class=classes[icon] || "" class=classes[icon] ? "c-code-block__icon--border" : "") .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
+icon(icon, 18) +icon(icon, 18)
code.c-code-block__content code.c-code-block__content

View File

@ -1,5 +1,16 @@
<svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> <svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<defs> <defs>
<symbol id="v2alpha" viewBox="0 0 200 111">
<title>spaCy v2.0.0 alpha</title>
<path fill="#ddd" d="M183.3 89.2l-164.6-40-1-29.2 164.6 40M3.8 106.8l41.6-1.4-1-29.2-41.6 1.4L13.2 92"/>
<path fill="#a3cad3" d="M45.4 105.4L19.6 94.6l25.4-1"/>
<path fill="#ddd" d="M196.6 2L155 3.4l1 29.2 41.6-1.4L187.2 17"/>
<path fill="#a3cad3" d="M155 3.4l25.8 10.8-25.4 1"/>
<path fill="#fff" d="M17.6 19.4l163-5.6 1 29.2-163 5.6zM19.2 65.6l163-5.6 1 29.2-163 5.6z"/>
<path fill="#008EBC" d="M45.8 29h-3.6v-2.4l10-.4.2 2.5h-3.6l.4 10.8h-3L45.8 29zM62 39L59 34.5h-1.6l.2 5h-3l-.5-13.2L59 26c3 0 5.2.8 5.3 4 0 1.8-.8 3-2.2 3.8l3.3 5.2H62zm-4.5-6.8H59c1.6-.2 2.4-.8 2.3-2 0-1.4-1-1.8-2.5-1.8h-1.5l.2 3.8zM69 34.2l-4.3-8.4H68l1.2 3 1.2 2.8c.4-1 .8-2 1-3l1.2-3 3-.2L72 34l.2 4.7h-3l-.2-4.5zM79.5 25.3h3.2l1.8 6 1.2 4.2c.5-1.5.7-2.8 1-4.3L88 25h3L87.7 38H84l-4.5-13zM92.4 25l8.3-.4V27l-5.2.3V30l4.6-.3.2 2.5-4.5.2v3l5.6-.2v2.5L93 38l-.6-13zM111 37.4l-2.6-4.7h-1.6l.2 5h-3l-.5-13.2 4.8-.2c2.8 0 5 .8 5.2 4 0 1.8-.8 3-2.2 3.8l3.2 5.3H111zm-4.3-7h1.5c1.6 0 2.4-.7 2.3-2 0-1.3-1-1.7-2.5-1.7h-1.5l.2 3.8zM116.8 33.5c1 .8 2.2 1.3 3.3 1.3 1.3 0 2-.5 2-1.3s-1-1-2-1.5l-1.8-.7c-1.4-.5-2.7-1.6-2.8-3.5 0-2.2 1.8-4 4.6-4 1.5-.2 3 .4 4.3 1.5l-1.4 2c-1-.7-1.8-1-3-1-1 0-1.6.4-1.5 1.2 0 .8 1 1 2 1.5l1.8.6c1.6.6 2.7 1.6 2.7 3.5 0 2.3-1.7 4.2-4.8 4.4-1.7 0-3.6-.5-5-1.7l1.6-2.2zM126.8 23.7h3l.5 13-3 .2-.5-13.3zM132.5 30c0-4.3 2.2-7 5.8-7 3.6 0 6 2.3 6.2 6.6 0 4.3-2.2 7-5.8 7-3.5.3-6-2.3-6.2-6.6zm9-.3c-.2-2.6-1.4-4.2-3.2-4-1.8 0-3 1.6-2.8 4.2 0 2.5 1.3 4.2 3 4 2 0 3-1.6 3-4.3zM146.7 23h3l3.8 6.3 1.4 3c-.2-1.5-.5-3.3-.5-5l-.2-4.6h2.8l.6 13-3 .2-3.8-6.6-1.4-2.8c0 1.5.4 3.2.4 4.8l.2 4.7-3 .2-.3-13.2z"/>
<path fill="#1A1E23" d="M50.2 84.7c3.2-3.2 5.4-5.5 5.3-7.3 0-1.3-.8-2-2-2-.8 0-1.5.8-2 1.5l-1.8-1.6c1.2-1.4 2.4-2 4.2-2.2 2.4 0 4.2 1.5 4.3 4 0 2-2 4.4-4 6.7.7-.2 1.6-.3 2.2-.3H59l.2 2.4-9 .4v-1.7zM63 82.4c1 0 2 .7 2 1.8 0 1-.7 2-1.7 2s-1.8-.8-2-2c0-1 .7-1.8 1.8-1.8zM66.7 79.3c-.2-4.4 1.6-6.7 4.4-6.8 3 0 4.8 2 5 6.5s-1.7 6.8-4.5 7c-2.7 0-4.6-2.3-4.8-6.7zM73 79c0-3.4-.8-4.2-1.8-4-1 0-1.8.7-1.6 4.3 0 3.5 1 4.4 2 4.3 1 0 1.6-1 1.5-4.5zM79.8 81.8c1 0 1.8.7 2 1.8 0 1-.8 2-1.8 2s-1.8-.8-2-2c0-1 .8-1.7 1.8-1.8zM83.5 78.7C83.3 74.3 85 72 88 72c2.7-.2 4.6 2 4.7 6.4s-1.6 6.8-4.4 7c-2.8 0-4.7-2.3-4.8-6.7zm6.3-.2c0-3.5-1-4.3-2-4.2-1 0-1.7.8-1.5 4.4 0 3.5 1 4.4 2 4.3 1 0 1.7-1 1.5-4.5zM105.5 81.3h-4l-.7 3.3h-3l3.7-13.2h3.6l4.7 13h-3.2l-1-3zm-.7-2.3l-.4-1.2-1.2-4.2-1 4.3-.3 1h2.8zM110.5 71h3l.4 10.7 5-.2.2 2.5-8.2.3-.5-13.2zM121 70.7l4.7-.2c3 0 5.2 1 5.3 4 0 3.2-2.2 4.7-5 4.7h-1.8l.2 4.6h-3l-.5-13zm4.7 6.2c1.6-.2 2.4-1 2.4-2.3 0-1.4-.8-2-2.4-1.8H124v4h1.7zM133 70.3h3l.3 5 4.5-.2-.2-5h3l.5 13-3 .2v-5.5l-4.6.2.2 5.4h-3l-.5-13zM153.3 79.7h-4l-.7 3.3h-3l3.7-13.2h3.6l4.5 13h-3.2l-1-3zm-.7-2.3l-.4-1.2L151 72l-1 4.3-.3 1.2h3z"/>
</symbol>
<symbol id="usersurvey" viewBox="0 0 200 111"> <symbol id="usersurvey" viewBox="0 0 200 111">
<title>spaCy user survey 2017</title> <title>spaCy user survey 2017</title>
<path fill="#ddd" d="M183.3 89.2l-164.6-40-1-29.2 164.6 40M3.8 106.8l41.6-1.4-1-29.2-41.6 1.4L13.2 92"/> <path fill="#ddd" d="M183.3 89.2l-164.6-40-1-29.2 164.6 40M3.8 106.8l41.6-1.4-1-29.2-41.6 1.4L13.2 92"/>

Before

Width:  |  Height:  |  Size: 18 KiB

After

Width:  |  Height:  |  Size: 21 KiB

View File

@ -40,6 +40,7 @@ p
+qs({model: 'en'}) python -m spacy download en +qs({model: 'en'}) python -m spacy download en
+qs({model: 'de'}) python -m spacy download de +qs({model: 'de'}) python -m spacy download de
+qs({model: 'fr'}) python -m spacy download fr +qs({model: 'fr'}) python -m spacy download fr
+qs({model: 'es'}) python -m spacy download es
+h(2, "installation") Installation instructions +h(2, "installation") Installation instructions

View File

@ -18,7 +18,6 @@ p
| skew, which might decrease your accuracy. | skew, which might decrease your accuracy.
+quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.") +quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.")
- var examples = {en: "This is a sentence.", de: "Dies ist ein Satz.", fr: "C'est une phrase."}
for models, lang in MODELS for models, lang in MODELS
- var package = (models.length == 1) ? models[0] : models.find(function(m) { return m.def }) - var package = (models.length == 1) ? models[0] : models.find(function(m) { return m.def })
+qs({lang: lang}) python -m spacy download #{lang} +qs({lang: lang}) python -m spacy download #{lang}
@ -26,7 +25,7 @@ p
+qs({lang: lang, load: "module"}, "python") import #{package.id} +qs({lang: lang, load: "module"}, "python") import #{package.id}
+qs({lang: lang, load: "module"}, "python") nlp = #{package.id}.load() +qs({lang: lang, load: "module"}, "python") nlp = #{package.id}.load()
+qs({lang: lang, load: "spacy"}, "python") nlp = spacy.load('#{lang}') +qs({lang: lang, load: "spacy"}, "python") nlp = spacy.load('#{lang}')
+qs({lang: lang, config: "example"}, "python") doc = nlp(u"#{examples[lang]}") +qs({lang: lang, config: "example"}, "python") doc = nlp(u"#{EXAMPLE_SENTENCES[lang]}")
+qs({lang: lang, config: "example"}, "python") print([(w.text, w.pos_) for w in doc]) +qs({lang: lang, config: "example"}, "python") print([(w.text, w.pos_) for w in doc])
+h(2, "available") Available models +h(2, "available") Available models

View File

@ -11,7 +11,7 @@ include _includes/_mixins
h2.c-landing__title.o-block.u-heading-1 h2.c-landing__title.o-block.u-heading-1
| in Python | in Python
+landing-badge("https://survey.spacy.io", "usersurvey", "Take the user survey!") +landing-badge(gh("spaCy") + "/releases/tag/v2.0.0-alpha", "v2alpha", "Try spaCy v2.0.0 alpha!")
+grid.o-content +grid.o-content
+grid-col("third").o-card +grid-col("third").o-card