Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-05-17 12:09:29 +02:00
commit 877f83807f
23 changed files with 402 additions and 25 deletions

View File

@ -13,3 +13,4 @@ requests>=2.13.0,<3.0.0
regex==2017.4.5
ftfy>=4.4.2,<5.0.0
pytest>=3.0.6,<4.0.0
pip>=9.0.0,<10.0.0

View File

@ -193,6 +193,7 @@ def setup_package():
'preshed>=1.0.0,<2.0.0',
'thinc>=6.6.0,<6.7.0',
'plac<1.0.0,>=0.9.6',
'pip>=9.0.0,<10.0.0',
'six',
'pathlib',
'ujson>=1.35',

View File

@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
@ -23,6 +24,7 @@ class German(Language):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):

View File

@ -0,0 +1,38 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
# this iterator extracts spans headed by NOUNs starting from the left-most
# syntactic dependent until the NOUN itself for close apposition and
# measurement construction, the span is sometimes extended to the right of
# the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
# and not just "eine Tasse", same for "das Thema Familie".
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
doc = obj.doc # Ensure works on both Doc and Span.
np_label = doc.vocab.strings['NP']
np_deps = set(doc.vocab.strings[label] for label in labels)
close_app = doc.vocab.strings['nk']
rbracket = 0
for i, word in enumerate(obj):
if i < rbracket:
continue
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
rbracket = word.i+1
# try to extend the span to the right
# to capture close apposition/measurement constructions
for rdep in doc[word.i].rights:
if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
rbracket = rdep.i+1
yield word.left_edge.i, rbracket, np_label
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}

View File

@ -7,6 +7,7 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
@ -29,6 +30,7 @@ class English(Language):
lemma_rules = dict(LEMMA_RULES)
lemma_index = dict(LEMMA_INDEX)
lemma_exc = dict(LEMMA_EXC)
sytax_iterators = dict(SYNTAX_ITERATORS)
__all__ = ['English']

View File

@ -0,0 +1,43 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings['conj']
np_label = doc.vocab.strings['NP']
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.i+1))
yield word.left_edge.i, word.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.i+1))
yield word.left_edge.i, word.i+1, np_label
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}

View File

@ -29,7 +29,7 @@
"NAVIGATION": {
"Home": "/",
"Usage": "/docs/usage",
"API": "/docs/api",
"Reference": "/docs/api",
"Demos": "/docs/usage/showcase",
"Blog": "https://explosion.ai/blog"
},
@ -55,6 +55,31 @@
}
},
"QUICKSTART": [
{ "id": "os", "title": "Operating system", "options": [
{ "id": "mac", "title": "macOS / OSX", "checked": true },
{ "id": "windows", "title": "Windows" },
{ "id": "linux", "title": "Linux" }]
},
{ "id": "package", "title": "Package manager", "options": [
{ "id": "pip", "title": "pip", "checked": true },
{ "id": "conda", "title": "conda" },
{ "id": "source", "title": "from source" }]
},
{ "id": "python", "title": "Python version", "options": [
{ "id": 2, "title": "2.x" },
{ "id": 3, "title": "3.x", "checked": true }]
},
{ "id": "config", "title": "Configuration", "multiple": true, "options": [
{"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }]
},
{ "id": "model", "title": "Models", "multiple": true, "options": [
{ "id": "en", "title": "English", "meta": "50MB" },
{ "id": "de", "title": "German", "meta": "645MB" },
{ "id": "fr", "title": "French", "meta": "1.33GB" }]
}
],
"V_CSS": "1.6",
"V_JS": "1.2",
"DEFAULT_SYNTAX": "python",

View File

@ -86,6 +86,44 @@ mixin permalink(id)
block
//- Quickstart widget
quickstart.js with manual markup, inspired by PyTorch's "Getting started"
groups - [object] option groups, uses global variable QUICKSTART
headline - [string] optional text to be rendered as widget headline
mixin quickstart(groups, headline)
.c-quickstart.o-block#qs
.c-quickstart__content
if headline
+h(2)=headline
for group in groups
.c-quickstart__group.u-text-small(data-qs-group=group.id)
.c-quickstart__legend=group.title
.c-quickstart__fields
for option in group.options
input.c-quickstart__input(class="c-quickstart__input--" + (group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id=option.id value=option.id checked=option.checked)
label.c-quickstart__label(for=option.id)=option.title
if option.meta
| #[span.c-quickstart__label__meta (#{option.meta})]
if option.help
| #[+help(option.help).c-quickstart__label__meta]
pre.c-code-block
code.c-code-block__content.c-quickstart__code(data-qs-results="")
block
//- Quickstart code item
data [object] - Rendering conditions (keyed by option group ID, value: option)
mixin qs(data)
- args = {}
for value, setting in data
- args['data-qs-' + setting] = value
span.c-quickstart__line&attributes(args)
block
//- Terminal-style code window
label - [string] title displayed in top bar of terminal window

View File

@ -47,6 +47,14 @@ mixin api(path)
| #[+icon("book", 18).o-icon--inline.u-color-subtle]
//- Help icon with tooltip
tooltip - [string] Tooltip text
mixin help(tooltip)
span(data-tooltip=tooltip)&attributes(attributes)
+icon("help", 16).i-icon--inline
//- Aside for text
label - [string] aside title (optional)

View File

@ -1,9 +1,13 @@
//- 💫 INCLUDES > SCRIPTS
script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript")
script(src="/assets/js/prism.js", type="text/javascript")
script(src="/assets/js/main.js?v#{V_JS}")
script(src="/assets/js/prism.js")
if SECTION == "docs"
if quickstart
script(src="/assets/js/quickstart.js")
script var qs = new Quickstart("#qs")
script.
((window.gitter = {}).chat = {}).options = {
useStyles: false,

View File

@ -0,0 +1,83 @@
//- 💫 CSS > COMPONENTS > QUICKSTART
.c-quickstart
border: 1px solid $color-subtle
border-radius: 2px
display: none
background: $color-subtle-light
.c-quickstart__content
padding: 2rem 3rem
.c-quickstart__input
display: none
.c-quickstart__label
cursor: pointer
background: $color-back
border: 1px solid $color-subtle
border-radius: 2px
display: inline-block
padding: 0.75rem 1.25rem
margin: 0 0.5rem 0.5rem 0
font-weight: bold
&:hover
background: lighten($color-theme-light, 5)
.c-quickstart__input--radio:checked + &
color: $color-back
border-color: $color-theme
background: $color-theme
.c-quickstart__input--check + &:before
content: ""
background: $color-back
display: inline-block
width: 20px
height: 20px
position: relative
bottom: 0.2rem
border: 1px solid $color-subtle
vertical-align: middle
margin-right: 1rem
cursor: pointer
border-radius: 50%
.c-quickstart__input--check:checked + &:before
background: $color-theme url()
background-size: contain
border-color: $color-theme
.c-quickstart__label__meta
font-weight: normal
color: $color-subtle-dark
.c-quickstart__group
@include breakpoint(min, md)
display: flex
flex-flow: row nowrap
&:not(:last-child)
margin-bottom: 1rem
.c-quickstart__fields
flex: 100%
.c-quickstart__legend
color: $color-subtle-dark
margin-right: 2rem
padding-top: 0.75rem
flex: 1 1 35%
font-weight: bold
.c-quickstart__line
display: block
&:before
color: $color-theme
margin-right: 1em
content: "$"
.c-quickstart__code
font-size: 1.6rem

View File

@ -0,0 +1,30 @@
//- 💫 CSS > COMPONENTS > TOOLTIPS
[data-tooltip]
position: relative
@include breakpoint(min, sm)
&:before
@include position(absolute, top, left, 125%, 50%)
display: inline-block
content: attr(data-tooltip)
background: $color-front
border-radius: 2px
color: $color-back
font-family: inherit
font-size: 1.3rem
line-height: 1.25
opacity: 0
padding: 0.5em 0.75em
transform: translateX(-50%) translateY(-2px)
transition: opacity 0.1s ease-out, transform 0.1s ease-out
visibility: hidden
//white-space: nowrap
min-width: 200px
max-width: 300px
z-index: 200
&:hover:before
opacity: 1
transform: translateX(-50%) translateY(0)
visibility: visible

View File

@ -32,3 +32,5 @@ $theme: blue !default
@import _components/navigation
@import _components/sidebar
@import _components/tables
@import _components/quickstart
@import _components/tooltips

View File

@ -27,5 +27,8 @@
<symbol id="star" viewBox="0 0 24 24">
<path d="M12 17.25l-6.188 3.75 1.641-7.031-5.438-4.734 7.172-0.609 2.813-6.609 2.813 6.609 7.172 0.609-5.438 4.734 1.641 7.031z"></path>
</symbol>
<symbol id="help" viewBox="0 0 24 24">
<path d="M12 6c2.203 0 3.984 1.781 3.984 3.984 0 2.484-3 2.766-3 5.016h-1.969c0-3.234 3-3 3-5.016 0-1.078-0.938-1.969-2.016-1.969s-2.016 0.891-2.016 1.969h-1.969c0-2.203 1.781-3.984 3.984-3.984zM12 20.016c4.406 0 8.016-3.609 8.016-8.016s-3.609-8.016-8.016-8.016-8.016 3.609-8.016 8.016 3.609 8.016 8.016 8.016zM12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984zM11.016 18v-2.016h1.969v2.016h-1.969z"/>
</symbol>
</defs>
</svg>

Before

Width:  |  Height:  |  Size: 4.9 KiB

After

Width:  |  Height:  |  Size: 5.4 KiB

View File

@ -1,6 +1,4 @@
//- ----------------------------------
//- 💫 MAIN JAVASCRIPT
//- ----------------------------------
'use strict'

View File

@ -0,0 +1,7 @@
/**
* quickstart.js
*
* @author Ines Montani <ines@ines.io>
* @version 0.0.1
* @license MIT
*/'use strict';var _createClass=function(){function a(b,c){for(var e,d=0;d<c.length;d++)e=c[d],e.enumerable=e.enumerable||!1,e.configurable=!0,'value'in e&&(e.writable=!0),Object.defineProperty(b,e.key,e)}return function(b,c,d){return c&&a(b.prototype,c),d&&a(b,d),b}}();function _toConsumableArray(a){if(Array.isArray(a)){for(var b=0,c=Array(a.length);b<a.length;b++)c[b]=a[b];return c}return Array.from(a)}function _classCallCheck(a,b){if(!(a instanceof b))throw new TypeError('Cannot call a class as a function')}var Quickstart=function(){function a(){var b=0<arguments.length&&void 0!==arguments[0]?arguments[0]:'#quickstart',d=arguments[1],c=2<arguments.length&&void 0!==arguments[2]?arguments[2]:{};_classCallCheck(this,a),this.container='string'==typeof b?this._$(b):b,this.groups=d,this.pfx=c.prefix||'qs',this.dpfx='data-'+this.pfx,this.init=this.init.bind(this),c.noInit||document.addEventListener('DOMContentLoaded',this.init)}return _createClass(a,[{key:'init',value:function init(){this.updateContainer(),this.container.style.display='block',this.container.classList.add(''+this.pfx);var b=this.groups;b instanceof Array?b.reverse().forEach(this.createGroup.bind(this)):this._$$('['+this.dpfx+'-group]').forEach(this.updateGroup.bind(this))}},{key:'initGroup',value:function initGroup(b,c){b.addEventListener('change',this.update.bind(this)),b.dispatchEvent(new CustomEvent('change',{detail:c}))}},{key:'updateGroup',value:function updateGroup(b){var c=b.getAttribute(this.dpfx+'-group'),d=this.createStyles(c);b.insertBefore(d,b.firstChild),this.initGroup(b,c)}},{key:'update',value:function update(b){var f=this,c=b.detail||b.target.name,d=this._$$('[name='+c+']').filter(function(h){return h.checked}).map(function(h){return h.value}),e=d.map(function(h){return':not(['+f.dpfx+'-'+c+'="'+h+'"])'}).join(''),g='['+this.dpfx+'-results]>['+this.dpfx+'-'+c+']'+e+' {display: none}';this._$('['+this.dpfx+'-style="'+c+'"]').textContent=g}},{key:'updateContainer',value:function updateContainer(){if(!this._$('['+this.dpfx+'-results]')){var b=this.childNodes(this.container,'pre'),c=b?b[0]:this._c('pre',this.pfx+'-code'),d=this.childNodes(c,'code')||this.childNodes(this.container,'code'),e=d?d[0]:this._c('code',this.pfx+'-results');e.setAttribute(this.dpfx+'-results','');var f=this.childNodes(e,'span')||this.childNodes(c,'span')||this.childNodes(this.container,'span');f&&f.forEach(function(g){return e.appendChild(g)}),c.appendChild(e),this.container.appendChild(c)}}},{key:'createGroup',value:function createGroup(b){var d=this,c=this._c('div',this.pfx+'-group');c.setAttribute(this.dpfx+'-group',b.id),c.innerHTML=this.createStyles(b.id).outerHTML,c.innerHTML+='<div class="'+this.pfx+'-legend">'+b.title+'</div>',c.innerHTML+='<div class="'+this.pfx+'-fields">'+b.options.map(function(e){var f=b.multiple?'checkbox':'radio';return'<input class="'+d.pfx+'-input '+d.pfx+'-input--'+f+'" type="'+f+'" name="'+b.id+'" id="'+e.id+'" value="'+e.id+'" '+(e.checked?'checked':'')+' /><label class="'+d.pfx+'-label" for="'+e.id+'">'+e.title+'</label>'}).join('')+'</div>',this.container.insertBefore(c,this.container.firstChild),this.initGroup(c,b.id)}},{key:'createStyles',value:function createStyles(b){var c=this._c('style');return c.setAttribute(this.dpfx+'-style',b),c.textContent='['+this.dpfx+'-results]>['+this.dpfx+'-'+b+'] {display: none}',c}},{key:'childNodes',value:function childNodes(b,c){var d=c.toUpperCase();if(!b.hasChildNodes)return!1;var e=[].concat(_toConsumableArray(b.childNodes)).filter(function(f){return f.nodeName===d});return!!e.length&&e}},{key:'_$',value:function _$(b){return document.querySelector(b)}},{key:'_$$',value:function _$$(b){return[].concat(_toConsumableArray(document.querySelectorAll(b)))}},{key:'_c',value:function _c(b,c){var d=document.createElement(b);return c&&(d.className=c),d}}]),a}();

View File

@ -21,6 +21,7 @@
"GoldParse": "goldparse"
},
"Other": {
"Command line": "cli",
"displaCy": "displacy",
"Utility Functions": "util",
"Annotation Specs": "annotation",
@ -112,6 +113,11 @@
"tag": "class"
},
"cli": {
"title": "Command Line Interface",
"next": "displacy"
},
"displacy": {
"title": "displaCy",
"tag": "module"

View File

@ -5,10 +5,11 @@
"Models": "models",
"Lightning tour": "lightning-tour",
"Visualizers": "visualizers",
"Command line": "cli",
"Troubleshooting": "troubleshooting"
"Troubleshooting": "troubleshooting",
"What's new in v2.0": "v2"
},
"Workflows": {
"spaCy 101": "spacy-101",
"Loading the pipeline": "language-processing-pipeline",
"Processing text": "processing-text",
"spaCy's data model": "data-model",
@ -33,7 +34,12 @@
"index": {
"title": "Install spaCy",
"next": "models"
"next": "models",
"quickstart": true
},
"v2": {
"title": "What's new in v2.0"
},
"models": {
@ -43,17 +49,11 @@
"lightning-tour": {
"title": "Lightning tour",
"next": "visualizers"
"next": "spacy-101"
},
"visualizers": {
"title": "Visualizers",
"next": "cli"
},
"cli": {
"title": "Command Line Interface",
"next": "troubleshooting"
"title": "Visualizers"
},
"troubleshooting": {
@ -65,6 +65,10 @@
"title": "Resources"
},
"spacy-101": {
"title": "spaCy 101"
},
"language-processing-pipeline": {
"title": "Loading a language processing pipeline",
"next": "processing-text"

View File

@ -12,6 +12,39 @@ p
| #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X]
| and #[a(href="#source-windows") Windows] for details.
+quickstart(QUICKSTART, "Quickstart")
+qs({config: 'venv', python: 2}) python -m pip install -U virtualenv
+qs({config: 'venv', python: 3}) python -m pip install -U venv
+qs({config: 'venv', python: 2}) virtualenv .env
+qs({config: 'venv', python: 3}) venv .env
+qs({config: 'venv', os: 'mac'}) source .env/bin/activate
+qs({config: 'venv', os: 'linux'}) source .env/bin/activate
+qs({config: 'venv', os: 'windows'}) .env\Scripts\activate
+qs({package: 'pip'}) pip install -U spacy
+qs({package: 'conda'}) conda config --add channels conda-forge
+qs({package: 'conda'}) conda install spacy
+qs({package: 'source'}) git clone https://github.com/explosion/spaCy
+qs({package: 'source'}) cd spaCy
+qs({package: 'source'}) pip install -r requirements.txt
+qs({package: 'source'}) pip install -e .
+qs({model: 'en'}) python -m spacy download en
+qs({model: 'de'}) python -m spacy download de
+qs({model: 'fr'}) python -m spacy download fr
+h(2, "installation") Installation instructions
+h(3, "pip") pip
+badge("pipy")
p Using pip, spaCy releases are currently only available as source packages.
+code(false, "bash").
pip install -U spacy
+aside("Download models")
| After installation you need to download a language model. For more info
| and available models, see the #[+a("/docs/usage/models") docs on models].
@ -22,14 +55,6 @@ p
&gt;&gt;&gt; import spacy
&gt;&gt;&gt; nlp = spacy.load('en')
+h(2, "pip") pip
+badge("pipy")
p Using pip, spaCy releases are currently only available as source packages.
+code(false, "bash").
pip install -U spacy
p
| When using pip it is generally recommended to install packages in a
| #[code virtualenv] to avoid modifying system state:
@ -39,7 +64,7 @@ p
source .env/bin/activate
pip install spacy
+h(2, "conda") conda
+h(3, "conda") conda
+badge("conda")
p

View File

@ -0,0 +1,10 @@
//- 💫 DOCS > USAGE > SPACY 101
include ../../_includes/_mixins
+h(2, "architecture") Architecture
+image
include ../../assets/img/docs/architecture.svg
.u-text-right
+button("/assets/img/docs/architecture.svg", false, "secondary").u-text-tag View large graphic

View File

@ -0,0 +1,10 @@
//- 💫 DOCS > USAGE > WHAT'S NEW IN V2.0
include ../../_includes/_mixins
+h(2, "features") New features
+h(2, "incompat") Backwards incompatibilities
+h(2, "migrating") Migrating from spaCy 1.x

View File

@ -153,6 +153,24 @@ p
| #[+a("https://tympanus.net/codrops/css_reference/background/") valid background value]
| or shorthand — including gradients and even images!
+h(3, "ent-titles") Adding titles to documents
p
| Rendering several large documents on one page can easily become confusing.
| To add a headline to each visualization, you can add a #[code title] to
| its #[code user_data]. User data is never touched or modified by spaCy.
+code.
doc = nlp(u'This is a sentence about Google.')
doc.user_data['title'] = 'This is a title'
displacy.serve(doc, style='ent')
p
| This feature is espeically handy if you're using displaCy to compare
| performance at different stages of a process, e.g. during training. Here
| you could use the title for a brief description of the text example and
| the number of iterations.
+h(2, "render") Rendering visualizations
p
@ -232,6 +250,25 @@ p
+h(2, "examples") Usage examples
+h(3, "examples-export-svg") Export SVG graphics of dependency parses
+code("Example").
import spacy
from spacy import displacy
from pathlib import Path
nlp = spacy.load('en')
sentences = ["This is an example.", "This is another one."]
for sent in sentences:
doc = nlp(sentence)
svg = displacy.render(doc, style='dep')
file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
output_path = Path('/images/' + file_name)
output_path.open('w', encoding='utf-8').write(svg)
p
| The above code will generate the dependency visualizations and them to
| two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
+h(2, "manual-usage") Rendering data manually