-1,o=n&&!i,u=!n&&i;o&&(t.current.push(e),t.emit("enter",e)),u&&(t.current.splice(r,1),t.emit("exit",e))}),this}},{key:"on",value:function(t,e){return this.handlers[t].push(e),this}},{key:"once",value:function(t,e){return this.singles[t].unshift(e),this}},{key:"emit",value:function(t,e){for(;this.singles[t].length;)this.singles[t].pop()(e);for(var n=this.handlers[t].length;--n>-1;)this.handlers[t][n](e);return this}}]),t}();e["default"]=function(t,e){return new i(t,e)}},function(t,e){"use strict";function n(t,e){var n=t.getBoundingClientRect(),r=n.top,i=n.right,o=n.bottom,u=n.left,f=n.width,s=n.height,c={t:o,r:window.innerWidth-u,b:window.innerHeight-r,l:i},a={x:e.threshold*f,y:e.threshold*s};return c.t>e.offset.top+a.y&&c.r>e.offset.right+a.x&&c.b>e.offset.bottom+a.y&&c.l>e.offset.left+a.x}Object.defineProperty(e,"__esModule",{value:!0}),e.inViewport=n},function(t,e){(function(e){var n="object"==typeof e&&e&&e.Object===Object&&e;t.exports=n}).call(e,function(){return this}())},function(t,e,n){var r=n(5),i="object"==typeof self&&self&&self.Object===Object&&self,o=r||i||Function("return this")();t.exports=o},function(t,e,n){function r(t,e,n){function r(e){var n=x,r=m;return x=m=void 0,E=e,w=t.apply(r,n)}function a(t){return E=t,j=setTimeout(h,e),M?r(t):w}function l(t){var n=t-O,r=t-E,i=e-n;return _?c(i,g-r):i}function d(t){var n=t-O,r=t-E;return void 0===O||n>=e||n<0||_&&r>=g}function h(){var t=o();return d(t)?p(t):void(j=setTimeout(h,l(t)))}function p(t){return j=void 0,T&&x?r(t):(x=m=void 0,w)}function v(){void 0!==j&&clearTimeout(j),E=0,x=O=m=j=void 0}function y(){return void 0===j?w:p(o())}function b(){var t=o(),n=d(t);if(x=arguments,m=this,O=t,n){if(void 0===j)return a(O);if(_)return j=setTimeout(h,e),r(O)}return void 0===j&&(j=setTimeout(h,e)),w}var x,m,g,w,j,O,E=0,M=!1,_=!1,T=!0;if("function"!=typeof t)throw new TypeError(f);return e=u(e)||0,i(n)&&(M=!!n.leading,_="maxWait"in n,g=_?s(u(n.maxWait)||0,e):g,T="trailing"in n?!!n.trailing:T),b.cancel=v,b.flush=y,b}var i=n(1),o=n(8),u=n(10),f="Expected a function",s=Math.max,c=Math.min;t.exports=r},function(t,e,n){var r=n(6),i=function(){return r.Date.now()};t.exports=i},function(t,e,n){function r(t,e,n){var r=!0,f=!0;if("function"!=typeof t)throw new TypeError(u);return o(n)&&(r="leading"in n?!!n.leading:r,f="trailing"in n?!!n.trailing:f),i(t,e,{leading:r,maxWait:e,trailing:f})}var i=n(7),o=n(1),u="Expected a function";t.exports=r},function(t,e){function n(t){return t}t.exports=n}])});
diff --git a/website/assets/js/main.js b/website/assets/js/main.js
index 616fbb1df..42199538f 100644
--- a/website/assets/js/main.js
+++ b/website/assets/js/main.js
@@ -1,23 +1,324 @@
//- 💫 MAIN JAVASCRIPT
+//- Note: Will be compiled using Babel before deployment.
'use strict'
-{
- const nav = document.querySelector('.js-nav')
- const fixedClass = 'is-fixed'
- let vh, scrollY = 0, scrollUp = false
+const $ = document.querySelector.bind(document);
+const $$ = document.querySelectorAll.bind(document);
- const updateVh = () => Math.max(document.documentElement.clientHeight, window.innerHeight || 0)
- const updateNav = () => {
- const vh = updateVh()
- const newScrollY = (window.pageYOffset || document.scrollTop) - (document.clientTop || 0)
- if (newScrollY != scrollY) scrollUp = newScrollY <= scrollY
- scrollY = newScrollY
-
- if(scrollUp && !(isNaN(scrollY) || scrollY <= vh)) nav.classList.add(fixedClass)
- else if (!scrollUp || (isNaN(scrollY) || scrollY <= vh/2)) nav.classList.remove(fixedClass)
+class ProgressBar {
+ /**
+ * Animated reading progress bar.
+     * @param {String} selector - CSS selector of progress bar element.
+ */
+ constructor(selector) {
+ this.el = $(selector);
+ this.scrollY = 0;
+ this.sizes = this.updateSizes();
+ this.el.setAttribute('max', 100);
+ this.init();
}
- window.addEventListener('scroll', () => requestAnimationFrame(updateNav))
+ init() {
+ window.addEventListener('scroll', () => {
+ this.scrollY = (window.pageYOffset || document.scrollTop) - (document.clientTop || 0);
+ requestAnimationFrame(this.update.bind(this));
+ }, false);
+ window.addEventListener('resize', () => {
+ this.sizes = this.updateSizes();
+ requestAnimationFrame(this.update.bind(this));
+ })
+ }
+
+ update() {
+ const offset = 100 - ((this.sizes.height - this.scrollY - this.sizes.vh) / this.sizes.height * 100);
+ this.el.setAttribute('value', (this.scrollY == 0) ? 0 : offset || 0);
+ }
+
+ updateSizes() {
+ const body = document.body;
+ const html = document.documentElement;
+ return {
+ height: Math.max(body.scrollHeight, body.offsetHeight, html.clientHeight, html.scrollHeight, html.offsetHeight),
+ vh: Math.max(html.clientHeight, window.innerHeight || 0)
+ }
+ }
+}
+
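+// A minimal usage sketch (illustrative only; the '.js-progress' selector is an
+// assumption, not necessarily the one used in the templates). The matched
+// element is expected to be a <progress> element, since the class drives its
+// "max" and "value" attributes from the scroll position:
+//
+//     const readingProgress = new ProgressBar('.js-progress');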
+
+class SectionHighlighter {
+ /**
+     * Highlight the section in the viewport in the sidebar, using the in-view library.
+ * @param {String} sectionAttr - Data attribute of sections.
+ * @param {String} navAttr - Data attribute of navigation items.
+     * @param {String} activeClass - Class name of active element.
+ */
+ constructor(sectionAttr, navAttr, activeClass = 'is-active') {
+ this.sections = [...$$(`[${navAttr}]`)];
+ this.navAttr = navAttr;
+ this.sectionAttr = sectionAttr;
+ this.activeClass = activeClass;
+ inView(`[${sectionAttr}]`).on('enter', this.highlightSection.bind(this));
+ }
+
+ highlightSection(section) {
+ const id = section.getAttribute(this.sectionAttr);
+ const el = $(`[${this.navAttr}="${id}"]`);
+ if (el) {
+ this.sections.forEach(el => el.classList.remove(this.activeClass));
+ el.classList.add(this.activeClass);
+ }
+ }
+}
+
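+// A minimal usage sketch (illustrative only; the attribute names below are
+// assumptions). Content sections would carry e.g. data-section="..." and the
+// matching sidebar links data-nav="...", so the nav item for the section
+// currently in view receives the active class:
+//
+//     const highlighter = new SectionHighlighter('data-section', 'data-nav');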
+
+class Templater {
+ /**
+     * Mini templating engine based on data attributes. Selects elements via
+     * their data-tpl and data-tpl-key attributes and can set their
+     * textContent and innerHTML.
+ *
+ * @param {String} templateId - Template section, e.g. value of data-tpl.
+ */
+ constructor(templateId) {
+ this.templateId = templateId;
+ }
+
+ get(key) {
+ return $(`[data-tpl="${this.templateId}"][data-tpl-key="${key}"]`);
+ }
+
+ fill(key, value, html = false) {
+ const el = this.get(key);
+ if (html) el.innerHTML = value || '';
+ else el.textContent = value || '';
+ return el;
+ }
+}
+
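+// A minimal usage sketch (illustrative markup and values). Given an element
+// like <span data-tpl="en_core_web_sm" data-tpl-key="version"></span>, values
+// can be written as plain text, or as HTML by passing true as third argument:
+//
+//     const tpl = new Templater('en_core_web_sm');
+//     tpl.fill('version', '2.0.0');
+//     tpl.fill('license', '<a href="https://example.com/license">MIT</a>', true);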
+
+class ModelLoader {
+ /**
+     * Load model meta from GitHub and update model details on the site. Uses
+     * the Templater mini templating engine to update the DOM.
+     *
+     * @param {String} repo - Path to GitHub repository containing releases.
+ * @param {Array} models - List of model IDs, e.g. "en_core_web_sm".
+ * @param {Object} licenses - License IDs mapped to URLs.
+ * @param {Object} accKeys - Available accuracy keys mapped to display labels.
+ */
+ constructor(repo, models = [], licenses = {}, accKeys = {}) {
+ this.url = `https://raw.githubusercontent.com/${repo}/master`;
+ this.repo = `https://github.com/${repo}`;
+ this.modelIds = models;
+ this.licenses = licenses;
+ this.accKeys = accKeys;
+ this.chartColor = '#09a3d5';
+ this.chartOptions = {
+ type: 'bar',
+ options: { responsive: true, scales: {
+ yAxes: [{ label: 'Accuracy', ticks: { suggestedMin: 70 }}],
+ xAxes: [{ barPercentage: 0.425 }]
+ }}
+ }
+ Chart.defaults.global.legend.position = 'bottom';
+ Chart.defaults.global.defaultFontFamily = "-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'";
+ this.init();
+ }
+
+ init() {
+ this.modelIds.forEach(modelId =>
+ new Templater(modelId).get('table').setAttribute('data-loading', ''));
+ fetch(`${this.url}/compatibility.json`)
+ .then(res => this.handleResponse(res))
+ .then(json => json.ok ? this.getModels(json['spacy']) : this.modelIds.forEach(modelId => this.showError(modelId)))
+ }
+
+ handleResponse(res) {
+ if (res.ok) return res.json().then(json => Object.assign({}, json, { ok: res.ok }))
+ else return ({ ok: res.ok })
+ }
+
+ getModels(compat) {
+ this.compat = compat;
+ for (let modelId of this.modelIds) {
+ const version = this.getLatestVersion(modelId, compat);
+ if (!version) {
+ this.showError(modelId); return;
+ }
+ fetch(`${this.url}/meta/${modelId}-${version}.json`)
+ .then(res => this.handleResponse(res))
+ .then(json => json.ok ? this.render(json) : this.showError(modelId))
+ }
+ // make sure scroll positions for progress bar etc. are recalculated
+ window.dispatchEvent(new Event('resize'));
+ }
+
+ showError(modelId) {
+ const template = new Templater(modelId);
+ template.get('table').removeAttribute('data-loading');
+ template.get('error').style.display = 'block';
+ for (let key of ['sources', 'pipeline', 'author', 'license']) {
+ template.get(key).parentElement.parentElement.style.display = 'none';
+ }
+ }
+
+ /**
+ * Update model details in tables. Currently quite hacky :(
+ */
+ render({ lang, name, version, sources, pipeline, url, author, license, accuracy, size, description, notes }) {
+ const modelId = `${lang}_${name}`;
+ const model = `${modelId}-${version}`;
+ const template = new Templater(modelId);
+
+        const getSources = s => (s instanceof Array) ? s.join(', ') : s;
+        const getPipeline = p => p.map(comp => `<code>${comp}</code>`).join(', ');
+        const getLink = (t, l) => `<a href="${l}">${t}</a>`;
+
+ const keys = { version, size, description, notes }
+ Object.keys(keys).forEach(key => template.fill(key, keys[key]));
+
+ if (sources) template.fill('sources', getSources(sources));
+ if (pipeline && pipeline.length) template.fill('pipeline', getPipeline(pipeline), true);
+ else template.get('pipeline').parentElement.parentElement.style.display = 'none';
+
+ if (author) template.fill('author', url ? getLink(author, url) : author, true);
+ if (license) template.fill('license', this.licenses[license] ? getLink(license, this.licenses[license]) : license, true);
+
+ template.get('download').setAttribute('href', `${this.repo}/releases/tag/${model}`);
+ if (accuracy) this.renderAccuracy(template, accuracy, modelId);
+ this.renderCompat(template, modelId);
+ template.get('table').removeAttribute('data-loading');
+ }
+
+ renderCompat(template, modelId) {
+ template.get('compat-wrapper').style.display = 'table-row';
+        const options = Object.keys(this.compat).map(v => `<option value="${v}">${v}</option>`).join('');
+ template
+            .fill('compat', '<select>' + options + '</select>', true)
+ .addEventListener('change', ev => {
+ const result = this.compat[ev.target.value][modelId];
+                if (result) template.fill('compat-versions', `<code>${modelId}-${result[0]}</code>`, true);
+ else template.fill('compat-versions', '');
+ });
+ }
+
+ renderAccuracy(template, accuracy, modelId, compare=false) {
+ template.get('accuracy-wrapper').style.display = 'block';
+ const metaKeys = Object.keys(this.accKeys).map(k => accuracy[k] ? k : false).filter(k => k);
+ for (let key of metaKeys) {
+ template.fill(key, accuracy[key].toFixed(2)).parentElement.style.display = 'table-row';
+ }
+
+ this.chartOptions.options.legend = { display: compare }
+ new Chart(`chart_${modelId}`, Object.assign({}, this.chartOptions, { data: {
+ datasets: [{
+ label: modelId,
+ data: metaKeys.map(key => accuracy[key].toFixed(2)),
+ backgroundColor: this.chartColor
+ }],
+ labels: metaKeys.map(key => this.accKeys[key])
+ }}))
+ }
+
+ getLatestVersion(model, compat = {}) {
+ for (let spacy_v of Object.keys(compat)) {
+ const models = compat[spacy_v];
+ if (models[model]) return models[model][0];
+ }
+ }
+}
+
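+// A minimal usage sketch (illustrative arguments; the repo path, license map
+// and accuracy keys are examples meant to document the expected shapes, not a
+// definitive configuration):
+//
+//     const loader = new ModelLoader('explosion/spacy-models',
+//         ['en_core_web_sm'],
+//         { 'CC BY-SA 3.0': 'https://creativecommons.org/licenses/by-sa/3.0/' },
+//         { uas: 'UAS', ents_f: 'NER F' });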
+
+class Changelog {
+ /**
+     * Fetch and render changelog from GitHub. Clones a template node (table row)
+     * to avoid duplicating templating markup in JavaScript.
+ *
+ * @param {String} user - GitHub username.
+ * @param {String} repo - Repository to fetch releases from.
+ */
+ constructor(user, repo) {
+ this.url = `https://api.github.com/repos/${user}/${repo}/releases`;
+ this.template = new Templater('changelog');
+ fetch(this.url)
+ .then(res => this.handleResponse(res))
+ .then(json => json.ok ? this.render(json) : false)
+ }
+
+ /**
+ * Get template section from template row. Slightly hacky, but does make sense.
+ */
+ $(item, id) {
+ return item.querySelector(`[data-changelog="${id}"]`);
+ }
+
+ handleResponse(res) {
+ if (res.ok) return res.json().then(json => Object.assign({}, json, { ok: res.ok }))
+ else return ({ ok: res.ok })
+ }
+
+ render(json) {
+ this.template.get('error').style.display = 'none';
+ this.template.get('table').style.display = 'block';
+ this.row = this.template.get('item');
+ this.releases = this.template.get('releases');
+ this.prereleases = this.template.get('prereleases');
+ Object.values(json)
+ .filter(release => release.name)
+ .forEach(release => this.renderRelease(release));
+ this.row.remove();
+ // make sure scroll positions for progress bar etc. are recalculated
+ window.dispatchEvent(new Event('resize'));
+ }
+
+ /**
+ * Clone the template row and populate with content from API response.
+ * https://developer.github.com/v3/repos/releases/#list-releases-for-a-repository
+ *
+ * @param {String} name - Release title.
+ * @param {String} tag (tag_name) - Release tag.
+ * @param {String} url (html_url) - URL to the release page on GitHub.
+ * @param {String} date (published_at) - Timestamp of release publication.
+ * @param {Boolean} pre (prerelease) - Whether the release is a prerelease.
+ */
+ renderRelease({ name, tag_name: tag, html_url: url, published_at: date, prerelease: pre }) {
+ const container = pre ? this.prereleases : this.releases;
+ const row = this.row.cloneNode(true);
+ this.$(row, 'date').textContent = date.split('T')[0];
+        this.$(row, 'tag').innerHTML = `<a href="${url}">${tag}</a>`;
+ this.$(row, 'title').textContent = (name.split(': ').length == 2) ? name.split(': ')[1] : name;
+ container.appendChild(row);
+ }
+}
+
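+// A minimal usage sketch (illustrative arguments): fetches the release list of
+// a GitHub repository and renders it into the markup tagged data-tpl="changelog":
+//
+//     const changelog = new Changelog('explosion', 'spaCy');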
+
+class GitHubEmbed {
+ /**
+     * Embed code from GitHub repositories, similar to Gist embeds. Fetches the
+     * raw text and places it inside the element.
+ *
+ * @param {String} user - GitHub user or organization.
+ * @param {String} attr - Data attribute used to select containers. Attribute
+ * value should be path to file relative to user.
+ */
+ constructor(user, attr) {
+ this.url = `https://raw.githubusercontent.com/${user}`;
+ this.attr = attr;
+ this.error = `\nCan't fetch code example from GitHub :(\n\nPlease use the link below to view the example. If you've come across\na broken link, we always appreciate a pull request to the repository,\nor a report on the issue tracker. Thanks!`;
+ [...$$(`[${this.attr}]`)].forEach(el => this.embed(el));
+ }
+
+ embed(el) {
+ el.parentElement.setAttribute('data-loading', '');
+ fetch(`${this.url}/${el.getAttribute(this.attr)}`)
+ .then(res => res.text().then(text => ({ text, ok: res.ok })))
+ .then(({ text, ok }) => {
+ el.textContent = ok ? text : this.error;
+ if (ok && window.Prism) Prism.highlightElement(el);
+ })
+ el.parentElement.removeAttribute('data-loading');
+ }
}
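+
+// A minimal usage sketch (illustrative; the attribute name and file path are
+// assumptions). An element like
+// <code data-gh-embed="spaCy/master/examples/information_extraction.py"></code>
+// would be filled with the raw file fetched from raw.githubusercontent.com:
+//
+//     const embeds = new GitHubEmbed('explosion', 'data-gh-embed');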
diff --git a/website/assets/js/prism.js b/website/assets/js/prism.min.js
similarity index 100%
rename from website/assets/js/prism.js
rename to website/assets/js/prism.min.js
diff --git a/website/assets/js/quickstart.js b/website/assets/js/quickstart.min.js
similarity index 100%
rename from website/assets/js/quickstart.js
rename to website/assets/js/quickstart.min.js
diff --git a/website/docs/_data.json b/website/docs/_data.json
deleted file mode 100644
index bc33ebc4c..000000000
--- a/website/docs/_data.json
+++ /dev/null
@@ -1,28 +0,0 @@
-{
- "index": {
- "title" : "Documentation",
-
- "sections": {
- "Usage": {
- "url": "/docs/usage",
- "svg": "computer",
- "description": "How to use spaCy and its features."
- },
- "API": {
- "url": "/docs/api",
- "svg": "brain",
- "description": "The detailed reference for spaCy's API."
- },
- "Tutorials": {
- "url": "/docs/usage/tutorials",
- "svg": "eye",
- "description": "End-to-end examples, with code you can modify and run."
- },
- "Showcase & Demos": {
- "url": "/docs/usage/showcase",
- "svg": "bubble",
- "description": "Demos, libraries and products from the spaCy community."
- }
- }
- }
-}
diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade
deleted file mode 100644
index ce18878b7..000000000
--- a/website/docs/api/annotation.jade
+++ /dev/null
@@ -1,156 +0,0 @@
-//- 💫 DOCS > API > ANNOTATION SPECS
-
-include ../../_includes/_mixins
-
-p This document describes the target annotations spaCy is trained to predict.
-
-+h(2, "tokenization") Tokenization
-
-p
- | Tokenization standards are based on the
- | #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus.
- | The tokenizer differs from most by including tokens for significant
- | whitespace. Any sequence of whitespace characters beyond a single space
- | (#[code ' ']) is included as a token.
-
-+aside-code("Example").
- from spacy.lang.en import English
- nlp = English()
- tokens = nlp('Some\nspaces and\ttab characters')
- tokens_text = [t.text for t in tokens]
- assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and',
- '\t', 'tab', 'characters']
-
-p
- | The whitespace tokens are useful for much the same reason punctuation is
-    | – it's often an important delimiter in the text. By preserving it in the
- | token output, we are able to maintain a simple alignment between the
- | tokens and the original string, and we ensure that no information is
- | lost during processing.
-
-+h(2, "sentence-boundary") Sentence boundary detection
-
-p
- | Sentence boundaries are calculated from the syntactic parse tree, so
- | features such as punctuation and capitalisation play an important but
- | non-decisive role in determining the sentence boundaries. Usually this
- | means that the sentence boundaries will at least coincide with clause
- | boundaries, even given poorly punctuated text.
-
-+h(2, "pos-tagging") Part-of-speech Tagging
-
-+aside("Tip: Understanding tags")
- | You can also use #[code spacy.explain()] to get the description for the
- | string representation of a tag. For example,
- | #[code spacy.explain("RB")] will return "adverb".
-
-include _annotation/_pos-tags
-
-+h(2, "lemmatization") Lemmatization
-
-p A "lemma" is the uninflected form of a word. In English, this means:
-
-+list
- +item #[strong Adjectives]: The form like "happy", not "happier" or "happiest"
- +item #[strong Adverbs]: The form like "badly", not "worse" or "worst"
- +item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children"
- +item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written"
-
-p
- | The lemmatization data is taken from
- | #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a
- | special case for pronouns: all pronouns are lemmatized to the special
- | token #[code -PRON-].
-
-+infobox("About spaCy's custom pronoun lemma")
- | Unlike verbs and common nouns, there's no clear base form of a personal
- | pronoun. Should the lemma of "me" be "I", or should we normalize person
-    | as well, giving "it" – or maybe "he"? spaCy's solution is to introduce a
- | novel symbol, #[code -PRON-], which is used as the lemma for
- | all personal pronouns.
-
-+h(2, "dependency-parsing") Syntactic Dependency Parsing
-
-+aside("Tip: Understanding labels")
- | You can also use #[code spacy.explain()] to get the description for the
- | string representation of a label. For example,
- | #[code spacy.explain("prt")] will return "particle".
-
-include _annotation/_dep-labels
-
-+h(2, "named-entities") Named Entity Recognition
-
-+aside("Tip: Understanding entity types")
- | You can also use #[code spacy.explain()] to get the description for the
- | string representation of an entity label. For example,
- | #[code spacy.explain("LANGUAGE")] will return "any named language".
-
-include _annotation/_named-entities
-
-+h(3, "biluo") BILUO Scheme
-
-p
- | spaCy translates character offsets into the BILUO scheme, in order to
- | decide the cost of each action given the current state of the entity
- | recognizer. The costs are then used to calculate the gradient of the
- | loss, to train the model.
-
-+aside("Why BILUO, not IOB?")
- | There are several coding schemes for encoding entity annotations as
- | token tags. These coding schemes are equally expressive, but not
- | necessarily equally learnable.
- | #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
- | showed that the minimal #[strong Begin], #[strong In], #[strong Out]
- | scheme was more difficult to learn than the #[strong BILUO] scheme that
- | we use, which explicitly marks boundary tokens.
-
-+table([ "Tag", "Description" ])
- +row
- +cell #[code #[span.u-color-theme B] EGIN]
- +cell The first token of a multi-token entity.
-
- +row
- +cell #[code #[span.u-color-theme I] N]
- +cell An inner token of a multi-token entity.
-
- +row
- +cell #[code #[span.u-color-theme L] AST]
- +cell The final token of a multi-token entity.
-
- +row
- +cell #[code #[span.u-color-theme U] NIT]
- +cell A single-token entity.
-
- +row
- +cell #[code #[span.u-color-theme O] UT]
- +cell A non-entity token.
-
-+h(2, "json-input") JSON input format for training
-
-p
- | spaCy takes training data in the following format:
-
-+code("Example structure").
- doc: {
- id: string,
- paragraphs: [{
- raw: string,
- sents: [int],
- tokens: [{
- start: int,
- tag: string,
- head: int,
- dep: string
- }],
- ner: [{
- start: int,
- end: int,
- label: string
- }],
- brackets: [{
- start: int,
- end: int,
- label: string
- }]
- }]
- }
diff --git a/website/docs/api/dependencyparser.jade b/website/docs/api/dependencyparser.jade
deleted file mode 100644
index a1a7e0b36..000000000
--- a/website/docs/api/dependencyparser.jade
+++ /dev/null
@@ -1,111 +0,0 @@
-//- 💫 DOCS > API > DEPENDENCYPARSER
-
-include ../../_includes/_mixins
-
-p Annotate syntactic dependencies on #[code Doc] objects.
-
-+under-construction
-
-+h(2, "init") DependencyParser.__init__
- +tag method
-
-p Create a #[code DependencyParser].
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code vocab]
- +cell #[code Vocab]
- +cell The vocabulary. Must be shared with documents to be processed.
-
- +row
- +cell #[code model]
- +cell #[thinc.linear.AveragedPerceptron]
- +cell The statistical model.
-
- +footrow
- +cell returns
- +cell #[code DependencyParser]
- +cell The newly constructed object.
-
-+h(2, "call") DependencyParser.__call__
- +tag method
-
-p
- | Apply the dependency parser, setting the heads and dependency relations
- | onto the #[code Doc] object.
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code doc]
- +cell #[code Doc]
- +cell The document to be processed.
-
- +footrow
- +cell returns
- +cell #[code None]
- +cell -
-
-+h(2, "pipe") DependencyParser.pipe
- +tag method
-
-p Process a stream of documents.
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code stream]
- +cell -
- +cell The sequence of documents to process.
-
- +row
- +cell #[code batch_size]
- +cell int
- +cell The number of documents to accumulate into a working set.
-
- +row
- +cell #[code n_threads]
- +cell int
- +cell
- | The number of threads with which to work on the buffer in
- | parallel.
-
- +footrow
- +cell yields
- +cell #[code Doc]
- +cell Documents, in order.
-
-+h(2, "update") DependencyParser.update
- +tag method
-
-p Update the statistical model.
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code doc]
- +cell #[code Doc]
- +cell The example document for the update.
-
- +row
- +cell #[code gold]
- +cell #[code GoldParse]
- +cell The gold-standard annotations, to calculate the loss.
-
- +footrow
- +cell returns
- +cell int
- +cell The loss on this example.
-
-+h(2, "step_through") DependencyParser.step_through
- +tag method
-
-p Set up a stepwise state, to introspect and control the transition sequence.
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code doc]
- +cell #[code Doc]
- +cell The document to step through.
-
- +footrow
- +cell returns
- +cell #[code StepwiseState]
- +cell A state object, to step through the annotation process.
diff --git a/website/docs/api/entityrecognizer.jade b/website/docs/api/entityrecognizer.jade
deleted file mode 100644
index e3775b7f4..000000000
--- a/website/docs/api/entityrecognizer.jade
+++ /dev/null
@@ -1,109 +0,0 @@
-//- 💫 DOCS > API > ENTITYRECOGNIZER
-
-include ../../_includes/_mixins
-
-p Annotate named entities on #[code Doc] objects.
-
-+under-construction
-
-+h(2, "init") EntityRecognizer.__init__
- +tag method
-
-p Create an #[code EntityRecognizer].
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code vocab]
- +cell #[code Vocab]
- +cell The vocabulary. Must be shared with documents to be processed.
-
- +row
- +cell #[code model]
- +cell #[thinc.linear.AveragedPerceptron]
- +cell The statistical model.
-
- +footrow
- +cell returns
- +cell #[code EntityRecognizer]
- +cell The newly constructed object.
-
-+h(2, "call") EntityRecognizer.__call__
- +tag method
-
-p Apply the entity recognizer, setting the NER tags onto the #[code Doc] object.
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code doc]
- +cell #[code Doc]
- +cell The document to be processed.
-
- +footrow
- +cell returns
- +cell #[code None]
- +cell -
-
-+h(2, "pipe") EntityRecognizer.pipe
- +tag method
-
-p Process a stream of documents.
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code stream]
- +cell -
- +cell The sequence of documents to process.
-
- +row
- +cell #[code batch_size]
- +cell int
- +cell The number of documents to accumulate into a working set.
-
- +row
- +cell #[code n_threads]
- +cell int
- +cell
- | The number of threads with which to work on the buffer in
- | parallel.
-
- +footrow
- +cell yields
- +cell #[code Doc]
- +cell Documents, in order.
-
-+h(2, "update") EntityRecognizer.update
- +tag method
-
-p Update the statistical model.
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code doc]
- +cell #[code Doc]
- +cell The example document for the update.
-
- +row
- +cell #[code gold]
- +cell #[code GoldParse]
- +cell The gold-standard annotations, to calculate the loss.
-
- +footrow
- +cell returns
- +cell int
- +cell The loss on this example.
-
-+h(2, "step_through") EntityRecognizer.step_through
- +tag method
-
-p Set up a stepwise state, to introspect and control the transition sequence.
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code doc]
- +cell #[code Doc]
- +cell The document to step through.
-
- +footrow
- +cell returns
- +cell #[code StepwiseState]
- +cell A state object, to step through the annotation process.
diff --git a/website/docs/api/index.jade b/website/docs/api/index.jade
deleted file mode 100644
index f92080975..000000000
--- a/website/docs/api/index.jade
+++ /dev/null
@@ -1,241 +0,0 @@
-//- 💫 DOCS > API > FACTS & FIGURES
-
-include ../../_includes/_mixins
-
-+under-construction
-
-+h(2, "comparison") Feature comparison
-
-p
- | Here's a quick comparison of the functionalities offered by spaCy,
- | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet],
- | #[+a("http://www.nltk.org/py-modindex.html") NLTK] and
- | #[+a("http://stanfordnlp.github.io/CoreNLP/") CoreNLP].
-
-+table([ "", "spaCy", "SyntaxNet", "NLTK", "CoreNLP"])
- +row
- +cell Easy installation
- each icon in [ "pro", "con", "pro", "pro" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell Python API
- each icon in [ "pro", "con", "pro", "con" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell Multi-language support
- each icon in [ "neutral", "pro", "pro", "pro" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell Tokenization
- each icon in [ "pro", "pro", "pro", "pro" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell Part-of-speech tagging
- each icon in [ "pro", "pro", "pro", "pro" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell Sentence segmentation
- each icon in [ "pro", "pro", "pro", "pro" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell Dependency parsing
- each icon in [ "pro", "pro", "con", "pro" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell Entity Recognition
- each icon in [ "pro", "con", "pro", "pro" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell Integrated word vectors
- each icon in [ "pro", "con", "con", "con" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell Sentiment analysis
- each icon in [ "pro", "con", "pro", "pro" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell Coreference resolution
- each icon in [ "con", "con", "con", "pro" ]
- +cell.u-text-center #[+procon(icon)]
-
-+h(2, "benchmarks") Benchmarks
-
-p
- | Two peer-reviewed papers in 2015 confirm that spaCy offers the
- | #[strong fastest syntactic parser in the world] and that
- | #[strong its accuracy is within 1% of the best] available. The few
- | systems that are more accurate are 20× slower or more.
-
-+aside("About the evaluation")
- | The first of the evaluations was published by #[strong Yahoo! Labs] and
- | #[strong Emory University], as part of a survey of current parsing
- | technologies #[+a("https://aclweb.org/anthology/P/P15/P15-1038.pdf") (Choi et al., 2015)].
- | Their results and subsequent discussions helped us develop a novel
- | psychologically-motivated technique to improve spaCy's accuracy, which
- | we published in joint work with Macquarie University
- | #[+a("https://aclweb.org/anthology/D/D15/D15-1162.pdf") (Honnibal and Johnson, 2015)].
-
-+table([ "System", "Language", "Accuracy", "Speed (wps)"])
- +row
- each data in [ "spaCy", "Cython", "91.8", "13,963" ]
- +cell #[strong=data]
- +row
- each data in [ "ClearNLP", "Java", "91.7", "10,271" ]
- +cell=data
-
- +row
- each data in [ "CoreNLP", "Java", "89.6", "8,602"]
- +cell=data
-
- +row
- each data in [ "MATE", "Java", "92.5", "550"]
- +cell=data
-
- +row
- each data in [ "Turbo", "C++", "92.4", "349" ]
- +cell=data
-
-+h(3, "parse-accuracy") Parse accuracy
-
-p
- | In 2016, Google released their
- | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet]
- | library, setting a new state of the art for syntactic dependency parsing
- | accuracy. SyntaxNet's algorithm is very similar to spaCy's. The main
- | difference is that SyntaxNet uses a neural network while spaCy uses a
- | sparse linear model.
-
-+aside("Methodology")
- | #[+a("http://arxiv.org/abs/1603.06042") Andor et al. (2016)] chose
- | slightly different experimental conditions from
- | #[+a("https://aclweb.org/anthology/P/P15/P15-1038.pdf") Choi et al. (2015)],
- | so the two accuracy tables here do not present directly comparable
- | figures. We have only evaluated spaCy in the "News" condition following
- | the SyntaxNet methodology. We don't yet have benchmark figures for the
- | "Web" and "Questions" conditions.
-
-+table([ "System", "News", "Web", "Questions" ])
- +row
- +cell spaCy
- each data in [ 92.8, "n/a", "n/a" ]
- +cell=data
-
- +row
- +cell #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") Parsey McParseface]
- each data in [ 94.15, 89.08, 94.77 ]
- +cell=data
-
- +row
- +cell #[+a("http://www.cs.cmu.edu/~ark/TurboParser/") Martins et al. (2013)]
- each data in [ 93.10, 88.23, 94.21 ]
- +cell=data
-
- +row
- +cell #[+a("http://research.google.com/pubs/archive/38148.pdf") Zhang and McDonald (2014)]
- each data in [ 93.32, 88.65, 93.37 ]
- +cell=data
-
- +row
- +cell #[+a("http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43800.pdf") Weiss et al. (2015)]
- each data in [ 93.91, 89.29, 94.17 ]
- +cell=data
-
- +row
- +cell #[strong #[+a("http://arxiv.org/abs/1603.06042") Andor et al. (2016)]]
- each data in [ 94.44, 90.17, 95.40 ]
- +cell #[strong=data]
-
-+h(3, "speed-comparison") Detailed speed comparison
-
-p
- | Here we compare the per-document processing time of various spaCy
- | functionalities against other NLP libraries. We show both absolute
- | timings (in ms) and relative performance (normalized to spaCy). Lower is
- | better.
-
-+aside("Methodology")
- | #[strong Set up:] 100,000 plain-text documents were streamed from an
- | SQLite3 database, and processed with an NLP library, to one of three
-    | levels of detail – tokenization, tagging, or parsing. The tasks are
- | additive: to parse the text you have to tokenize and tag it. The
-    | pre-processing was not subtracted from the times – I report the time
- | required for the pipeline to complete. I report mean times per document,
- | in milliseconds.#[br]#[br]
- | #[strong Hardware]: Intel i7-3770 (2012)#[br]
- | #[strong Implementation]: #[+src(gh("spacy-benchmarks")) spacy-benchmarks]
-
-+table
- +row.u-text-label.u-text-center
- th.c-table__head-cell
- th.c-table__head-cell(colspan="3") Absolute (ms per doc)
- th.c-table__head-cell(colspan="3") Relative (to spaCy)
-
- +row
- each column in ["System", "Tokenize", "Tag", "Parse", "Tokenize", "Tag", "Parse"]
- th.c-table__head-cell.u-text-label=column
-
- +row
- +cell #[strong spaCy]
- each data in [ "0.2ms", "1ms", "19ms"]
- +cell #[strong=data]
-
- each data in [ "1x", "1x", "1x" ]
- +cell=data
-
- +row
- each data in [ "CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x"]
- +cell=data
- +row
- each data in [ "ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x" ]
- +cell=data
- +row
- each data in [ "NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a" ]
- +cell=data
-
-+h(3, "ner") Named entity comparison
-
-p
- | #[+a("https://aclweb.org/anthology/W/W16/W16-2703.pdf") Jiang et al. (2016)]
- | present several detailed comparisons of the named entity recognition
- | models provided by spaCy, CoreNLP, NLTK and LingPipe. Here we show their
- | evaluation of person, location and organization accuracy on Wikipedia.
-
-+aside("Methodology")
- | Making a meaningful comparison of different named entity recognition
- | systems is tricky. Systems are often trained on different data, which
- | usually have slight differences in annotation style. For instance, some
- | corpora include titles as part of person names, while others don't.
- | These trivial differences in convention can distort comparisons
- | significantly. Jiang et al.'s #[em partial overlap] metric goes a long
- | way to solving this problem.
-
-+table([ "System", "Precision", "Recall", "F-measure" ])
- +row
- +cell spaCy
- each data in [ 0.7240, 0.6514, 0.6858 ]
- +cell=data
-
- +row
- +cell #[strong CoreNLP]
- each data in [ 0.7914, 0.7327, 0.7609 ]
- +cell #[strong=data]
-
- +row
- +cell NLTK
- each data in [ 0.5136, 0.6532, 0.5750 ]
- +cell=data
-
- +row
- +cell LingPipe
- each data in [ 0.5412, 0.5357, 0.5384 ]
- +cell=data
diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade
deleted file mode 100644
index c6943b410..000000000
--- a/website/docs/api/language-models.jade
+++ /dev/null
@@ -1,93 +0,0 @@
-//- 💫 DOCS > API > LANGUAGE MODELS
-
-include ../../_includes/_mixins
-
-p
- | spaCy currently provides models for the following languages and
- | capabilities:
-
-
-+aside-code("Download language models", "bash").
- spacy download en
- spacy download de
- spacy download fr
-
-+table([ "Language", "Token", "SBD", "Lemma", "POS", "NER", "Dep", "Vector", "Sentiment"])
- +row
- +cell English #[code en]
- each icon in [ "pro", "pro", "pro", "pro", "pro", "pro", "pro", "con" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell German #[code de]
- each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell French #[code fr]
- each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ]
- +cell.u-text-center #[+procon(icon)]
-
- +row
- +cell Spanish #[code es]
- each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ]
- +cell.u-text-center #[+procon(icon)]
-
-p
- +button("/docs/usage/models", true, "primary") See available models
-
-+h(2, "alpha-support") Alpha tokenization support
-
-p
- | Work has started on the following languages. You can help by
- | #[+a("/docs/usage/adding-languages#language-data") improving the existing language data]
- | and extending the tokenization patterns.
-
-+aside("Usage note")
- | Note that the alpha languages don't yet come with a language model. In
- | order to use them, you have to import them directly:
-
- +code.o-no-block.
- from spacy.lang.fi import Finnish
- nlp = Finnish()
-        doc = nlp(u'Ilmatyynyalukseni on täynnä ankeriaita')
-
-+infobox("Dependencies")
- | Some language tokenizers require external dependencies. To use #[strong Chinese],
- | you need to have #[+a("https://github.com/fxsjy/jieba") Jieba] installed.
- | The #[strong Japanese] tokenizer requires
- | #[+a("https://github.com/mocobeta/janome") Janome].
-
-+table([ "Language", "Code", "Source" ])
-    each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
- +row
- +cell #{language}
- +cell #[code=code]
- +cell
- +src(gh("spaCy", "spacy/lang/" + code)) lang/#{code}
-
-+h(2, "multi-language") Multi-language support
- +tag-new(2)
-
-p
- | As of v2.0, spaCy supports models trained on more than one language. This
- | is especially useful for named entity recognition. The language ID used
- | for multi-language or language-neutral models is #[code xx]. The
- | language class, a generic subclass containing only the base language data,
- | can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx].
-
-p
- | To load your model with the neutral, multi-language class, simply set
- | #[code "language": "xx"] in your
- | #[+a("/docs/usage/saving-loading#models-generating") model package]'s
- | meta.json. You can also import the class directly, or call
- | #[+api("util#get_lang_class") #[code util.get_lang_class()]] for
- | lazy-loading.
-
-+code("Standard import").
- from spacy.lang.xx import MultiLanguage
- nlp = MultiLanguage()
-
-+code("With lazy-loading").
- from spacy.util import get_lang_class
- nlp = get_lang_class('xx')
diff --git a/website/docs/api/tagger.jade b/website/docs/api/tagger.jade
deleted file mode 100644
index c41de6a4e..000000000
--- a/website/docs/api/tagger.jade
+++ /dev/null
@@ -1,93 +0,0 @@
-//- 💫 DOCS > API > TAGGER
-
-include ../../_includes/_mixins
-
-p Annotate part-of-speech tags on #[code Doc] objects.
-
-+under-construction
-
-+h(2, "init") Tagger.__init__
- +tag method
-
-p Create a #[code Tagger].
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code vocab]
- +cell #[code Vocab]
- +cell The vocabulary. Must be shared with documents to be processed.
-
- +row
- +cell #[code model]
- +cell #[thinc.linear.AveragedPerceptron]
- +cell The statistical model.
-
- +footrow
- +cell returns
- +cell #[code Tagger]
- +cell The newly constructed object.
-
-+h(2, "call") Tagger.__call__
- +tag method
-
-p Apply the tagger, setting the POS tags onto the #[code Doc] object.
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code doc]
- +cell #[code Doc]
- +cell The tokens to be tagged.
-
- +footrow
- +cell returns
- +cell #[code None]
- +cell -
-
-+h(2, "pipe") Tagger.pipe
- +tag method
-
-p Tag a stream of documents.
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code stream]
- +cell -
- +cell The sequence of documents to tag.
-
- +row
- +cell #[code batch_size]
- +cell int
- +cell The number of documents to accumulate into a working set.
-
- +row
- +cell #[code n_threads]
- +cell int
- +cell
- | The number of threads with which to work on the buffer in
- | parallel.
-
- +footrow
- +cell yields
- +cell #[code Doc]
- +cell Documents, in order.
-
-+h(2, "update") Tagger.update
- +tag method
-
-p Update the statistical model, with tags supplied for the given document.
-
-+table(["Name", "Type", "Description"])
- +row
- +cell #[code doc]
- +cell #[code Doc]
- +cell The example document for the update.
-
- +row
- +cell #[code gold]
- +cell #[code GoldParse]
- +cell Manager for the gold-standard tags.
-
- +footrow
- +cell returns
- +cell int
- +cell Number of tags predicted correctly.
diff --git a/website/docs/api/tensorizer.jade b/website/docs/api/tensorizer.jade
deleted file mode 100644
index 9abd6793b..000000000
--- a/website/docs/api/tensorizer.jade
+++ /dev/null
@@ -1,7 +0,0 @@
-//- 💫 DOCS > API > TENSORIZER
-
-include ../../_includes/_mixins
-
-p Add a tensor with position-sensitive meaning representations to a #[code Doc].
-
-+under-construction
diff --git a/website/docs/api/textcategorizer.jade b/website/docs/api/textcategorizer.jade
deleted file mode 100644
index 926d957f7..000000000
--- a/website/docs/api/textcategorizer.jade
+++ /dev/null
@@ -1,21 +0,0 @@
-//- 💫 DOCS > API > TEXTCATEGORIZER
-
-include ../../_includes/_mixins
-
-p
- | Add text categorization models to spaCy pipelines. The model supports
- | classification with multiple, non-mutually exclusive labels.
-
-p
- | You can change the model architecture rather easily, but by default, the
- | #[code TextCategorizer] class uses a convolutional neural network to
- | assign position-sensitive vectors to each word in the document. This step
- | is similar to the #[+api("tensorizer") #[code Tensorizer]] component, but the
- | #[code TextCategorizer] uses its own CNN model, to avoid sharing weights
- | with the other pipeline components. The document tensor is then
- | summarized by concatenating max and mean pooling, and a multilayer
- | perceptron is used to predict an output vector of length #[code nr_class],
- | before a logistic activation is applied elementwise. The value of each
- | output neuron is the probability that some class is present.
-
-+under-construction
diff --git a/website/docs/api/vectors.jade b/website/docs/api/vectors.jade
deleted file mode 100644
index ef9aa2b52..000000000
--- a/website/docs/api/vectors.jade
+++ /dev/null
@@ -1,7 +0,0 @@
-//- 💫 DOCS > API > VECTORS
-
-include ../../_includes/_mixins
-
-p A container class for vector data keyed by string.
-
-+under-construction
diff --git a/website/docs/index.jade b/website/docs/index.jade
deleted file mode 100644
index d5a8c6deb..000000000
--- a/website/docs/index.jade
+++ /dev/null
@@ -1,25 +0,0 @@
-//- 💫 DOCS
-
-include ../_includes/_mixins
-
-+aside("Help us improve the docs")
- | Did you spot a mistake or come across explanations that
- | are unclear? You can find a "Suggest edits" button at the
- | bottom of each page that points you to the source.
- | We always appreciate
- | #[+a(gh("spaCy") + "/pulls") pull requests].#[br]#[br]
- | Have you built something cool with spaCy, or did you
- | write a tutorial to help others use spaCy?
- | #[a(href="mailto:#{EMAIL}") Let us know!]
-
-+grid
- each details, title in sections
- +card(false, false)
- a(href=details.url)
- +svg("graphics", details.svg, 300, 150).u-color-theme
-
- a(href=details.url)
- +h(3)=title
-
- p=details.description
- +button(details.url, true, "primary")(target="_self") View
diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json
deleted file mode 100644
index c8373a095..000000000
--- a/website/docs/usage/_data.json
+++ /dev/null
@@ -1,420 +0,0 @@
-{
- "sidebar": {
- "Get started": {
- "Installation": "./",
- "Models": "models",
- "spaCy 101": "spacy-101",
- "Lightning tour": "lightning-tour",
- "What's new in v2.0": "v2"
- },
- "Guides": {
- "POS tagging": "pos-tagging",
- "Using the parse": "dependency-parse",
- "Entity recognition": "entity-recognition",
- "Vectors & similarity": "word-vectors-similarities",
- "Custom tokenization": "customizing-tokenizer",
- "Rule-based matching": "rule-based-matching",
- "Adding languages": "adding-languages",
- "Processing pipelines": "language-processing-pipeline",
- "Text classification": "text-classification",
- "Deep learning": "deep-learning",
- "Production use": "production-use",
- "Training": "training",
- "Training NER": "training-ner",
- "Saving & loading": "saving-loading",
- "Visualizers": "visualizers"
- },
- "Examples": {
- "Tutorials": "tutorials",
- "Showcase": "showcase"
- }
- },
-
- "index": {
- "title": "Install spaCy",
- "next": "models",
- "quickstart": true
- },
-
- "models": {
- "title": "Models",
- "next": "spacy-101",
- "quickstart": true
- },
-
- "spacy-101": {
-        "title": "spaCy 101 – Everything you need to know",
- "next": "lightning-tour",
- "quickstart": true,
- "preview": "101"
- },
-
- "lightning-tour": {
- "title": "Lightning tour",
- "next": "v2"
- },
-
- "visualizers": {
- "title": "Visualizers"
- },
-
- "v2": {
- "title": "What's new in v2.0"
- },
-
- "pos-tagging": {
- "title": "Part-of-speech tagging",
- "next": "dependency-parse"
- },
-
- "dependency-parse": {
- "title": "Using the dependency parse",
- "next": "entity-recognition"
- },
-
- "entity-recognition": {
- "title": "Named Entity Recognition",
- "next": "training-ner"
- },
-
- "word-vectors-similarities": {
- "title": "Using word vectors and semantic similarities",
- "next": "customizing-tokenizer"
- },
-
- "customizing-tokenizer": {
- "title": "Customising the tokenizer",
- "next": "rule-based-matching"
- },
-
- "rule-based-matching": {
- "title": "Rule-based matching",
- "next": "adding-languages"
- },
-
- "adding-languages": {
- "title": "Adding languages",
- "next": "training"
- },
-
- "language-processing-pipeline": {
- "title": "Language processing pipelines",
- "next": "deep-learning"
- },
-
- "deep-learning": {
- "title": "Hooking a deep learning model into spaCy",
- "next": "production use"
- },
-
- "text-classification": {
- "title": "Text classification",
- "next": "training"
- },
-
- "production-use": {
- "title": "Production use",
- "next": "training"
- },
-
- "training": {
- "title": "Training spaCy's statistical models",
- "next": "saving-loading"
- },
-
- "training-ner": {
- "title": "Training the Named Entity Recognizer",
- "next": "saving-loading"
- },
-
- "saving-loading": {
- "title": "Saving, loading and data serialization"
- },
-
- "showcase": {
- "title": "Showcase",
-
- "libraries": {
- "spacy_api": {
- "url": "https://github.com/kootenpv/spacy_api",
- "author": "Pascal van Kooten",
- "description": "Server/client to load models in a separate, dedicated process."
- },
- "spacy-nlp": {
- "url": "https://github.com/kengz/spacy-nlp",
- "author": "Wah Loon Keng",
- "description": "Expose spaCy NLP text parsing to Node.js (and other languages) via Socket.IO."
- },
- "spacy-api-docker": {
- "url": "https://github.com/jgontrum/spacy-api-docker",
- "author": "Johannes Gontrum",
- "description": "spaCy accessed by a REST API, wrapped in a Docker container."
- },
- "spacy-nlp-zeromq": {
- "url": "https://github.com/pasupulaphani/spacy-nlp-docker",
- "author": "Phaninder Pasupula",
- "description": "Docker image exposing spaCy with ZeroMQ bindings."
- },
- "textacy": {
- "url": "https://github.com/chartbeat-labs/textacy",
- "author": " Burton DeWilde (Chartbeat)",
- "description": "Higher-level NLP built on spaCy."
- },
- "visual-qa": {
- "url": "https://github.com/avisingh599/visual-qa",
- "author": "Avi Singh",
- "description": "Keras-based LSTM/CNN models for Visual Question Answering."
- },
- "rasa_nlu": {
- "url": "https://github.com/golastmile/rasa_nlu",
- "author": "LASTMILE",
- "description": "High level APIs for building your own language parser using existing NLP and ML libraries."
- },
- "spacyr": {
- "url": "https://github.com/kbenoit/spacyr",
- "author": "Kenneth Benoit",
- "description": "An R wrapper for spaCy."
- }
- },
- "visualizations": {
- "displaCy": {
- "url": "https://demos.explosion.ai/displacy",
- "author": "Ines Montani",
- "description": "An open-source NLP visualiser for the modern web.",
- "image": "displacy.jpg"
- },
- "displaCy ENT": {
- "url": "https://demos.explosion.ai/displacy-ent",
- "author": "Ines Montani",
- "description": "An open-source named entity visualiser for the modern web.",
- "image": "displacy-ent.jpg"
- }
- },
- "products": {
- "sense2vec": {
- "url": "https://demos.explosion.ai/sense2vec",
- "author": "Matthew Honnibal and Ines Montani",
- "description": "Semantic analysis of the Reddit hivemind.",
- "image": "sense2vec.jpg"
- },
- "TruthBot": {
- "url": "http://summerscope.github.io/govhack/2016/truthbot/",
- "author": "Team Truthbot",
- "description": "The world's first artificially intelligent fact checking robot.",
- "image": "truthbot.jpg"
- },
- "Laice": {
- "url": "https://github.com/kendricktan/laice",
- "author": "Kendrick Tan",
- "description": "Train your own Natural Language Processor from a browser.",
- "image": "laice.jpg"
- },
- "FoxType": {
- "url": "https://foxtype.com",
- "description": "Smart tools for writers.",
- "image": "foxtype.jpg"
- },
- "Kip": {
- "url": "https://kipthis.com",
- "description": "An AI chat assistant for group shopping.",
- "image": "kip.jpg"
- },
- "Indico": {
- "url": "https://indico.io",
- "description": "Text and image analysis powered by Machine Learning.",
- "image": "indico.jpg"
- },
- "TextAnalysisOnline": {
- "url": "http://textanalysisonline.com",
- "description": "Online tool for spaCy's tokenizer, parser, NER and more.",
- "image": "textanalysis.jpg"
- }
- },
- "books": {
- "Introduction to Machine Learning with Python: A Guide for Data Scientists": {
- "url": "https://books.google.de/books?id=vbQlDQAAQBAJ",
-                "author": "Andreas C. Müller and Sarah Guido (O'Reilly, 2016)",
- "description": "Andreas is a lead developer of Scikit-Learn, and Sarah is a lead data scientist at Mashable. We're proud to get a mention."
- },
-
- "Text Analytics with Python: A Practical Real-World Approach to Gaining Actionable Insights from your Data": {
- "url": "https://www.amazon.com/Text-Analytics-Python-Real-World-Actionable/dp/148422387X",
- "author": "Dipanjan Sarkar (Apress / Springer, 2016)",
- "description": "Derive useful insights from your data using Python. Learn the techniques related to natural language processing and text analytics, and gain the skills to know which technique is best suited to solve a particular problem."
- }
- },
- "research": {
- "Distributional semantics for understanding spoken meal descriptions": {
- "url": "https://www.semanticscholar.org/paper/Distributional-semantics-for-understanding-spoken-Korpusik-Huang/5f55c5535e80d3e5ed7f1f0b89531e32725faff5",
- "author": "Mandy Korpusik et al. (2016)"
- },
-
- "Refactoring the Genia Event Extraction Shared Task Toward a General Framework for IE-Driven KB Development": {
- "url": "https://www.semanticscholar.org/paper/Refactoring-the-Genia-Event-Extraction-Shared-Task-Kim-Wang/06d94b64a7bd2d3433f57caddad5084435d6a91f",
- "author": "Jin-Dong Kim et al. (2016)"
- },
- "Mixing Dirichlet Topic Models and Word Embeddings to Make lda2vec": {
- "url": "https://www.semanticscholar.org/paper/Mixing-Dirichlet-Topic-Models-and-Word-Embeddings-Moody/bf8116e06f7b498c6abfbf97aeb67d0838c08609",
- "author": "Christopher E. Moody (2016)"
- },
- "Predicting Pre-click Quality for Native Advertisements": {
- "url": "https://www.semanticscholar.org/paper/Predicting-Pre-click-Quality-for-Native-Zhou-Redi/564985430ff2fbc3a9daa9c2af8997b7f5046da8",
- "author": "Ke Zhou et al. (2016)"
- },
- "Threat detection in online discussions": {
- "url": "https://www.semanticscholar.org/paper/Threat-detection-in-online-discussions-Wester-%C3%98vrelid/f4150e2fb4d8646ebc2ea84f1a86afa1b593239b",
- "author": "Aksel Wester et al. (2016)"
- },
- "The language of mental health problems in social media": {
- "url": "https://www.semanticscholar.org/paper/The-language-of-mental-health-problems-in-social-Gkotsis-Oellrich/537db6c2984514d92a754a591841e2e20845985a",
- "author": "George Gkotsis et al. (2016)"
- }
- }
- },
-
- "tutorials": {
- "title": "Tutorials",
- "next": "showcase",
-
- "first_steps": {
- "Setting up an NLP environment with Python": {
- "url": "https://shirishkadam.com/2016/10/06/setting-up-natural-language-processing-environment-with-python/",
- "author": "Shirish Kadam"
- },
- "NLP with spaCy in 10 lines of code": {
- "url": "https://github.com/cytora/pycon-nlp-in-10-lines",
- "author": "Andraz Hribernik et al. (Cytora)",
- "tags": ["jupyter"]
- },
- "Intro to NLP with spaCy": {
- "url": "https://nicschrading.com/project/Intro-to-NLP-with-spaCy/",
- "author": "J Nicolas Schrading"
- },
- "NLP with spaCy and IPython Notebook": {
- "url": "http://blog.sharepointexperience.com/2016/01/nlp-and-sharepoint-part-1/",
- "author": "Dustin Miller (SharePoint)",
- "tags": ["jupyter"]
- },
- "Getting Started with spaCy": {
- "url": "http://textminingonline.com/getting-started-with-spacy",
- "author": "TextMiner"
- },
-            "spaCy – A fast natural language processing library": {
- "url": "https://bjoernkw.com/2015/11/22/spacy-a-fast-natural-language-processing-library/",
-                "author": "Björn Wilmsmann"
- },
- "NLP (almost) From Scratch - POS Network with spaCy": {
- "url": "http://sujitpal.blogspot.de/2016/07/nlp-almost-from-scratch-implementing.html",
- "author": "Sujit Pal",
- "tags": ["gensim", "keras"]
- },
- "NLP tasks with various libraries": {
- "url": "http://clarkgrubb.com/nlp",
- "author": "Clark Grubb"
- },
- "A very (very) short primer on spacy.io": {
- "url": "http://blog.milonimrod.com/2015/10/a-very-very-short-primer-on-spacyio.html",
- "author": "Nimrod Milo "
- }
- },
-
- "deep_dives": {
-            "Modern NLP in Python – What you can learn about food by analyzing a million Yelp reviews": {
- "url": "http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb",
- "author": "Patrick Harrison (S&P Global)",
- "tags": ["jupyter", "gensim"]
- },
- "Deep Learning with custom pipelines and Keras": {
- "url": "https://explosion.ai/blog/spacy-deep-learning-keras",
- "author": "Matthew Honnibal",
- "tags": ["keras", "sentiment"]
- },
- "A decomposable attention model for Natural Language Inference": {
- "url": "https://github.com/explosion/spaCy/tree/master/examples/keras_parikh_entailment",
- "author": "Matthew Honnibal",
- "tags": ["keras", "similarity"]
- },
-
- "Using the German model": {
- "url": "https://explosion.ai/blog/german-model",
- "author": "Wolfgang Seeker",
- "tags": ["multi-lingual"]
- },
- "Sense2vec with spaCy and Gensim": {
- "url": "https://explosion.ai/blog/sense2vec-with-spacy",
- "author": "Matthew Honnibal",
- "tags": ["big data", "gensim"]
- },
- "Building your bot's brain with Node.js and spaCy": {
- "url": "https://explosion.ai/blog/chatbot-node-js-spacy",
- "author": "Wah Loon Keng",
- "tags": ["bots", "node.js"]
- },
- "An intent classifier with spaCy": {
- "url": "http://blog.themusio.com/2016/07/18/musios-intent-classifier-2/",
- "author": "Musio",
- "tags": ["bots", "keras"]
- },
- "Visual Question Answering with spaCy": {
- "url": "http://iamaaditya.github.io/2016/04/visual_question_answering_demo_notebook",
- "author": "Aaditya Prakash",
- "tags": ["vqa", "keras"]
- },
- "Extracting time suggestions from emails with spaCy": {
- "url": "https://medium.com/redsift-outbox/what-time-cc9ce0c2aed2",
- "author": "Chris Savvopoulos",
- "tags": ["ner"]
- },
-
- "Advanced text analysis with spaCy and Scikit-Learn": {
- "url": "https://github.com/JonathanReeve/advanced-text-analysis-workshop-2017/blob/master/advanced-text-analysis.ipynb",
- "author": "Jonathan Reeve",
- "tags": ["jupyter", "scikit-learn"]
- }
- },
-
- "code": {
- "Training a new entity type": {
- "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_new_entity_type.py",
- "author": "Matthew Honnibal",
- "tags": ["ner", "training"]
- },
-
- "Training an NER system from scratch": {
- "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_ner_standalone.py",
- "author": "Matthew Honnibal",
- "tags": ["ner", "training"]
- },
-
- "Information extraction": {
- "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py",
- "author": "Matthew Honnibal",
- "tags": ["snippet"]
- },
- "Neural bag of words": {
- "url": "https://github.com/explosion/spaCy/blob/master/examples/nn_text_class.py",
- "author": "Matthew Honnibal",
- "tags": ["sentiment"]
- },
- "Part-of-speech tagging": {
- "url": "https://github.com/explosion/spaCy/blob/master/examples/pos_tag.py",
- "author": "Matthew Honnibal",
- "tags": ["pos"]
- },
- "Parallel parse": {
- "url": "https://github.com/explosion/spaCy/blob/master/examples/parallel_parse.py",
- "author": "Matthew Honnibal",
- "tags": ["big data"]
- },
- "Inventory count": {
- "url": "https://github.com/explosion/spaCy/tree/master/examples/inventory_count",
- "author": "Oleg Zd"
- },
- "Multi-word matches": {
- "url": "https://github.com/explosion/spaCy/blob/master/examples/multi_word_matches.py",
- "author": "Matthew Honnibal",
- "tags": ["matcher", "out of date"]
- }
- }
- }
-}
diff --git a/website/docs/usage/_models-list.jade b/website/docs/usage/_models-list.jade
deleted file mode 100644
index 195df9f56..000000000
--- a/website/docs/usage/_models-list.jade
+++ /dev/null
@@ -1,24 +0,0 @@
-//- 💫 DOCS > USAGE > MODELS LIST
-
-include ../../_includes/_mixins
-
-p
- | Model differences are mostly statistical. In general, we do expect larger
- | models to be "better" and more accurate overall. Ultimately, it depends on
- | your use case and requirements, and we recommend starting with the default
- | models (marked with a star below).
-
-+aside
- | Models are now available as #[code .tar.gz] archives #[+a(gh("spacy-models")) from GitHub],
- | attached to individual releases. They can be downloaded and loaded manually,
- | or using spaCy's #[code download] and #[code link] commands. All models
- | follow the naming convention of #[code [language]_[type]_[genre]_[size]].
- | #[br]#[br]
-
- +button(gh("spacy-models"), true, "primary").u-text-tag
- | View model releases
-
-+table(["Name", "Language", "Voc", "Dep", "Ent", "Vec", "Size", "License"])
- for models, lang in MODELS
- for model, i in models
- +model-row(model.id, model.lang, model.feats, model.size, model.license, model.def || models.length == 1, i == 0)
diff --git a/website/docs/usage/deep-learning.jade b/website/docs/usage/deep-learning.jade
deleted file mode 100644
index 78448e43e..000000000
--- a/website/docs/usage/deep-learning.jade
+++ /dev/null
@@ -1,92 +0,0 @@
-//- π« DOCS > USAGE > DEEP LEARNING
-
-include ../../_includes/_mixins
-
-p
- | In this example, we'll be using #[+a("https://keras.io/") Keras], as
- | it's the most popular deep learning library for Python. Using Keras,
- | we will write a custom sentiment analysis model that predicts whether a
- | document is positive or negative. Then, we will use it to find which entities
- | are commonly associated with positive or negative documents. Here's a
- | quick example of how that can look at runtime.
-
-+aside("What's Keras?")
- | #[+a("https://keras.io/") Keras] gives you a high-level, declarative
- | interface to define neural networks. Models are trained using Google's
- | #[+a("https://www.tensorflow.org") TensorFlow] by default.
- | #[+a("http://deeplearning.net/software/theano/") Theano] is also
- | supported.
-
-+under-construction
-
-p
- | For most applications, it's recommended to use pre-trained word embeddings
- | without "fine-tuning". This means that you'll use the same embeddings
- | across different models, and avoid learning adjustments to them on your
- | training data. The embeddings table is large, and the values provided by
- | the pre-trained vectors are already pretty good. Fine-tuning the
- | embeddings table is therefore a waste of your "parameter budget". It's
- | usually better to make your network larger some other way, e.g. by
- | adding another LSTM layer, using an attention mechanism, or using character
- | features, etc.
-
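-p
- | As a rough sketch of what that could look like in Keras (assuming
- | #[code embeddings] is a pre-trained vectors array you've exported yourself,
- | e.g. from spaCy's vocab), the embedding layer is simply kept fixed with
- | #[code trainable=False]:
-
-+code("Frozen pre-trained embeddings (sketch)").
- from keras.models import Sequential
- from keras.layers import Embedding, LSTM, Dense
-
- model = Sequential()
- # keep the pre-trained embeddings fixed instead of fine-tuning them
- model.add(Embedding(embeddings.shape[0], embeddings.shape[1],
-           weights=[embeddings], trainable=False))
- model.add(LSTM(64))
- model.add(Dense(1, activation='sigmoid'))
- model.compile(optimizer='adam', loss='binary_crossentropy')
-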
-+h(2, "attribute-hooks") Attribute hooks
-
-+under-construction
-
-p
- | Earlier, we saw how to store data in the new generic #[code user_data]
- | dict. This generalises well, but it's not terribly satisfying. Ideally,
- | we want to let the custom data drive more "native" behaviours. For
- | instance, consider the #[code .similarity()] methods provided by spaCy's
- | #[+api("doc") #[code Doc]], #[+api("token") #[code Token]] and
- | #[+api("span") #[code Span]] objects:
-
-+code("Polymorphic similarity example").
- span.similarity(doc)
- token.similarity(span)
- doc1.similarity(doc2)
-
-p
- | By default, this just averages the vectors for each document, and
- | computes their cosine. Obviously, spaCy should make it easy for you to
- | install your own similarity model. This introduces a tricky design
- | challenge. The current solution is to add three more dicts to the
- | #[code Doc] object:
-
-+aside("Implementation note")
- | The hooks live on the #[code Doc] object because the #[code Span] and
- | #[code Token] objects are created lazily, and don't own any data. They
- | just proxy to their parent #[code Doc]. This turns out to be convenient
- | here β we only have to worry about installing hooks in one place.
-
-+table(["Name", "Description"])
- +row
- +cell #[code user_hooks]
- +cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents]
-
- +row
- +cell #[code user_token_hooks]
- +cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts]
-
- +row
- +cell #[code user_span_hooks]
- +cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root]
-
-p
- | To sum up, here's an example of hooking in custom #[code .similarity()]
- | methods:
-
-+code("Add custom similarity hooks").
- class SimilarityModel(object):
-     def __init__(self, model):
-         self._model = model
-
-     def __call__(self, doc):
-         doc.user_hooks['similarity'] = self.similarity
-         doc.user_span_hooks['similarity'] = self.similarity
-         doc.user_token_hooks['similarity'] = self.similarity
-
-     def similarity(self, obj1, obj2):
-         y = self._model([obj1.vector, obj2.vector])
-         return float(y[0])
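-
-p
- | As a rough usage sketch (#[code my_model] below is a hypothetical trained
- | model, not something defined in this example), the hooks are installed by
- | calling the component on a document:
-
-+code("Usage sketch").
- import spacy
-
- nlp = spacy.load('en')
- similarity_hook = SimilarityModel(my_model)  # my_model: hypothetical trained model
-
- doc1 = nlp(u'I like coffee')
- doc2 = nlp(u'I like tea')
- similarity_hook(doc1)
- similarity_hook(doc2)
- print(doc1.similarity(doc2))  # now dispatches to SimilarityModel.similarity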
diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade
deleted file mode 100644
index a0aa1dca8..000000000
--- a/website/docs/usage/index.jade
+++ /dev/null
@@ -1,353 +0,0 @@
-//- π« DOCS > USAGE
-
-include ../../_includes/_mixins
-
-p
- | spaCy is compatible with #[strong 64-bit CPython 2.6+/3.3+] and
- | runs on #[strong Unix/Linux], #[strong macOS/OS X] and
- | #[strong Windows]. The latest spaCy releases are
- | available over #[+a("https://pypi.python.org/pypi/spacy") pip] (source
- | packages only) and #[+a("https://anaconda.org/conda-forge/spacy") conda].
- | Installation requires a working build environment. See notes on
- | #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X]
- | and #[a(href="#source-windows") Windows] for details.
-
-+quickstart(QUICKSTART, "Quickstart")
- +qs({config: 'venv', python: 2}) python -m pip install -U virtualenv
- +qs({config: 'venv', python: 3}) python -m pip install -U pip
- +qs({config: 'venv', python: 2}) virtualenv .env
- +qs({config: 'venv', python: 3}) python -m venv .env
- +qs({config: 'venv', os: 'mac'}) source .env/bin/activate
- +qs({config: 'venv', os: 'linux'}) source .env/bin/activate
- +qs({config: 'venv', os: 'windows'}) .env\Scripts\activate
-
- +qs({config: 'gpu', os: 'mac'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
- +qs({config: 'gpu', os: 'linux'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
-
- +qs({package: 'pip'}) pip install -U spacy
- +qs({package: 'conda'}) conda install -c conda-forge spacy
-
- +qs({package: 'source'}) git clone https://github.com/explosion/spaCy
- +qs({package: 'source'}) cd spaCy
- +qs({package: 'source'}) pip install -r requirements.txt
- +qs({package: 'source'}) pip install -e .
-
- +qs({model: 'en'}) spacy download en
- +qs({model: 'de'}) spacy download de
- +qs({model: 'fr'}) spacy download fr
- +qs({model: 'es'}) spacy download es
-
-+h(2, "installation") Installation instructions
-
-+h(3, "pip") pip
- +badge("pipy")
-
-p Using pip, spaCy releases are currently only available as source packages.
-
-+code(false, "bash").
- pip install -U spacy
-
-+aside("Download models")
- | After installation you need to download a language model. For more info
- | and available models, see the #[+a("/docs/usage/models") docs on models].
-
- +code.o-no-block.
- spacy download en
-
- >>> import spacy
- >>> nlp = spacy.load('en')
-
-p
- | When using pip it is generally recommended to install packages in a
- | #[code virtualenv] to avoid modifying system state:
-
-+code(false, "bash").
- virtualenv .env
- source .env/bin/activate
- pip install spacy
-
-+h(3, "conda") conda
- +badge("conda")
-
-p
- | Thanks to our great community, we've finally re-added conda support. You
- | can now install spaCy via #[code conda-forge]:
-
-+code(false, "bash").
- conda config --add channels conda-forge
- conda install spacy
-
-p
- | For the feedstock including the build recipe and configuration, check out
- | #[+a("https://github.com/conda-forge/spacy-feedstock") this repository].
- | Improvements and pull requests to the recipe and setup are always appreciated.
-
-+h(2, "gpu") Run spaCy with GPU
-
-p
- | As of v2.0, spaCy comes with neural network models that are implemented
- | in our machine learning library, #[+a(gh("thinc")) Thinc]. For GPU
- | support, we've been grateful to use the work of
- | #[+a("http://chainer.org") Chainer]'s CuPy module, which provides
- | a NumPy-compatible interface for GPU arrays.
-
-p
- | First, install CUDA, following the normal installation procedure. Next, set
- | your environment variables so that the installation will be able to find
- | CUDA. Finally, install spaCy.
-
-+code(false, "bash").
- export CUDA_HOME=/usr/local/cuda-8.0 # Or wherever your CUDA is
- export PATH=$PATH:$CUDA_HOME/bin
-
- pip install spacy
- python -c "import thinc.neural.gpu_ops" # Check the GPU ops were built
-
-+h(2, "source") Compile from source
-
-p
- | The other way to install spaCy is to clone its
- | #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
- | the common way if you want to make changes to the code base. You'll need to
- | make sure that you have a development environment consisting of a Python
- | distribution including header files, a compiler,
- | #[+a("https://pip.pypa.io/en/latest/installing/") pip],
- | #[+a("https://virtualenv.pypa.io/") virtualenv] and
- | #[+a("https://git-scm.com") git] installed. The compiler part is the
- | trickiest. How to do that depends on your system. See notes on
- | #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") OS X] and
- | #[a(href="#source-windows") Windows] for details.
-
-+code(false, "bash").
- # make sure you are using recent pip/virtualenv versions
- python -m pip install -U pip virtualenv
- git clone #{gh("spaCy")}
- cd spaCy
-
- virtualenv .env
- source .env/bin/activate
- pip install -r requirements.txt
- pip install -e .
-
-p
- | Compared to a regular install via pip, #[+a(gh("spaCy", "requirements.txt")) requirements.txt]
- | additionally installs developer dependencies such as Cython.
-
-p
- | Instead of the above verbose commands, you can also use the following
- | #[+a("http://www.fabfile.org/") Fabric] commands:
-
-+table(["Command", "Description"])
- +row
- +cell #[code fab env]
- +cell Create #[code virtualenv] and delete previous one, if it exists.
-
- +row
- +cell #[code fab make]
- +cell Compile the source.
-
- +row
- +cell #[code fab clean]
- +cell Remove compiled objects, including the generated C++.
-
- +row
- +cell #[code fab test]
- +cell Run basic tests, aborting after first failure.
-
-p
- | All commands assume that your #[code virtualenv] is located in a
- | directory #[code .env]. If you're using a different directory, you can
- | change it via the environment variable #[code VENV_DIR], for example:
-
-+code(false, "bash").
- VENV_DIR=".custom-env" fab clean make
-
-+h(3, "source-ubuntu") Ubuntu
-
-p Install system-level dependencies via #[code apt-get]:
-
-+code(false, "bash").
- sudo apt-get install build-essential python-dev git
-
-+h(3, "source-osx") macOS / OS X
-
-p
- | Install a recent version of #[+a("https://developer.apple.com/xcode/") Xcode],
- | including the so-called "Command Line Tools". macOS and OS X ship with
- | Python and git preinstalled. To compile spaCy with multi-threading support
- | on macOS / OS X, #[+a("https://github.com/explosion/spaCy/issues/267") see here].
-
-+h(3, "source-windows") Windows
-
-p
- | Install a version of
- | #[+a("https://www.visualstudio.com/vs/visual-studio-express/") Visual Studio Express]
- | that matches the version that was used to compile your Python
- | interpreter. For official distributions these are:
-
-+table([ "Distribution", "Version"])
- +row
- +cell Python 2.7
- +cell Visual Studio 2008
-
- +row
- +cell Python 3.4
- +cell Visual Studio 2010
-
- +row
- +cell Python 3.5+
- +cell Visual Studio 2015
-
-+h(2, "troubleshooting") Troubleshooting guide
-
-p
- | This section collects some of the most common errors you may come
- | across when installing, loading and using spaCy, as well as their solutions.
-
-+aside("Help us improve this guide")
- | Did you come across a problem like the ones listed here and want to
- | share the solution? You can find the "Suggest edits" button at the
- | bottom of this page that points you to the source. We always
- | appreciate #[+a(gh("spaCy") + "/pulls") pull requests]!
-
-+h(3, "compatible-model") No compatible model found
-
-+code(false, "text").
- No compatible model found for [lang] (spaCy v#{SPACY_VERSION}).
-
-p
- | This usually means that the model you're trying to download does not
- | exist, or isn't available for your version of spaCy. Check the
- | #[+a(gh("spacy-models", "compatibility.json")) compatibility table]
- | to see which models are available for your spaCy version. If you're using
- | an old version, consider upgrading to the latest release. Note that while
- | spaCy supports tokenization for
- | #[+a("/docs/api/language-models/#alpha-support") a variety of languages],
- | not all of them come with statistical models. To only use the tokenizer,
- | import the language's #[code Language] class instead, for example
- | #[code from spacy.lang.fr import French].
-
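-p
- | A minimal sketch of tokenizer-only usage without a statistical model:
-
-+code.
- from spacy.lang.fr import French
-
- nlp = French()  # tokenizer and language data only, no statistical model
- doc = nlp(u"C'est une phrase.")
- print([token.text for token in doc])
-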
-+h(3, "symlink-privilege") Symbolic link privilege not held
-
-+code(false, "text").
- OSError: symbolic link privilege not held
-
-p
- | To create #[+a("/docs/usage/models/#usage") shortcut links] that let you
- | load models by name, spaCy creates a symbolic link in the
- | #[code spacy/data] directory. This means your user needs permission to do
- | this. The above error mostly occurs when doing a system-wide installation,
- | which will create the symlinks in a system directory. Run the
- | #[code download] or #[code link] command as administrator, or use a
- | #[code virtualenv] to install spaCy in a user directory, instead
- | of doing a system-wide installation.
-
-+h(3, "no-cache-dir") No such option: --no-cache-dir
-
-+code(false, "text").
- no such option: --no-cache-dir
-
-p
- | The #[code download] command uses pip to install the models and sets the
- | #[code --no-cache-dir] flag to prevent it from requiring too much memory.
- | #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting]
- | requires pip v6.0 or newer. Run #[code pip install -U pip] to upgrade to
- | the latest version of pip. To see which version you have installed,
- | run #[code pip --version].
-
-+h(3, "import-error") Import error
-
-+code(false, "text").
- Import Error: No module named spacy
-
-p
- | This error means that the spaCy module can't be located on your system, or in
- | your environment. Make sure you have spaCy installed. If you're using a
- | #[code virtualenv], make sure it's activated and check that spaCy is
- | installed in that environment β otherwise, you're trying to load a system
- | installation. You can also run #[code which python] to find out where
- | your Python executable is located.
-
-+h(3, "import-error-models") Import error: models
-
-+code(false, "text").
- ImportError: No module named 'en_core_web_sm'
-
-p
- | As of spaCy v1.7, all models can be installed as Python packages. This means
- | that they'll become importable modules of your application. When creating
- | #[+a("/docs/usage/models/#usage") shortcut links], spaCy will also try
- | to import the model to load its meta data. If this fails, it's usually a
- | sign that the package is not installed in the current environment.
- | Run #[code pip list] or #[code pip freeze] to check which model packages
- | you have installed, and install the
- | #[+a("/docs/usage/models#available") correct models] if necessary. If you're
- | importing a model manually at the top of a file, make sure to use the name
- | of the package, not the shortcut link you've created.
-
-+h(3, "vocab-strings") File not found: vocab/strings.json
-
-+code(false, "text").
- FileNotFoundError: No such file or directory: [...]/vocab/strings.json
-
-p
- | This error may occur when using #[code spacy.load()] to load
- | a language model β either because you haven't set up a
- | #[+a("/docs/usage/models/#usage") shortcut link] for it, or because it
- | doesn't actually exist. Set up a
- | #[+a("/docs/usage/models/#usage") shortcut link] for the model
- | you want to load. This can either be an installed model package, or a
- | local directory containing the model data. If you want to use one of the
- | #[+a("/docs/api/language-models/#alpha-support") alpha tokenizers] for
- | languages that don't yet have a statistical model, you should import its
- | #[code Language] class instead, for example
- | #[code from spacy.lang.bn import Bengali].
-
-+h(3, "command-not-found") Command not found
-
-+code(false, "text").
- command not found: spacy
-
-p
- | This error may occur when running the #[code spacy] command from the
- | command line. spaCy does not currently add an entry to your #[code PATH]
- | environment variable, as this can lead to unexpected results, especially
- | when using #[code virtualenv]. Instead, spaCy adds an auto-alias that
- | maps #[code spacy] to #[code python -m spacy]. If this is not working as
- | expected, run the command with #[code python -m] yourself β
- | for example #[code python -m spacy download en]. For more info on this,
- | see #[+api("cli#download") download].
-
-+h(3, "module-load") 'module' object has no attribute 'load'
-
-+code(false, "text").
- AttributeError: 'module' object has no attribute 'load'
-
-p
- | While this could technically have many causes, including spaCy being
- | broken, the most likely one is that your script's file or directory name
- | is "shadowing" the module β e.g. your file is called #[code spacy.py],
- | or a directory you're importing from is called #[code spacy]. So, when
- | using spaCy, never call anything else #[code spacy].
-
-+h(2, "tests") Run tests
-
-p
- | spaCy comes with an #[+a(gh("spacy", "spacy/tests")) extensive test suite].
- | First, find out where spaCy is installed:
-
-+code(false, "bash").
- python -c "import os; import spacy; print(os.path.dirname(spacy.__file__))"
-
-p
- | Then run #[code pytest] on that directory. The flags #[code --slow] and
- | #[code --models] are optional and enable additional tests.
-
-+code(false, "bash").
- # make sure you are using recent pytest version
- python -m pip install -U pytest
-
- python -m pytest <spacy-directory> # basic tests
- python -m pytest <spacy-directory> --slow # basic and slow tests
- python -m pytest <spacy-directory> --models --all # basic and all model tests
- python -m pytest <spacy-directory> --models --en # basic and English model tests
diff --git a/website/docs/usage/production-use.jade b/website/docs/usage/production-use.jade
deleted file mode 100644
index d4a1ffbc2..000000000
--- a/website/docs/usage/production-use.jade
+++ /dev/null
@@ -1,147 +0,0 @@
-//- π« DOCS > USAGE > PROCESSING TEXT
-
-include ../../_includes/_mixins
-
-+under-construction
-
-+h(2, "multithreading") Multi-threading with #[code .pipe()]
-
-p
- | If you have a sequence of documents to process, you should use the
- | #[+api("language#pipe") #[code Language.pipe()]] method. The method takes
- | an iterator of texts, and accumulates an internal buffer,
- | which it works on in parallel. It then yields the documents in order,
- | one-by-one. After a long and bitter struggle, the global interpreter
- | lock was freed around spaCy's main parsing loop in v0.100.3. This means
- | that #[code .pipe()] will be significantly faster in most
- | practical situations, because it allows shared memory parallelism.
-
-+code.
- for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
-     pass
-
-p
- | To make full use of the #[code .pipe()] function, you might want to
- | brush up on #[strong Python generators]. Here are a few quick hints:
-
-+list
- +item
- | Generator comprehensions can be written as
- | #[code (item for item in sequence)].
-
- +item
- | The
- | #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library]
- | and the
- | #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package]
- | provide a lot of handy #[strong generator tools].
-
- +item
- | Often you'll have an input stream that pairs text with some
- | important meta data, e.g. a JSON document. To
- | #[strong pair up the meta data] with the processed #[code Doc]
- | object, you should use the #[code itertools.tee] function to split
- | the generator in two, and then #[code izip] (or #[code zip] in Python 3)
- | the extra stream to the document stream, as sketched below.
-
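-p
- | Here's a minimal sketch of that pattern. The #[code records] data is a
- | made-up placeholder, and an #[code nlp] object is assumed to be loaded:
-
-+code.
- from itertools import tee
-
- records = [(u'This is a text', {'id': 1}), (u'Another text', {'id': 2})]
- texts_stream, meta_stream = tee(records, 2)
- texts = (text for text, meta in texts_stream)
- metadata = (meta for text, meta in meta_stream)
-
- for doc, meta in zip(nlp.pipe(texts), metadata):
-     print(doc.text, meta['id'])
-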
-+h(2, "own-annotations") Bringing your own annotations
-
-p
- | spaCy generally assumes by default that your data is raw text. However,
- | sometimes your data is partially annotated, e.g. with pre-existing
- | tokenization, part-of-speech tags, etc. The most common situation is
- | that you have pre-defined tokenization. If you have a list of strings,
- | you can create a #[code Doc] object directly. Optionally, you can also
- | specify a list of boolean values, indicating whether each word has a
- | subsequent space.
-
-+code.
- doc = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False])
-
-p
- | If provided, the spaces list must be the same length as the words list.
- | The spaces list affects the #[code doc.text], #[code span.text],
- | #[code token.idx], #[code span.start_char] and #[code span.end_char]
- | attributes. If you don't provide a #[code spaces] sequence, spaCy will
- | assume that all words are whitespace delimited.
-
-+code.
- good_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False])
- bad_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'])
- assert bad_spaces.text == u'Hello , world !'
- assert good_spaces.text == u'Hello, world!'
-
-p
- | Once you have a #[+api("doc") #[code Doc]] object, you can write to its
- | attributes to set the part-of-speech tags, syntactic dependencies, named
- | entities and other attributes. For details, see the respective usage
- | pages.
-
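-p
- | As a rough sketch, assuming an #[code nlp] object is available (the tag and
- | entity label below are chosen purely for illustration):
-
-+code.
- from spacy.tokens import Doc, Span
-
- doc = Doc(nlp.vocab, words=[u'London', u'is', u'big'])
- doc[0].tag_ = u'NNP'  # write a part-of-speech tag directly
- doc.ents = [Span(doc, 0, 1, label=nlp.vocab.strings.add(u'GPE'))]
- assert [(ent.text, ent.label_) for ent in doc.ents] == [(u'London', u'GPE')]
-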
-+h(2, "models") Working with models
-
-p
- | If your application depends on one or more #[+a("/docs/usage/models") models],
- | you'll usually want to integrate them into your continuous integration
- | workflow and build process. While spaCy provides a range of useful helpers
- | for downloading, linking and loading models, the underlying functionality
- | is entirely based on native Python packages. This allows your application
- | to handle a model like any other package dependency.
-
-+h(3, "models-download") Downloading and requiring model dependencies
-
-p
- | spaCy's built-in #[+api("cli#download") #[code download]] command
- | is mostly intended as a convenient, interactive wrapper. It performs
- | compatibility checks and prints detailed error messages and warnings.
- | However, if you're downloading models as part of an automated build
- | process, this only adds an unnecessary layer of complexity. If you know
- | which models your application needs, you should be specifying them directly.
-
-p
- | Because all models are valid Python packages, you can add them to your
- | application's #[code requirements.txt]. If you're running your own
- | internal PyPI installation, you can simply upload the models there. pip's
- | #[+a("https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format") requirements file format]
- | supports both package names to download via a PyPI server and direct
- | URLs.
-
-+code("requirements.txt", "text").
- spacy>=2.0.0,<3.0.0
- -e #{gh("spacy-models")}/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
-
-p
- | All models are versioned and specify their spaCy dependency. This ensures
- | cross-compatibility and lets you specify exact version requirements for
- | each model. If you've trained your own model, you can use the
- | #[+api("cli#package") #[code package]] command to generate the required
- | meta data and turn it into a loadable package.
-
-+h(3, "models-loading") Loading and testing models
-
-p
- | Downloading models directly via pip won't call spaCy's
- | #[+api("cli#link") #[code link]] command, which creates
- | symlinks for model shortcuts. This means that you'll have to run this
- | command separately, or use the native #[code import] syntax to load the
- | models:
-
-+code.
- import en_core_web_sm
- nlp = en_core_web_sm.load()
-
-p
- | In general, this approach is recommended for larger code bases, as it's
- | more "native", and doesn't depend on symlinks or rely on spaCy's loader
- | to resolve string names to model packages. If a model can't be
- | imported, Python will raise an #[code ImportError] immediately. And if a
- | model is imported but not used, any linter will catch that.
-
-p
- | Similarly, it'll give you more flexibility when writing tests that
- | require loading models. For example, instead of writing your own
- | #[code try] and #[code except] logic around spaCy's loader, you can use
- | #[+a("http://pytest.readthedocs.io/en/latest/") pytest]'s
- | #[code importorskip()] method to only run a test if a specific model or
- | model version is installed. Each model package exposes a #[code __version__]
- | attribute which you can also use to perform your own version compatibility
- | checks before loading a model.
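-
-p
- | For example, a minimal sketch of such a test (the model package name and
- | version below are placeholders):
-
-+code.
- import pytest
-
- # skip this test module unless the model package is installed
- en_core_web_sm = pytest.importorskip('en_core_web_sm', minversion='2.0.0')
-
- def test_model_loads():
-     nlp = en_core_web_sm.load()
-     assert nlp(u'Hello world!').text == u'Hello world!'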
diff --git a/website/docs/usage/showcase.jade b/website/docs/usage/showcase.jade
deleted file mode 100644
index 66b7e6d86..000000000
--- a/website/docs/usage/showcase.jade
+++ /dev/null
@@ -1,44 +0,0 @@
-//- π« DOCS > USAGE > SHOWCASE
-
-include ../../_includes/_mixins
-
-p
- | On this page, we'll be featuring demos, libraries and products from
- | the spaCy community. Have you done something cool with spaCy?
- | #[a(href="mailto:#{EMAIL}") Let us know!]
-
-+h(2, "libraries") Third-party libraries
-
-+list
- each details, title in libraries
- +card-item(title, details)
-
-+h(2, "visualizations") Visualizations
-
-+grid
- each details, name in visualizations
- - details.image = "/assets/img/showcase/" + details.image
- +card(name, details)
-
-+h(2, "products") Built with spaCy
-
-+grid
- each details, name in products
- - details.image = "/assets/img/showcase/" + details.image
- +card(name, details)
-
-+h(2, "books") Books
-
-p We're excited to see books featuring spaCy already start to appear.
-
-+list
- each details, title in books
- +card-item(title, details)
-
-+h(2, "research") Research systems
-
-p Researchers are using spaCy to build ambitious, next-generation text processing technologies. spaCy is particularly popular amongst the biomedical NLP community, who are working on extracting knowledge from the huge volume of literature in their field. For an up-to-date list of the papers citing spaCy, see #[+a("https://www.semanticscholar.org/search?year%5B%5D=2015&year%5B%5D=2020&q=spacy&sort=relevance&ae=false") Semantic Scholar].
-
-+list
- each details, title in research
- +card-item(title, details)
diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade
deleted file mode 100644
index ac3e808b3..000000000
--- a/website/docs/usage/spacy-101.jade
+++ /dev/null
@@ -1,430 +0,0 @@
-//- π« DOCS > USAGE > SPACY 101
-
-include ../../_includes/_mixins
-
-p
- | Whether you're new to spaCy, or just want to brush up on some
- | NLP basics and implementation details β this page should have you covered.
- | Each section will explain one of spaCy's features in simple terms and
- | with examples or illustrations. Some sections will also reappear across
- | the usage guides as a quick introduction.
-
-+aside("Help us improve the docs")
- | Did you spot a mistake or come across explanations that
- | are unclear? We always appreciate improvement
- | #[+a(gh("spaCy") + "/issues") suggestions] or
- | #[+a(gh("spaCy") + "/pulls") pull requests]. You can find a "Suggest
- | edits" link at the bottom of each page that points you to the source.
-
-+h(2, "whats-spacy") What's spaCy?
-
-+grid.o-no-block
- +grid-col("half")
- p
- | spaCy is a #[strong free, open-source library] for advanced
- | #[strong Natural Language Processing] (NLP) in Python.
-
- p
- | If you're working with a lot of text, you'll eventually want to
- | know more about it. For example, what's it about? What do the
- | words mean in context? Who is doing what to whom? What companies
- | and products are mentioned? Which texts are similar to each other?
-
- p
- | spaCy is designed specifically for #[strong production use] and
- | helps you build applications that process and "understand"
- | large volumes of text. It can be used to build
- | #[strong information extraction] or
- | #[strong natural language understanding] systems, or to
- | pre-process text for #[strong deep learning].
-
- +table-of-contents
- +item #[+a("#features") Features]
- +item #[+a("#annotations") Linguistic annotations]
- +item #[+a("#annotations-token") Tokenization]
- +item #[+a("#annotations-pos-deps") POS tags and dependencies]
- +item #[+a("#annotations-ner") Named entities]
- +item #[+a("#vectors-similarity") Word vectors and similarity]
- +item #[+a("#pipelines") Pipelines]
- +item #[+a("#vocab") Vocab, hashes and lexemes]
- +item #[+a("#serialization") Serialization]
- +item #[+a("#training") Training]
- +item #[+a("#language-data") Language data]
- +item #[+a("#architecture") Architecture]
- +item #[+a("#community") Community & FAQ]
-
-+h(3, "what-spacy-isnt") What spaCy isn't
-
-+list
- +item #[strong spaCy is not a platform or "an API"].
- | Unlike a platform, spaCy does not provide software as a service or
- | a web application. It's an open-source library designed to help you
- | build NLP applications, not a consumable service.
- +item #[strong spaCy is not an out-of-the-box chat bot engine].
- | While spaCy can be used to power conversational applications, it's
- | not designed specifically for chat bots, and only provides the
- | underlying text processing capabilities.
- +item #[strong spaCy is not research software].
- | It's built on the latest research, but it's designed to get
- | things done. This leads to fairly different design decisions than
- | #[+a("https://github.com/nltk/nltk") NLTK]
- | or #[+a("https://stanfordnlp.github.io/CoreNLP/") CoreNLP], which were
- | created as platforms for teaching and research. The main difference
- | is that spaCy is integrated and opinionated. spaCy tries to avoid asking
- | the user to choose between multiple algorithms that deliver equivalent
- | functionality. Keeping the menu small lets spaCy deliver generally better
- | performance and developer experience.
- +item #[strong spaCy is not a company].
- | It's an open-source library. Our company publishing spaCy and other
- | software is called #[+a(COMPANY_URL, true) Explosion AI].
-
-+h(2, "features") Features
-
-p
- | In the documentation, you'll come across mentions of spaCy's
- | features and capabilities. Some of them refer to linguistic concepts,
- | while others are related to more general machine learning functionality.
-
-+aside
- | If one of spaCy's functionalities #[strong needs a model], it means that
- | you need to have one of the available
- | #[+a("/docs/usage/models") statistical models] installed. Models are used
- | to #[strong predict] linguistic annotations β for example, if a word is
- | a verb or a noun.
-
-+table(["Name", "Description", "Needs model"])
- +row
- +cell #[strong Tokenization]
- +cell Segmenting text into words, punctuation marks etc.
- +cell #[+procon("con")]
-
- +row
- +cell #[strong Part-of-speech] (POS) #[strong Tagging]
- +cell Assigning word types to tokens, like verb or noun.
- +cell #[+procon("pro")]
-
- +row
- +cell #[strong Dependency Parsing]
- +cell
- | Assigning syntactic dependency labels, describing the relations
- | between individual tokens, like subject or object.
- +cell #[+procon("pro")]
-
- +row
- +cell #[strong Lemmatization]
- +cell
- | Assigning the base forms of words. For example, the lemma of
- | "was" is "be", and the lemma of "rats" is "rat".
- +cell #[+procon("pro")]
-
- +row
- +cell #[strong Sentence Boundary Detection] (SBD)
- +cell Finding and segmenting individual sentences.
- +cell #[+procon("pro")]
-
- +row
- +cell #[strong Named Entity Recognition] (NER)
- +cell
- | Labelling named "real-world" objects, like persons, companies or
- | locations.
- +cell #[+procon("pro")]
-
- +row
- +cell #[strong Similarity]
- +cell
- | Comparing words, text spans and documents and how similar they
- | are to each other.
- +cell #[+procon("pro")]
-
- +row
- +cell #[strong Text classification]
- +cell Assigning categories or labels to a whole document, or parts of a document.
- +cell #[+procon("pro")]
-
- +row
- +cell #[strong Rule-based Matching]
- +cell
- | Finding sequences of tokens based on their texts and linguistic
- | annotations, similar to regular expressions.
- +cell #[+procon("con")]
-
- +row
- +cell #[strong Training]
- +cell Updating and improving a statistical model's predictions.
- +cell #[+procon("neutral")]
-
- +row
- +cell #[strong Serialization]
- +cell Saving objects to files or byte strings.
- +cell #[+procon("neutral")]
-
-+h(2, "annotations") Linguistic annotations
-
-p
- | spaCy provides a variety of linguistic annotations to give you
- | #[strong insights into a text's grammatical structure]. This includes the
- | word types, like the parts of speech, and how the words are related to
- | each other. For example, if you're analysing text, it makes a huge
- | difference whether a noun is the subject of a sentence, or the object β
- | or whether "google" is used as a verb, or refers to the website or
- | company in a specific context.
-
-p
- | Once you've downloaded and installed a #[+a("/docs/usage/models") model],
- | you can load it via #[+api("spacy#load") #[code spacy.load()]]. This will
- | return a #[code Language] object containing all components and data needed
- | to process text. We usually call it #[code nlp]. Calling the #[code nlp]
- | object on a string of text will return a processed #[code Doc]:
-
-+code.
- import spacy
-
- nlp = spacy.load('en')
- doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
-
-p
- | Even though a #[code Doc] is processed β e.g. split into individual words
- | and annotated β it still holds #[strong all information of the original text],
- | like whitespace characters. You can always get the offset of a token into the
- | original string, or reconstruct the original by joining the tokens and their
- | trailing whitespace. This way, you'll never lose any information
- | when processing text with spaCy.
-
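-p
- | For example, a quick sanity check of this, reusing the #[code doc] from the
- | snippet above:
-
-+code.
- assert doc.text == ''.join(token.text_with_ws for token in doc)
- for token in doc:
-     # token.idx is the character offset into the original string
-     assert doc.text[token.idx:].startswith(token.text)
-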
-+h(3, "annotations-token") Tokenization
-
-include _spacy-101/_tokenization
-
-+infobox
- | To learn more about how spaCy's tokenization rules work in detail,
- | how to #[strong customise and replace] the default tokenizer and how to
- | #[strong add language-specific data], see the usage guides on
- | #[+a("/docs/usage/adding-languages") adding languages] and
- | #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
-
-+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
- +tag-model("dependency parse")
-
-include _spacy-101/_pos-deps
-
-+infobox
- | To learn more about #[strong part-of-speech tagging] and rule-based
- | morphology, and how to #[strong navigate and use the parse tree]
- | effectively, see the usage guides on
- | #[+a("/docs/usage/pos-tagging") part-of-speech tagging] and
- | #[+a("/docs/usage/dependency-parse") using the dependency parse].
-
-+h(3, "annotations-ner") Named Entities
- +tag-model("named entities")
-
-include _spacy-101/_named-entities
-
-+infobox
- | To learn more about entity recognition in spaCy, how to
- | #[strong add your own entities] to a document and how to
- | #[strong train and update] the entity predictions of a model, see the
- | usage guides on
- | #[+a("/docs/usage/entity-recognition") named entity recognition] and
- | #[+a("/docs/usage/training-ner") training the named entity recognizer].
-
-+h(2, "vectors-similarity") Word vectors and similarity
- +tag-model("vectors")
-
-include _spacy-101/_similarity
-
-include _spacy-101/_word-vectors
-
-+infobox
- | To learn more about word vectors, how to #[strong customise them] and
- | how to load #[strong your own vectors] into spaCy, see the usage
- | guide on
- | #[+a("/docs/usage/word-vectors-similarities") using word vectors and semantic similarities].
-
-+h(2, "pipelines") Pipelines
-
-include _spacy-101/_pipelines
-
-+infobox
- | To learn more about #[strong how processing pipelines work] in detail,
- | how to enable and disable their components, and how to
- | #[strong create your own], see the usage guide on
- | #[+a("/docs/usage/language-processing-pipeline") language processing pipelines].
-
-+h(2, "vocab") Vocab, hashes and lexemes
-
-include _spacy-101/_vocab
-
-+h(2, "serialization") Serialization
-
-include _spacy-101/_serialization
-
-+infobox
- | To learn more about #[strong serialization] and how to
- | #[strong save and load your own models], see the usage guide on
- | #[+a("/docs/usage/saving-loading") saving, loading and data serialization].
-
-+h(2, "training") Training
-
-include _spacy-101/_training
-
-+infobox
- | To learn more about #[strong training and updating] models, how to create
- | training data and how to improve spaCy's named entity recognition models,
- | see the usage guides on #[+a("/docs/usage/training") training] and
- | #[+a("/docs/usage/training-ner") training the named entity recognizer].
-
-+h(2, "language-data") Language data
-
-include _spacy-101/_language-data
-
-+infobox
- | To learn more about the individual components of the language data and
- | how to #[strong add a new language] to spaCy in preparation for training
- | a language model, see the usage guide on
- | #[+a("/docs/usage/adding-languages") adding languages].
-
-+h(2, "architecture") Architecture
-
-include _spacy-101/_architecture.jade
-
-+h(2, "community") Community & FAQ
-
-p
- | We're very happy to see the spaCy community grow and include a mix of
- | people from all kinds of different backgrounds β computational
- | linguistics, data science, deep learning, research and more. If you'd
- | like to get involved, below are some answers to the most important
- | questions and resources for further reading.
-
-+h(3, "faq-help-code") Help, my code isn't working!
-
-p
- | Bugs suck, and we're doing our best to continuously improve the tests
- | and fix bugs as soon as possible. Before you submit an issue, do a
- | quick search and check if the problem has already been reported. If
- | you're having installation or loading problems, make sure to also check
- | out the #[+a("/docs/usage#troubleshooting") troubleshooting guide]. Help
- | with spaCy is available via the following platforms:
-
-+aside("How do I know if something is a bug?")
- | Of course, it's always hard to know for sure, so don't worry β we're not
- | going to be mad if a bug report turns out to be a typo in your
- | code. As a simple rule, any C-level error without a Python traceback,
- | like a #[strong segmentation fault] or #[strong memory error],
- | is #[strong always] a spaCy bug.#[br]#[br]
-
- | Because models are statistical, their performance will never be
- | #[em perfect]. However, if you come across
- | #[strong patterns that might indicate an underlying issue], please do
- | file a report. Similarly, we also care about behaviours that
- | #[strong contradict our docs].
-
-+table(["Platform", "Purpose"])
- +row
- +cell #[+a("https://stackoverflow.com/questions/tagged/spacy") StackOverflow]
- +cell
- | #[strong Usage questions] and everything related to problems with
- | your specific code. The StackOverflow community is much larger
- | than ours, so if your problem can be solved by others, you'll
- | receive help much quicker.
-
- +row
- +cell #[+a("https://gitter.im/" + SOCIAL.gitter) Gitter chat]
- +cell
- | #[strong General discussion] about spaCy, meeting other community
- | members and exchanging #[strong tips, tricks and best practices].
- | If we're working on experimental models and features, we usually
- | share them on Gitter first.
-
- +row
- +cell #[+a(gh("spaCy") + "/issues") GitHub issue tracker]
- +cell
- | #[strong Bug reports] and #[strong improvement suggestions], i.e.
- | everything that's likely spaCy's fault. This also includes
- | problems with the models beyond statistical imprecisions, like
- | patterns that point to a bug.
-
-+infobox
- | Please understand that we won't be able to provide individual support via
- | email. We also believe that help is much more valuable if it's shared
- | publicly, so that #[strong more people can benefit from it]. If you come
- | across an issue and you think you might be able to help, consider posting
- | a quick update with your solution. No matter how simple, it can easily
- | save someone a lot of time and headache β and the next time you need help,
- | they might repay the favour.
-
-+h(3, "faq-contributing") How can I contribute to spaCy?
-
-p
- | You don't have to be an NLP expert or Python pro to contribute, and we're
- | happy to help you get started. If you're new to spaCy, a good place to
- | start is the
- | #[+a(gh("spaCy") + '/issues?q=is%3Aissue+is%3Aopen+label%3A"help+wanted+%28easy%29"') #[code help wanted (easy)] label]
- | on GitHub, which we use to tag bugs and feature requests that are easy
- | and self-contained. We also appreciate contributions to the docs β whether
- | it's fixing a typo, improving an example or adding additional explanations.
- | You'll find a "Suggest edits" link at the bottom of each page that points
- | you to the source.
-
-p
- | Another way of getting involved is to help us improve the
- | #[+a("/docs/usage/adding-languages#language-data") language data] β
- | especially if you happen to speak one of the languages currently in
- | #[+a("/docs/api/language-models#alpha-support") alpha support]. Even
- | adding simple tokenizer exceptions, stop words or lemmatizer data
- | can make a big difference. It will also make it easier for us to provide
- | a statistical model for the language in the future. Submitting a test
- | that documents a bug or performance issue, or covers functionality that's
- | especially important for your application is also very helpful. This way,
- | you'll also make sure we never accidentally introduce regressions to the
- | parts of the library that you care about the most.
-
-p
- strong
- | For more details on the types of contributions we're looking for, the
- | code conventions and other useful tips, make sure to check out the
- | #[+a(gh("spaCy", "CONTRIBUTING.md")) contributing guidelines].
-
-+infobox("Code of Conduct")
- | spaCy adheres to the
- | #[+a("http://contributor-covenant.org/version/1/4/") Contributor Covenant Code of Conduct].
- | By participating, you are expected to uphold this code.
-
-+h(3, "faq-project-with-spacy")
- | I've built something cool with spaCy β how can I get the word out?
-
-p
- | First, congrats β we'd love to check it out! When you share your
- | project on Twitter, don't forget to tag
- | #[+a("https://twitter.com/" + SOCIAL.twitter) @#{SOCIAL.twitter}] so we
- | don't miss it. If you think your project would be a good fit for the
- | #[+a("/docs/usage/showcase") showcase], #[strong feel free to submit it!]
- | Tutorials are also incredibly valuable to other users and a great way to
- | get exposure. So we strongly encourage #[strong writing up your experiences],
- | or sharing your code and some tips and tricks on your blog. Since our
- | website is open-source, you can add your project or tutorial by making a
- | pull request on GitHub.
-
-+aside("Contributing to spacy.io")
- | All showcase and tutorial links are stored in a
- | #[+a(gh("spaCy", "website/docs/usage/_data.json")) JSON file], so you
- | won't even have to edit any markup. For more info on how to submit
- | your project, see the
- | #[+a(gh("spaCy", "CONTRIBUTING.md#submitting-a-project-to-the-showcase")) contributing guidelines]
- | and our #[+a(gh("spaCy", "website")) website docs].
-
-p
- | If you would like to use the spaCy logo on your site, please get in touch
- | and ask us first. However, if you want to show support and tell others
- | that your project is using spaCy, you can grab one of our
- | #[strong spaCy badges] here:
-
-- SPACY_BADGES = ["built%20with-spaCy-09a3d5.svg", "made%20with%20❤%20and-spaCy-09a3d5.svg", "spaCy-v2-09a3d5.svg"]
-+quickstart([{id: "badge", input_style: "check", options: SPACY_BADGES.map(function(badge, i) { return {id: i, title: "", checked: (i == 0) ? true : false}}) }], false, false, true)
- .c-code-block(data-qs-results)
- for badge, i in SPACY_BADGES
- - var url = "https://img.shields.io/badge/" + badge
- +code(false, "text", "star").o-no-block(data-qs-badge=i)=url
- +code(false, "text", "code").o-no-block(data-qs-badge=i).
- <a href="#{SITE_URL}"><img src="#{url}" height="20"></a>
- +code(false, "text", "markdown").o-no-block(data-qs-badge=i).
- [![spaCy](#{url})](#{SITE_URL})
diff --git a/website/docs/usage/text-classification.jade b/website/docs/usage/text-classification.jade
deleted file mode 100644
index 33e384dbd..000000000
--- a/website/docs/usage/text-classification.jade
+++ /dev/null
@@ -1,5 +0,0 @@
-//- π« DOCS > USAGE > TEXT CLASSIFICATION
-
-include ../../_includes/_mixins
-
-+under-construction
diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade
deleted file mode 100644
index 3c74f7a9d..000000000
--- a/website/docs/usage/training-ner.jade
+++ /dev/null
@@ -1,114 +0,0 @@
-include ../../_includes/_mixins
-
-p
- | All #[+a("/docs/usage/models") spaCy models] support online learning, so
- | you can update a pre-trained model with new examples. You can even add
- | new classes to an existing model, to recognise a new entity type,
- | part-of-speech, or syntactic relation. Updating an existing model is
- | particularly useful as a "quick and dirty solution", if you have only a
- | few corrections or annotations.
-
-+h(2, "improving-accuracy") Improving accuracy on existing entity types
-
-p
- | To update the model, you first need to create an instance of
- | #[+api("goldparse") #[code GoldParse]], with the entity labels
- | you want to learn. You'll usually need to provide many examples to
- | meaningfully improve the system β a few hundred is a good start, although
- | more is better.
-
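-p
- | As a minimal sketch (the text, offsets and labels below are made up for
- | illustration, and an #[code nlp] object is assumed):
-
-+code.
- from spacy.gold import GoldParse
-
- doc = nlp.make_doc(u'Facebook released React in 2013.')
- gold = GoldParse(doc, entities=[(0, 8, u'ORG'), (18, 23, u'PRODUCT')])
-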
-+image
- include ../../assets/img/docs/training-loop.svg
- .u-text-right
- +button("/assets/img/docs/training-loop.svg", false, "secondary").u-text-tag View large graphic
-
-p
- | You should avoid iterating over the same few examples multiple times, or
- | the model is likely to "forget" how to annotate other examples. If you
- | iterate over the same few examples, you're effectively changing the loss
- | function. The optimizer will find a way to minimize the loss on your
- | examples, without regard for the consequences on the examples it's no
- | longer paying attention to.
-
-p
- | One way to avoid this "catastrophic forgetting" problem is to "remind"
- | the model of other examples by augmenting your annotations with sentences
- | annotated with entities automatically recognised by the original model.
- | Ultimately, this is an empirical process: you'll need to
- | #[strong experiment on your own data] to find a solution that works best
- | for you.
-
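-p
- | A rough sketch of that augmentation step (#[code original_nlp] and
- | #[code new_annotations] are hypothetical placeholders):
-
-+code.
- revision_texts = [u'Some text the original model already handles well.']
- revision_data = []
- for doc in original_nlp.pipe(revision_texts):
-     entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
-     revision_data.append((doc.text, entities))
-
- # mix the model's own predictions in with your new annotations
- train_data = revision_data + new_annotations
-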
-+h(2, "example") Example
-
-+under-construction
-
-+code.
- import random
- from spacy.lang.en import English
- from spacy.gold import GoldParse, biluo_tags_from_offsets
-
- def main(model_dir=None):
-     train_data = [
-         ('Who is Shaka Khan?',
-          [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]),
-         ('I like London and Berlin.',
-          [(len('I like '), len('I like London'), 'LOC'),
-           (len('I like London and '), len('I like London and Berlin'), 'LOC')])
-     ]
-     nlp = English(pipeline=['tensorizer', 'ner'])
-     get_data = lambda: reformat_train_data(nlp.tokenizer, train_data)
-     optimizer = nlp.begin_training(get_data)
-     for itn in range(100):
-         random.shuffle(train_data)
-         losses = {}
-         for raw_text, entity_offsets in train_data:
-             doc = nlp.make_doc(raw_text)
-             gold = GoldParse(doc, entities=entity_offsets)
-             nlp.update([doc], [gold], drop=0.5, sgd=optimizer, losses=losses)
-     nlp.to_disk(model_dir)
-
-+code.
- def reformat_train_data(tokenizer, examples):
-     """Reformat data to match JSON format."""
-     output = []
-     for i, (text, entity_offsets) in enumerate(examples):
-         doc = tokenizer(text)
-         ner_tags = biluo_tags_from_offsets(doc, entity_offsets)
-         words = [w.text for w in doc]
-         tags = ['-'] * len(doc)
-         heads = [0] * len(doc)
-         deps = [''] * len(doc)
-         sentence = (range(len(doc)), words, tags, heads, deps, ner_tags)
-         output.append((text, [(sentence, [])]))
-     return output
-
-p.u-text-right
- +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary").u-text-tag View full example
-
-+h(2, "saving-loading") Saving and loading
-
-p
- | After training your model, you'll usually want to save its state and load
- | it back later. You can do this with the
- | #[+api("language#to_disk") #[code Language.to_disk()]] method:
-
-+code.
- nlp.to_disk('/home/me/data/en_technology')
-
-p
- | To make the model more convenient to deploy, we recommend wrapping it as
- | a Python package, so that you can install it via pip and load it as a
- | module. spaCy comes with a handy #[+api("cli#package") #[code package]]
- | CLI command to create all required files and directories.
-
-+code(false, "bash").
- spacy package /home/me/data/en_technology /home/me/my_models
-
-p
- | To build the package and create a #[code .tar.gz] archive, run
- | #[code python setup.py sdist] from within its directory.
-
-+infobox("Saving and loading models")
- | For more information and a detailed guide on how to package your model,
- | see the documentation on
- | #[+a("/docs/usage/saving-loading#models") saving and loading models].
diff --git a/website/docs/usage/tutorials.jade b/website/docs/usage/tutorials.jade
deleted file mode 100644
index 2b8eddbf1..000000000
--- a/website/docs/usage/tutorials.jade
+++ /dev/null
@@ -1,38 +0,0 @@
-//- π« DOCS > USAGE > TUTORIALS
-
-include ../../_includes/_mixins
-
-p
- | Have you written a tutorial on spaCy, or did you find one that should be
- | featured here? #[a(href="mailto:#{EMAIL}") Let us know!]
-
-+h(2, "first-steps") First steps
-
-p
- | These tutorials help you get started. They describe how to set up your
- | environment and start using spaCy.
-
-+grid
- each details, title in first_steps
- +card(title, details)
-
-+h(2, "features") Deep dives
-
-p
- | These tutorials take a closer look at particular features of spaCy, or
- | particular types of NLP problems. Most come with more explanatory text,
- | to help introduce you to new concepts.
-
-+grid
- each details, title in deep_dives
- +card(title, details)
-
-+h(2, "code") Programs and scripts
-
-p
- | These tutorials give you all the code and nothing but the code β they're
- | Python scripts you can modify and run.
-
-+grid
- each details, title in code
- +card(title, details)
diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade
deleted file mode 100644
index 6d98e3f05..000000000
--- a/website/docs/usage/v2.jade
+++ /dev/null
@@ -1,531 +0,0 @@
-//- π« DOCS > USAGE > WHAT'S NEW IN V2.0
-
-include ../../_includes/_mixins
-
-p
- | We're very excited to finally introduce spaCy v2.0! On this page, you'll
- | find a summary of the new features, information on the backwards
- | incompatibilities, including a handy overview of what's been renamed or
- | deprecated. To help you make the most of v2.0, we also
- | #[strong re-wrote almost all of the usage guides and API docs], and added
- | more real-world examples. If you're new to spaCy, or just want to brush
- | up on some NLP basics and the details of the library, check out
- | the #[+a("/docs/usage/spacy-101") spaCy 101 guide] that explains the most
- | important concepts with examples and illustrations.
-
-+h(2, "summary") Summary
-
-+grid.o-no-block
- +grid-col("half")
-
- p This release features
- | entirely new #[strong deep learning-powered models] for spaCy's tagger,
- | parser and entity recognizer. The new models are #[strong 20x smaller]
- | than the linear models that have powered spaCy until now: from 300 MB to
- | only 15 MB.
-
- p
- | We've also made several usability improvements that are
- | particularly helpful for #[strong production deployments]. spaCy
- | v2 now fully supports the Pickle protocol, making it easy to use
- | spaCy with #[+a("https://spark.apache.org/") Apache Spark]. The
- | string-to-integer mapping is #[strong no longer stateful], making
- | it easy to reconcile annotations made in different processes.
- | Models are smaller and use less memory, and the APIs for serialization
- | are now much more consistent.
-
- +table-of-contents
- +item #[+a("#summary") Summary]
- +item #[+a("#features") New features]
- +item #[+a("#features-pipelines") Improved processing pipelines]
- +item #[+a("#features-text-classification") Text classification]
- +item #[+a("#features-hash-ids") Hash values instead of integer IDs]
- +item #[+a("#features-serializer") Saving, loading and serialization]
- +item #[+a("#features-displacy") displaCy visualizer]
- +item #[+a("#features-language") Language data and lazy loading]
- +item #[+a("#features-matcher") Revised matcher API]
- +item #[+a("#features-models") Neural network models]
- +item #[+a("#incompat") Backwards incompatibilities]
- +item #[+a("#migrating") Migrating from spaCy v1.x]
- +item #[+a("#benchmarks") Benchmarks]
-
-p
- | The main usability improvements you'll notice in spaCy v2.0 are around
- | #[strong defining, training and loading your own models] and components.
- | The new neural network models make it much easier to train a model from
- | scratch, or update an existing model with a few examples. In v1.x, the
- | statistical models depended on the state of the #[code Vocab]. If you
- | taught the model a new word, you would have to save and load a lot of
- | data β otherwise the model wouldn't correctly recall the features of your
- | new example. That's no longer the case.
-
-p
- | Due to some clever use of hashing, the statistical models
- | #[strong never change size], even as they learn new vocabulary items.
- | The whole pipeline is also now fully differentiable. Even if you don't
- | have explicitly annotated data, you can update spaCy using all the
- | #[strong latest deep learning tricks] like adversarial training, noise
- | contrastive estimation or reinforcement learning.
-
-+h(2, "features") New features
-
-p
- | This section contains an overview of the most important
- | #[strong new features and improvements]. The #[+a("/docs/api") API docs]
- | include additional deprecation notes. New methods and functions that
- | were introduced in this version are marked with a #[+tag-new(2)] tag.
-
-+h(3, "features-pipelines") Improved processing pipelines
-
-+aside-code("Example").
- # Modify an existing pipeline
- nlp = spacy.load('en')
- nlp.pipeline.append(my_component)
-
- # Register a factory to create a component
- spacy.set_factory('my_factory', my_factory)
- nlp = Language(pipeline=['my_factory', mycomponent])
-
-p
- | It's now much easier to #[strong customise the pipeline] with your own
- | components, functions that receive a #[code Doc] object, modify and
- | return it. If your component is stateful, you can define and register a
- | factory which receives the shared #[code Vocab] object and returns a
- | component. spaCy's default components can be added to your pipeline by
- | using their string IDs. This way, you won't have to worry about finding
- | and implementing them β simply add #[code "tagger"] to the pipeline,
- | and spaCy will know what to do.
-
-+image
- include ../../assets/img/docs/pipeline.svg
-
-+infobox
- | #[strong API:] #[+api("language") #[code Language]]
- | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
-
-+h(3, "features-text-classification") Text classification
-
-+aside-code("Example").
- from spacy.lang.en import English
- nlp = English(pipeline=['tensorizer', 'tagger', 'textcat'])
-
-p
- | spaCy v2.0 lets you add text categorization models to spaCy pipelines.
- | The model supports classification with multiple, non-mutually exclusive
- | labels β so multiple labels can apply at once. You can change the model
- | architecture rather easily, but by default, the #[code TextCategorizer]
- | class uses a convolutional neural network to assign position-sensitive
- | vectors to each word in the document.
-
-+infobox
- | #[strong API:] #[+api("textcategorizer") #[code TextCategorizer]],
- | #[+api("doc#attributes") #[code Doc.cats]],
- | #[+api("goldparse#attributes") #[code GoldParse.cats]]#[br]
- | #[strong Usage:] #[+a("/docs/usage/text-classification") Text classification]
-
-+h(3, "features-hash-ids") Hash values instead of integer IDs
-
-+aside-code("Example").
- doc = nlp(u'I love coffee')
- assert doc.vocab.strings[u'coffee'] == 3197928453018144401
- assert doc.vocab.strings[3197928453018144401] == u'coffee'
-
- beer_hash = doc.vocab.strings.add(u'beer')
- assert doc.vocab.strings[u'beer'] == beer_hash
- assert doc.vocab.strings[beer_hash] == u'beer'
-
-p
- | The #[+api("stringstore") #[code StringStore]] now resolves all strings
- | to hash values instead of integer IDs. This means that the string-to-int
- | mapping #[strong no longer depends on the vocabulary state], making a lot
- | of workflows much simpler, especially during training. Unlike integer IDs
- | in spaCy v1.x, hash values will #[strong always match] β even across
- | models. Strings can now be added explicitly using the new
- | #[+api("stringstore#add") #[code Stringstore.add]] method. A token's hash
- | is available via #[code token.orth].
-
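-p
- | For example, a quick way to check this on the example above:
-
-+code("Token hashes").
- doc = nlp(u'I love coffee')
- assert doc[2].orth == doc.vocab.strings[u'coffee']
- assert doc[2].orth_ == u'coffee'
-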
-+infobox
- | #[strong API:] #[+api("stringstore") #[code StringStore]]
- | #[strong Usage:] #[+a("/docs/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
-
-+h(3, "features-serializer") Saving, loading and serialization
-
-+aside-code("Example").
- nlp = spacy.load('en') # shortcut link
- nlp = spacy.load('en_core_web_sm') # package
- nlp = spacy.load('/path/to/en') # unicode path
- nlp = spacy.load(Path('/path/to/en')) # pathlib Path
-
- nlp.to_disk('/path/to/nlp')
- nlp = English().from_disk('/path/to/nlp')
-
-p
- | spaCy's serialization API has been made consistent across classes and
- | objects. All container classes, i.e. #[code Language], #[code Doc],
- | #[code Vocab] and #[code StringStore], now have #[code to_bytes()],
- | #[code from_bytes()], #[code to_disk()] and #[code from_disk()] methods
- | that support the Pickle protocol.
-
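-p
- | For example, a #[code Doc] can be round-tripped through a byte string
- | without touching the file system (a minimal sketch, assuming an
- | #[code nlp] object as above):
-
-+code("Serializing to bytes").
- from spacy.tokens import Doc
-
- doc = nlp(u'Serialize me!')
- doc_bytes = doc.to_bytes()
- new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
- assert new_doc.text == doc.text
-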
-p
- | The improved #[code spacy.load] makes loading models easier and more
- | transparent. You can load a model by supplying its
- | #[+a("/docs/usage/models#usage") shortcut link], the name of an installed
- | #[+a("/docs/usage/saving-loading#generating") model package] or a path.
- | The #[code Language] class to initialise will be determined based on the
- | model's settings. For a blank language, you can import the class directly,
- | e.g. #[code from spacy.lang.en import English].
-
-+infobox
- | #[strong API:] #[+api("spacy#load") #[code spacy.load]], #[+api("binder") #[code Binder]]
- | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
-
-+h(3, "features-displacy") displaCy visualizer with Jupyter support
-
-+aside-code("Example").
- from spacy import displacy
- doc = nlp(u'This is a sentence about Facebook.')
- displacy.serve(doc, style='dep') # run the web server
- html = displacy.render(doc, style='ent') # generate HTML
-
-p
- | Our popular dependency and named entity visualizers are now an official
- | part of the spaCy library! displaCy can run a simple web server, or
- | generate raw HTML markup or SVG files to be exported. You can pass in one
- | or more docs, and customise the style. displaCy also auto-detects whether
- | you're running #[+a("https://jupyter.org") Jupyter] and will render the
- | visualizations in your notebook.
-
-+infobox
- | #[strong API:] #[+api("displacy") #[code displacy]]
- | #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy]
-
-+h(3, "features-language") Improved language data and lazy loading
-
-p
- | Language-specific data now lives in its own submodule, #[code spacy.lang].
- | Languages are lazy-loaded, i.e. only loaded when you import a
- | #[code Language] class, or load a model that initialises one. This allows
- | languages to contain more custom data, e.g. lemmatizer lookup tables, or
- | complex regular expressions. The language data has also been tidied up
- | and simplified. spaCy now also supports simple lookup-based lemmatization.
-
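-p
- | For example, to get a blank English pipeline without loading any model
- | data, you can now import the class directly from the new submodule:
-
-+code("Importing a language class").
- from spacy.lang.en import English
-
- nlp = English()  # language data is loaded when the class is imported
- doc = nlp(u'This is a sentence.')
-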
-+infobox
- | #[strong API:] #[+api("language") #[code Language]]
- | #[strong Code:] #[+src(gh("spaCy", "spacy/lang")) spacy/lang]
- | #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
-
-+h(3, "features-matcher") Revised matcher API
-
-+aside-code("Example").
- from spacy.matcher import Matcher
- matcher = Matcher(nlp.vocab)
- matcher.add('HEARTS', None, [{'ORTH': 'β€οΈ', 'OP': '+'}])
- assert len(matcher) == 1
- assert 'HEARTS' in matcher
-
-p
- | Patterns can now be added to the matcher by calling
- | #[+api("matcher-add") #[code matcher.add()]] with a match ID, an optional
- | callback function to be invoked on each match, and one or more patterns.
- | This allows you to write powerful, pattern-specific logic using only one
- | matcher. For example, you might only want to merge some entity types,
- | and set custom flags for other matched patterns.
-
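-p
- | As a minimal sketch (the match ID, pattern and callback below are just
- | illustrative), an #[code on_match] callback receives the matcher, the
- | #[code Doc], the index of the current match and the list of all matches:
-
-+code("Match callback sketch").
- import spacy
- from spacy.matcher import Matcher
-
- nlp = spacy.load('en')
- matcher = Matcher(nlp.vocab)
-
- def on_match(matcher, doc, i, matches):
-     match_id, start, end = matches[i]
-     print('Matched:', doc[start:end].text)
-
- matcher.add('GREETING', on_match, [{'LOWER': 'hello'}, {'LOWER': 'world'}])
- matches = matcher(nlp(u'hello world!'))
-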
-+infobox
- | #[strong API:] #[+api("matcher") #[code Matcher]]
- | #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
-
-+h(3, "features-models") Neural network models for English, German, French, Spanish and multi-language NER
-
-+aside-code("Example", "bash").
- spacy download en # default English model
- spacy download de # default German model
- spacy download fr # default French model
- spacy download es # default Spanish model
- spacy download xx_ent_wiki_sm # multi-language NER
-
-p
- | spaCy v2.0 comes with new and improved neural network models for English,
- | German, French and Spanish, as well as a multi-language named entity
- | recognition model trained on Wikipedia. #[strong GPU usage] is now
- | supported via #[+a("http://chainer.org") Chainer]'s CuPy module.
-
-+infobox
- | #[strong Details:] #[+a("/docs/api/language-models") Languages],
- | #[+src(gh("spacy-models")) spacy-models]
- | #[strong Usage:] #[+a("/docs/usage/models") Models],
- | #[+a("/docs/usage#gpu") Using spaCy with GPU]
-
-+h(2, "incompat") Backwards incompatibilities
-
-+table(["Old", "New"])
- +row
- +cell
- | #[code spacy.en]
- | #[code spacy.xx]
- +cell
- | #[code spacy.lang.en]
- | #[code spacy.lang.xx]
-
- +row
- +cell #[code orth]
- +cell #[code lang.xx.lex_attrs]
-
- +row
- +cell #[code syntax.iterators]
- +cell #[code lang.xx.syntax_iterators]
-
- +row
- +cell #[code Language.save_to_directory]
- +cell #[+api("language#to_disk") #[code Language.to_disk]]
-
- +row
- +cell #[code Language.create_make_doc]
- +cell #[+api("language#attributes") #[code Language.tokenizer]]
-
- +row
- +cell
- | #[code Vocab.load]
- | #[code Vocab.load_lexemes]
- +cell
- | #[+api("vocab#from_disk") #[code Vocab.from_disk]]
- | #[+api("vocab#from_bytes") #[code Vocab.from_bytes]]
-
- +row
- +cell
- | #[code Vocab.dump]
- +cell
- | #[+api("vocab#to_disk") #[code Vocab.to_disk]]#[br]
- | #[+api("vocab#to_bytes") #[code Vocab.to_bytes]]
-
- +row
- +cell
- | #[code Vocab.load_vectors]
- | #[code Vocab.load_vectors_from_bin_loc]
- +cell
- | #[+api("vectors#from_disk") #[code Vectors.from_disk]]
- | #[+api("vectors#from_bytes") #[code Vectors.from_bytes]]
-
- +row
- +cell
- | #[code Vocab.dump_vectors]
- +cell
- | #[+api("vectors#to_disk") #[code Vectors.to_disk]]
- | #[+api("vectors#to_bytes") #[code Vectors.to_bytes]]
-
- +row
- +cell
- | #[code StringStore.load]
- +cell
- | #[+api("stringstore#from_disk") #[code StringStore.from_disk]]
- | #[+api("stringstore#from_bytes") #[code StringStore.from_bytes]]
-
- +row
- +cell
- | #[code StringStore.dump]
- +cell
- | #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
- | #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
-
- +row
- +cell #[code Tokenizer.load]
- +cell
- | #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
- | #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
-
- +row
- +cell #[code Tagger.load]
- +cell
- | #[+api("tagger#from_disk") #[code Tagger.from_disk]]
- | #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
-
- +row
- +cell #[code DependencyParser.load]
- +cell
- | #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
- | #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
-
- +row
- +cell #[code EntityRecognizer.load]
- +cell
- | #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
- | #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
-
- +row
- +cell #[code Matcher.load]
- +cell -
-
- +row
- +cell
- | #[code Matcher.add_pattern]
- | #[code Matcher.add_entity]
- +cell #[+api("matcher#add") #[code Matcher.add]]
-
- +row
- +cell #[code Matcher.get_entity]
- +cell #[+api("matcher#get") #[code Matcher.get]]
-
- +row
- +cell #[code Matcher.has_entity]
- +cell #[+api("matcher#contains") #[code Matcher.__contains__]]
-
- +row
- +cell #[code Doc.read_bytes]
- +cell #[+api("binder") #[code Binder]]
-
- +row
- +cell #[code Token.is_ancestor_of]
- +cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
-
- +row
- +cell #[code cli.model]
- +cell -
-
-+h(2, "migrating") Migrating from spaCy 1.x
-
-p
- | Because we've made so many architectural changes to the library, we've
- | tried to #[strong keep breaking changes to a minimum]. A lot of projects
- | follow the philosophy that if you're going to break anything, you may as
- | well break everything. We think migration is easier if there's a logic to
- | what has changed.
-
-p
- | We've therefore followed a policy of avoiding breaking changes to the
- | #[code Doc], #[code Span] and #[code Token] objects. This way, you can
- | focus on only migrating the code that does training, loading and
- | serialization β in other words, code that works with the #[code nlp]
- | object directly. Code that uses the annotations should continue to work.
-
-+infobox("Important note")
- | If you've trained your own models, keep in mind that your training and
- | runtime inputs must match. This means you'll have to
- | #[strong retrain your models] with spaCy v2.0.
-
-+h(3, "migrating-saving-loading") Saving, loading and serialization
-
-p
- | Double-check all calls to #[code spacy.load()] and make sure they don't
- | use the #[code path] keyword argument. If you're only loading in binary
- | data and not a model package that can construct its own #[code Language]
- | class and pipeline, you should now use the
- | #[+api("language#from_disk") #[code Language.from_disk()]] method.
-
-+code-new.
- nlp = spacy.load('/model')
- nlp = English().from_disk('/model/data')
-+code-old nlp = spacy.load('en', path='/model')
-
-p
- | Review all other code that writes state to disk or bytes.
- | All containers now share the same, consistent API for saving and
- | loading. Replace saving with #[code to_disk()] or #[code to_bytes()], and
- | loading with #[code from_disk()] or #[code from_bytes()].
-
-+code-new.
- nlp.to_disk('/model')
- nlp.vocab.to_disk('/vocab')
-
-+code-old.
- nlp.save_to_directory('/model')
- nlp.vocab.dump('/vocab')
-
-p
- | If you've trained models with input from v1.x, you'll need to
- | #[strong retrain them] with spaCy v2.0. Models trained with previous
- | versions will not be compatible with the new version.
-
-+h(3, "migrating-strings") Strings and hash values
-
-p
- | The change from integer IDs to hash values may not actually affect your
- | code very much. However, if you're adding strings to the vocab manually,
- | you now need to call #[+api("stringstore#add") #[code StringStore.add()]]
- | explicitly. You can also now be sure that the string-to-hash mapping will
- | always match across vocabularies.
-
-+code-new.
- nlp.vocab.strings.add(u'coffee')
- nlp.vocab.strings[u'coffee'] # 3197928453018144401
- other_nlp.vocab.strings[u'coffee'] # 3197928453018144401
-
-+code-old.
- nlp.vocab.strings[u'coffee'] # 3672
- other_nlp.vocab.strings[u'coffee'] # 40259
-
-+h(3, "migrating-languages") Processing pipelines and language data
-
-p
- | If you're importing language data or #[code Language] classes, make sure
- | to change your import statements to import from #[code spacy.lang]. If
- | you've added your own custom language, it needs to be moved to
- | #[code spacy/lang/xx] and adjusted accordingly.
-
-+code-new from spacy.lang.en import English
-+code-old from spacy.en import English
-
-p
- | If you've been using custom pipeline components, check out the new
- | guide on #[+a("/docs/usage/language-processing-pipelines") processing pipelines].
- | Appending functions to the pipeline still works β but you might be able
- | to make this more convenient by registering "component factories".
- | Components of the processing pipeline can now be disabled by passing a
- | list of their names to the #[code disable] keyword argument on loading
- | or processing.
-
-+code-new.
- nlp = spacy.load('en', disable=['tagger', 'ner'])
- doc = nlp(u"I don't want parsed", disable=['parser'])
-+code-old.
- nlp = spacy.load('en', tagger=False, entity=False)
- doc = nlp(u"I don't want parsed", parse=False)
-
-+h(3, "migrating-matcher") Adding patterns and callbacks to the matcher
-
-p
- | If you're using the matcher, you can now add patterns in one step. This
- | should be easy to update β simply merge the ID, callback and patterns
- | into one call to #[+api("matcher#add") #[code Matcher.add()]].
-
-+code-new.
- matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
-
-+code-old.
- matcher.add_entity('GoogleNow', on_match=merge_phrases)
- matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
-
-p
- | If you've been using #[strong acceptor functions], you'll need to move
- | this logic into the
- | #[+a("/docs/usage/rule-based-matching#on_match") #[code on_match] callbacks].
- | The callback function is invoked on every match and gives you access to
- | the #[code Doc], the index of the current match and the list of all
- | matches. This lets you accept or reject each match, and define the
- | actions to be triggered.
-
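-p
- | A rough sketch of how acceptor-style logic could move into the callback
- | (the condition below is just a placeholder): matches you would previously
- | have rejected are simply ignored.
-
-+code("Acceptor logic in on_match").
- def on_match(matcher, doc, i, matches):
-     match_id, start, end = matches[i]
-     if end - start < 2:
-         return  # "reject" the match by doing nothing
-     # otherwise, act on the match, e.g. merge or flag the span
-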
-+h(2, "benchmarks") Benchmarks
-
-+under-construction
-
-+aside("Data sources")
- | #[strong Parser, tagger, NER:] #[+a("https://www.gabormelli.com/RKB/OntoNotes_Corpus") OntoNotes 5]#[br]
- | #[strong Word vectors:] #[+a("http://commoncrawl.org") Common Crawl]#[br]
-
-p The evaluation was conducted on raw text with no gold standard information.
-
-+table(["Model", "Version", "Type", "UAS", "LAS", "NER F", "POS", "w/s"])
- mixin benchmark-row(name, details, values, highlight, style)
- +row(style)
- +cell #[code=name]
- for cell in details
- +cell=cell
- for cell, i in values
- +cell.u-text-right
- if highlight && highlight[i]
- strong=cell
- else
- !=cell
-
- +benchmark-row("en_core_web_sm", ["2.0.0", "neural"], ["91.2", "89.2", "82.6", "96.6", "10,300"], [1, 1, 1, 0, 0])
- +benchmark-row("en_core_web_sm", ["1.2.0", "linear"], ["86.6", "83.8", "78.5", "96.6", "25,700"], [0, 0, 0, 0, 1], "divider")
- +benchmark-row("en_core_web_md", ["1.2.1", "linear"], ["90.6", "88.5", "81.4", "96.7", "18,800"], [0, 0, 0, 1, 0])
diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade
deleted file mode 100644
index 96a6bd49f..000000000
--- a/website/docs/usage/visualizers.jade
+++ /dev/null
@@ -1,384 +0,0 @@
-//- π« DOCS > USAGE > VISUALIZERS
-
-include ../../_includes/_mixins
-
-p
- | As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy]
- | and #[+a(DEMOS_URL + "/displacy-ent") displaCy #[sup ENT]] are finally an
- | official part of the library. Visualizing a dependency parse or named
- | entities in a text is not only a fun NLP demo β it can also be incredibly
- | helpful in speeding up development and debugging your code and training
- | process. Instead of printing a list of dependency labels or entity spans,
- | you can simply pass your #[code Doc] objects to #[code displacy] and view
- | the visualizations in your browser, or export them as HTML files or
- | vector graphics.
-
-p
- | If you're running a #[+a("https://jupyter.org") Jupyter] notebook,
- | displaCy will detect this and return the markup in a format
- | #[+a("#jupyter") ready to be rendered and exported].
-
-+aside("What about the old visualizers?")
- | Our JavaScript-based visualizers #[+src(gh("displacy")) displacy.js] and
- | #[+src(gh("displacy-ent")) displacy-ent.js] will still be available on
- | GitHub. If you're looking to implement web-based visualizations, we
- | generally recommend using those instead of spaCy's built-in
- | #[code displacy] module. It'll allow your application to perform all
- | rendering on the client and only rely on the server for the text
- | processing. The generated markup is also more compatible with modern web
- | standards.
-
-+h(2, "getting-started") Getting started
- +tag-new(2)
-
-p
- | The quickest way to visualize a #[code Doc] is to use
- | #[+api("displacy#serve") #[code displacy.serve]]. This will spin up a
- | simple web server and let you view the result straight from your browser.
- | displaCy can either take a single #[code Doc] or a list of #[code Doc]
- | objects as its first argument. This lets you construct them however you
- | like β using any model or modifications you need.
-
-+h(3, "dep") Visualizing the dependency parse
-
-p
- | The dependency visualizer, #[code dep], shows part-of-speech tags
- | and syntactic dependencies.
-
-+code("Dependency example").
- import spacy
- from spacy import displacy
-
- nlp = spacy.load('en')
- doc = nlp(u'This is a sentence.')
- displacy.serve(doc, style='dep')
-
-+codepen("f0e85b64d469d6617251d8241716d55f", 370)
-
-p
- | The argument #[code options] lets you specify a dictionary of settings
- | to customise the layout, for example:
-
-+aside("Important note")
- | There's currently a known issue with the #[code compact] mode for
- | sentences with short arrows and long dependency labels, which causes labels
- | longer than the arrow to wrap. So if you come across this problem,
- | especially when using custom labels, you'll have to increase the
- | #[code distance] setting in the #[code options] to allow longer arcs.
-
-+table(["Name", "Type", "Description", "Default"])
- +row
- +cell #[code compact]
- +cell bool
- +cell "Compact mode" with square arrows that takes up less space.
- +cell #[code False]
-
- +row
- +cell #[code color]
- +cell unicode
- +cell Text color (HEX, RGB or color names).
- +cell #[code '#000000']
-
- +row
- +cell #[code bg]
- +cell unicode
- +cell Background color (HEX, RGB or color names).
- +cell #[code '#ffffff']
-
- +row
- +cell #[code font]
- +cell unicode
- +cell Font name or font family for all text.
- +cell #[code 'Arial']
-
-p
- | For a list of all available options, see the
- | #[+api("displacy#options") #[code displacy] API documentation].
-
-+aside-code("Options example").
- options = {'compact': True, 'bg': '#09a3d5',
- 'color': 'white', 'font': 'Source Sans Pro'}
- displacy.serve(doc, style='dep', options=options)
-
-+codepen("39c02c893a84794353de77a605d817fd", 360)
-
-+h(3, "ent") Visualizing the entity recognizer
-
-p
- | The entity visualizer, #[code ent], highlights named entities and
- | their labels in a text.
-
-+code("Named Entity example").
- import spacy
- from spacy import displacy
-
- text = """But Google is starting from behind. The company made a late push
- into hardware, and Appleβs Siri, available on iPhones, and Amazonβs Alexa
- software, which runs on its Echo and Dot devices, have clear leads in
- consumer adoption."""
-
- nlp = spacy.load('custom_ner_model')
- doc = nlp(text)
- displacy.serve(doc, style='ent')
-
-+codepen("a73f8b68f9af3157855962b283b364e4", 345)
-
-p The entity visualizer lets you customise the following #[code options]:
-
-+table(["Name", "Type", "Description", "Default"])
- +row
- +cell #[code ents]
- +cell list
- +cell
- | Entity types to highlight (#[code None] for all types).
- +cell #[code None]
-
- +row
- +cell #[code colors]
- +cell dict
- +cell
- | Color overrides. Entity types in uppercase should be mapped to
- | color names or values.
- +cell #[code {}]
-
-p
- | If you specify a list of #[code ents], only those entity types will be
- | rendered β for example, you can choose to display #[code PERSON] entities.
- | Internally, the visualizer knows nothing about available entity types and
- | will render whichever spans and labels it receives. This makes it
- | especially easy to work with custom entity types. By default, displaCy
- | comes with colours for all
- | #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy].
- | If you're using custom entity types, you can use the #[code colors]
- | setting to add your own colours for them.
-
-+aside-code("Options example").
- colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
- options = {'ents': ['ORG'], 'colors': colors}
- displacy.serve(doc, style='ent', options=options)
-
-+codepen("f42ec690762b6f007022a7acd6d0c7d4", 300)
-
-p
- | The above example uses a little trick: Since the background colour values
- | are added as the #[code background] style attribute, you can use any
- | #[+a("https://tympanus.net/codrops/css_reference/background/") valid background value]
- | or shorthand β including gradients and even images!
-
-+h(3, "ent-titles") Adding titles to documents
-
-p
- | Rendering several large documents on one page can easily become confusing.
- | To add a headline to each visualization, you can add a #[code title] to
- | its #[code user_data]. User data is never touched or modified by spaCy.
-
-+code.
- doc = nlp(u'This is a sentence about Google.')
- doc.user_data['title'] = 'This is a title'
- displacy.serve(doc, style='ent')
-
-p
- | This feature is especially handy if you're using displaCy to compare
- | performance at different stages of a process, e.g. during training. Here
- | you could use the title for a brief description of the text example and
- | the number of iterations.
-
-+h(2, "render") Rendering visualizations
-
-p
- | If you don't need the web server and just want to generate the markup
- | β for example, to export it to a file or serve it in a custom
- | way β you can use #[+api("displacy#render") #[code displacy.render]].
- | It works the same way, but returns a string containing the markup.
-
-+code("Example").
- import spacy
- from spacy import displacy
-
- nlp = spacy.load('en')
- doc1 = nlp(u'This is a sentence.')
- doc2 = nlp(u'This is another sentence.')
- html = displacy.render([doc1, doc2], style='dep', page=True)
-
-p
- | #[code page=True] renders the markup wrapped as a full HTML page.
- | For minified and more compact HTML markup, you can set #[code minify=True].
- | If you're rendering a dependency parse, you can also export it as an
- | #[code .svg] file.
-
-+aside("What's SVG?")
- | Unlike other image formats, SVG (Scalable Vector Graphics) uses XML
- | markup that's easy to manipulate
- | #[+a("https://www.smashingmagazine.com/2014/11/styling-and-animating-svgs-with-css/") using CSS] or
- | #[+a("https://css-tricks.com/smil-is-dead-long-live-smil-a-guide-to-alternatives-to-smil-features/") JavaScript].
- | Essentially, SVG lets you design with code, which makes it a perfect fit
- | for visualizing dependency trees. SVGs can be embedded online in an
- | #[code <img>] tag, or inlined in an HTML document. They're also
- | pretty easy to #[+a("https://convertio.co/image-converter/") convert].
-
-+code.
- svg = displacy.render(doc, style='dep')
- output_path = Path('/images/sentence.svg')
- output_path.open('w', encoding='utf-8').write(svg)
-
-+infobox("Important note")
- | Since each visualization is generated as a separate SVG, exporting
- | #[code .svg] files only works if you're rendering #[strong one single doc]
- | at a time. (This makes sense β after all, each visualization should be
- | a standalone graphic.) So instead of rendering all #[code Doc]s at once,
- | loop over them and export them separately.
-
-
-+h(3, "examples-export-svg") Example: Export SVG graphics of dependency parses
-
-+code("Example").
- import spacy
- from spacy import displacy
- from pathlib import Path
-
- nlp = spacy.load('en')
- sentences = ["This is an example.", "This is another one."]
- for sent in sentences:
- doc = nlp(sent)
- svg = displacy.render(doc, style='dep')
- file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
- output_path = Path('/images/' + file_name)
- output_path.open('w', encoding='utf-8').write(svg)
-
-p
- | The above code will generate the dependency visualizations and write them to
- | two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
-
-
-+h(2, "jupyter") Using displaCy in Jupyter notebooks
-
-p
- | displaCy is able to detect whether you're working in a
- | #[+a("https://jupyter.org") Jupyter] notebook, and will return markup
- | that can be rendered in a cell straight away. When you export your
- | notebook, the visualizations will be included as HTML.
-
-+code("Jupyter Example").
- # don't forget to install a model, e.g.: spacy download en
- import spacy
- from spacy import displacy
-
- nlp = spacy.load('en')
- doc = nlp(u'Rats are various medium-sized, long-tailed rodents.')
- displacy.render(doc, style='dep')
-
- doc2 = nlp(LONG_NEWS_ARTICLE)
- displacy.render(doc2, style='ent')
-
-+aside("Enabling or disabling Jupyter mode")
- | To explicitly enable or disable "Jupyter mode", you can use the
- | #[code jupyter] keyword argument β e.g. to return raw HTML in a notebook,
- | or to force Jupyter rendering if auto-detection fails.
-
-+image("/assets/img/docs/displacy_jupyter.jpg", 700, false, "Example of using the displaCy dependency and named entity visualizer in a Jupyter notebook")
-
-p
- | Internally, displaCy imports #[code display] and #[code HTML] from
- | #[code IPython.core.display] and returns a Jupyter HTML object. If you
- | were doing it manually, it'd look like this:
-
-+code.
- from IPython.core.display import display, HTML
-
- html = displacy.render(doc, style='dep')
- return display(HTML(html))
-
-+h(2, "manual-usage") Rendering data manually
-
-p
- | You can also use displaCy to manually render data. This can be useful if
- | you want to visualize output from other libraries, like
- | #[+a("http://www.nltk.org") NLTK] or
- | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet].
- | Simply convert the dependency parse or recognised entities to displaCy's
- | format and set #[code manual=True] on either #[code render()] or
- | #[code serve()].
-
-+aside-code("Example").
- ex = [{'text': 'But Google is starting from behind.',
- 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
- 'title': None}]
- html = displacy.render(ex, style='ent', manual=True)
-
-+code("DEP input").
- {
- 'words': [
- {'text': 'This', 'tag': 'DT'},
- {'text': 'is', 'tag': 'VBZ'},
- {'text': 'a', 'tag': 'DT'},
- {'text': 'sentence', 'tag': 'NN'}],
- 'arcs': [
- {'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
- {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
- {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
- }
-
-+code("ENT input").
- {
- 'text': 'But Google is starting from behind.',
- 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
- 'title': None
- }
-
-+h(2, "webapp") Using displaCy in a web application
-
-p
- | If you want to use the visualizers as part of a web application, for
- | example to create something like our
- | #[+a(DEMOS_URL + "/displacy") online demo], it's not recommended to
- | simply wrap and serve the displaCy renderer. Instead, you should only
- | rely on the server for spaCy's processing capabilities, and use
- | #[+a(gh("displacy")) displaCy.js] to render the JSON-formatted output.
-
-+aside("Why not return the HTML by the server?")
- | It's certainly possible to just have your server return the markup.
- | But outputting raw, unsanitised HTML is risky and makes your app vulnerable to
- | #[+a("https://en.wikipedia.org/wiki/Cross-site_scripting") cross-site scripting]
- | (XSS). All your user needs to do is find a way to make spaCy return text
- | like #[code <script src="malicious-code.js"></script>], which
- | is pretty easy in NER mode. Instead of relying on the server to render
- | and sanitise HTML, you can do this on the client in JavaScript.
- | displaCy.js creates the markup as DOM nodes and will never insert raw
- | HTML.
-
-p
- | The #[code parse_deps] function takes a #[code Doc] object and returns
- | a dictionary in a format that can be rendered by displaCy.
-
-+code("Example").
- import spacy
- from spacy import displacy
-
- nlp = spacy.load('en')
-
- def displacy_service(text):
- doc = nlp(text)
- return displacy.parse_deps(doc)
-
-p
- | Using a library like #[+a("https://falconframework.org/") Falcon] or
- | #[+a("http://www.hug.rest/") Hug], you can easily turn the above code
- | into a simple REST API that receives a text and returns a JSON-formatted
- | parse. In your front-end, include #[+a(gh("displacy")) displacy.js] and
- | initialise it with the API URL and the ID or query selector of the
- | container to render the visualisation in, e.g. #[code '#displacy'] for
- | #[code <div id="displacy">].
-
-+code("script.js", "javascript").
- var displacy = new displaCy('http://localhost:8080', {
- container: '#displacy'
- })
-
- function parse(text) {
- displacy.parse(text);
- }
-
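-p
- | On the server side, a rough sketch using Hug could look like this (the
- | file name, endpoint and port are just placeholders):
-
-+code("displacy_service.py").
- import hug
- import spacy
- from spacy import displacy
-
- nlp = spacy.load('en')
-
- @hug.post('/dep')
- def dep(text):
-     doc = nlp(text)
-     return displacy.parse_deps(doc)
-
- # e.g. run with: hug -f displacy_service.py -p 8080
-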
-p
- | When you call #[code parse()], it will make a request to your API,
- | receive the JSON-formatted parse and render it in your container. To
- | create an interactive experience, you could trigger this function by
- | a button and read the text from an #[code <input>] field.
diff --git a/website/index.jade b/website/index.jade
index 9336d5c34..0155ab295 100644
--- a/website/index.jade
+++ b/website/index.jade
@@ -8,61 +8,48 @@ include _includes/_mixins
| Natural Language#[br]
| Processing
- h2.c-landing__title.o-block.u-heading-1
- | in Python
+ h2.c-landing__title.o-block.u-heading-3
+ span.u-text-label.u-text-label--light in Python
- +landing-badge(gh("spaCy") + "/releases/tag/v2.0.0-alpha", "v2alpha", "Try spaCy v2.0.0 alpha!")
++grid.o-content.c-landing__blocks
+ +grid-col("third").c-landing__card.o-card.o-grid.o-grid--space
+ +h(3) Fastest in the world
+ p
+ | spaCy excels at large-scale information extraction tasks.
+ | It's written from the ground up in carefully memory-managed
+ | Cython. Independent research has confirmed that spaCy is
+ | the fastest in the world. If your application needs to
+ | process entire web dumps, spaCy is the library you want to
+ | be using.
- +grid.o-content
- +grid-col("third").o-card
- +h(2) Fastest in the world
- p
- | spaCy excels at large-scale information extraction tasks.
- | It's written from the ground up in carefully memory-managed
- | Cython. Independent research has confirmed that spaCy is
- | the fastest in the world. If your application needs to
- | process entire web dumps, spaCy is the library you want to
- | be using.
+ +button("/usage/facts-figures", true, "primary")
+ | Facts & figures
- +button("/docs/api", true, "primary")
- | Facts & figures
+ +grid-col("third").c-landing__card.o-card.o-grid.o-grid--space
+ +h(3) Get things done
+ p
+ | spaCy is designed to help you do real work β to build real
+ | products, or gather real insights. The library respects
+ | your time, and tries to avoid wasting it. It's easy to
+ | install, and its API is simple and productive. We like to
+ | think of spaCy as the Ruby on Rails of Natural Language
+ | Processing.
- +grid-col("third").o-card
- +h(2) Get things done
- p
- | spaCy is designed to help you do real work β to build real
- | products, or gather real insights. The library respects
- | your time, and tries to avoid wasting it. It's easy to
- | install, and its API is simple and productive. I like to
- | think of spaCy as the Ruby on Rails of Natural Language
- | Processing.
+ +button("/usage", true, "primary")
+ | Get started
- +button("/docs/usage", true, "primary")
- | Get started
+ +grid-col("third").c-landing__card.o-card.o-grid.o-grid--space
+ +h(3) Deep learning
+ p
+ | spaCy is the best way to prepare text for deep learning.
+ | It interoperates seamlessly with TensorFlow, PyTorch,
+ | scikit-learn, Gensim and the
+ | rest of Python's awesome AI ecosystem. spaCy helps you
+ | connect the statistical models trained by these libraries
+ | to the rest of your application.
- +grid-col("third").o-card
- +h(2) Deep learning
- p
- | spaCy is the best way to prepare text for deep learning.
- | It interoperates seamlessly with
- | #[+a("https://www.tensorflow.org") TensorFlow],
- | #[+a("https://keras.io") Keras],
- | #[+a("http://scikit-learn.org") Scikit-Learn],
- | #[+a("https://radimrehurek.com/gensim") Gensim] and the
- | rest of Python's awesome AI ecosystem. spaCy helps you
- | connect the statistical models trained by these libraries
- | to the rest of your application.
-
- +button("/docs/usage/deep-learning", true, "primary")
- | Read more
-
-.o-inline-list.o-block.u-border-bottom.u-text-small.u-text-center.u-padding-small
- +a(gh("spaCy") + "/releases")
- strong.u-text-label.u-color-subtle #[+icon("code", 18)] Latest release:
- | v#{SPACY_VERSION}
-
- if LATEST_NEWS
- +a(LATEST_NEWS.url) #[+tag.o-icon New!] #{LATEST_NEWS.title}
+ +button("/usage/deep-learning", true, "primary")
+ | Read more
.o-content
+grid
@@ -92,67 +79,77 @@ include _includes/_mixins
+h(2) Features
+list
+item Non-destructive #[strong tokenization]
- +item Syntax-driven sentence segmentation
+ +item Support for #[strong #{LANG_COUNT}+ languages]
+ +item #[strong #{MODEL_COUNT} statistical models] for #{MODEL_LANG_COUNT} languages
+item Pre-trained #[strong word vectors]
+ +item Easy #[strong deep learning] integration
+item Part-of-speech tagging
+item #[strong Named entity] recognition
+item Labelled dependency parsing
+ +item Syntax-driven sentence segmentation
+ +item Built in #[strong visualizers] for syntax and NER
+item Convenient string-to-hash mapping
+item Export to numpy data arrays
- +item GIL-free #[strong multi-threading]
+item Efficient binary serialization
- +item Easy #[strong deep learning] integration
- +item Statistical models for #[strong English] and #[strong German]
+ +item Easy #[strong model packaging] and deployment
+item State-of-the-art speed
+item Robust, rigorously evaluated accuracy
++landing-banner("Convolutional neural network models", "New in v2.0")
+ p
+ | spaCy v2.0 features new neural models for #[strong tagging],
+ | #[strong parsing] and #[strong entity recognition]. The models have
+ | been designed and implemented from scratch specifically for spaCy, to
+ | give you an unmatched balance of speed, size and accuracy. A novel
+ | bloom embedding strategy with subword features is used to support
+ | huge vocabularies in tiny tables. Convolutional layers with residual
+ | connections, layer normalization and maxout non-linearity are used,
+ | giving much better efficiency than the standard BiLSTM solution.
+ | Finally, the parser and NER use an imitation learning objective to
+ | deliver accuracy in-line with the latest research systems,
+ | even when evaluated from raw text. With these innovations, spaCy
+ | v2.0's models are #[strong 10× smaller],
+ | #[strong 20% more accurate], and #[strong just as fast] as the
+ | previous generation.
+
+ .o-block-small.u-text-right
+ +button("/models", true, "secondary-light") Download models
+
++landing-logos("spaCy is trusted by", logos)
+ +button(gh("spacy") + "/stargazers", false, "secondary", "small")
+ | and many more
+
++landing-logos("Featured on", features).o-block-small
+
++landing-banner("Prodigy: Radically efficient machine teaching", "From the makers of spaCy")
+ p
+ | Prodigy is an #[strong annotation tool] so efficient that data scientists can
+ | do the annotation themselves, enabling a new level of rapid
+ | iteration. Whether you're working on entity recognition, intent
+ | detection or image classification, Prodigy can help you
+ | #[strong train and evaluate] your models faster. Stream in your own examples or
+ | real-world data from live APIs, update your model in real-time and
+ | chain models together to build more complex systems.
+
+ .o-block-small.u-text-right
+ +button("https://prodi.gy", true, "secondary-light") Try it out
+
+.o-content
+ +grid
+ +grid-col("half")
+ +h(2) Benchmarks
+
+ p
+ | In 2015, independent researchers from Emory University and
+ | Yahoo! Labs showed that spaCy offered the
+ | #[strong fastest syntactic parser in the world] and that its
+ | accuracy was #[strong within 1% of the best] available
+ | (#[+a("https://aclweb.org/anthology/P/P15/P15-1038.pdf") Choi et al., 2015]).
+ | spaCy v2.0, released in 2017, is more accurate than any of
+ | the systems Choi et al. evaluated.
+
.o-inline-list
- +button("/docs/usage/lightning-tour", true, "secondary")
- | See examples
+ +button("/usage/facts-figures#benchmarks", true, "secondary") See details
- .o-block.u-text-center.u-padding
- h3.u-text-label.u-color-subtle.o-block spaCy is trusted by
-
- each row in logos
- +grid("center").o-inline-list
- each details, name in row
- +a(details[0])
- img(src="/assets/img/logos/#{name}.png" alt=name width=(details[1] || 150)).u-padding-small
-
-.u-pattern.u-padding
- +grid.o-card.o-content
- +grid-col("quarter")
- img(src="/assets/img/profile_matt.png" width="280")
-
- +grid-col("three-quarters")
- +h(2) What's spaCy all about?
-
- p
- | By 2014, I'd been publishing NLP research for about 10
- | years. During that time, I saw a huge gap open between the
- | technology that Google-sized companies could take to market,
- | and what was available to everyone else. This was especially
- | clear when companies started trying to use my research. Like
- | most researchers, my work was free to read, but expensive to
- | apply. You could run my code, but its requirements were
- | narrow. My code's mission in life was to print results
- | tables for my papers β it was good at this job, and bad at
- | all others.
-
- p
- | spaCy's #[+a("/docs/api/philosophy") mission] is to make
- | cutting-edge NLP practical and commonly available. That's
- | why I left academia in 2014, to build a production-quality
- | open-source NLP library. It's why
- | #[+a("https://twitter.com/_inesmontani") Ines] joined the
- | project in 2015, to build visualisations, demos and
- | annotation tools that make NLP technologies less abstract
- | and easier to use. Together, we've founded
- | #[+a(COMPANY_URL, true) Explosion AI], to develop data packs
- | you can drop into spaCy to extend its capabilities. If
- | you're processing Hindi insurance claims, you need a model
- | for that. We can build it for you.
-
- .o-block
- +a("https://twitter.com/honnibal")
- +svg("graphics", "matt-signature", 60, 45).u-color-theme
+ +grid-col("half")
+ include usage/_facts-figures/_benchmarks-choi-2015
diff --git a/website/models/_data.json b/website/models/_data.json
new file mode 100644
index 000000000..b2898be8a
--- /dev/null
+++ b/website/models/_data.json
@@ -0,0 +1,99 @@
+{
+ "sidebar": {
+ "Models": {
+ "Overview": "./"
+ },
+
+ "Language models": {
+ "English": "en",
+ "German": "de",
+ "Spanish": "es",
+ "French": "fr",
+ "Multi-Language": "xx"
+ }
+ },
+
+ "index": {
+ "title": "Models Overview",
+ "teaser": "Downloadable statistical models for spaCy to predict and assign linguistic features.",
+ "quickstart": true,
+ "menu": {
+ "Quickstart": "quickstart",
+ "Installation": "install",
+ "Naming Conventions": "conventions"
+ }
+ },
+
+ "MODELS": {
+ "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
+ "de": ["de_dep_news_sm"],
+ "es": ["es_core_web_sm"],
+ "fr": [],
+ "xx": ["xx_ent_wiki_sm"]
+ },
+
+ "MODEL_META": {
+ "core": "Vocabulary, syntax, entities, vectors",
+ "dep": "Vocabulary, syntax",
+ "ent": "Named entities",
+ "vectors": "Word vectors",
+ "web": "written text (blogs, news, comments)",
+ "news": "written text (news, media)",
+ "wiki": "Wikipedia",
+ "uas": "Unlabelled dependencies",
+ "las": "Labelled dependencies",
+ "tags_acc": "Part-of-speech tags",
+ "ents_f": "Entities (F-score)",
+ "ents_p": "Entities (precision)",
+ "ents_r": "Entities (recall)",
+ "pipeline": "Processing pipeline components in order",
+ "sources": "Sources of training data"
+ },
+
+ "MODEL_LICENSES": {
+ "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/",
+ "CC BY-SA 3.0": "https://creativecommons.org/licenses/by-sa/3.0/",
+ "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/",
+ "CC BY-NC 3.0": "https://creativecommons.org/licenses/by-nc/3.0/"
+ },
+
+ "MODEL_ACCURACY": {
+ "uas": "UAS",
+ "las": "LAS",
+ "tags_acc": "POS",
+ "ents_f": "NER F",
+ "ents_p": "NER P",
+ "ents_r": "NER R"
+ },
+
+ "LANGUAGES": {
+ "en": "English",
+ "de": "German",
+ "fr": "French",
+ "es": "Spanish",
+ "it": "Italian",
+ "pt": "Portuguese",
+ "nl": "Dutch",
+ "sv": "Swedish",
+ "fi": "Finnish",
+ "nb": "Norwegian BokmΓ₯l",
+ "da": "Danish",
+ "hu": "Hungarian",
+ "pl": "Polish",
+ "he": "Hebrew",
+ "bn": "Bengali",
+ "id": "Indonesian",
+ "th": "Thai",
+ "zh": "Chinese",
+ "ja": "Japanese",
+ "xx": "Multi-language"
+ },
+
+ "EXAMPLE_SENTENCES": {
+ "en": "This is a sentence.",
+ "de": "Dies ist ein Satz.",
+ "fr": "C'est une phrase.",
+ "es": "Esto es una frase.",
+ "xx": "This is a sentence about Facebook."
+ }
+}
diff --git a/website/models/de.jade b/website/models/de.jade
new file mode 100644
index 000000000..113290b7a
--- /dev/null
+++ b/website/models/de.jade
@@ -0,0 +1,6 @@
+//- π« DOCS > MODELS > DE
+
+include ../_includes/_mixins
+
+//- This is a placeholder. The page is rendered via the template at
+//- /_includes/_page-model.jade.
diff --git a/website/models/en.jade b/website/models/en.jade
new file mode 100644
index 000000000..4f400662b
--- /dev/null
+++ b/website/models/en.jade
@@ -0,0 +1,6 @@
+//- π« DOCS > MODELS > EN
+
+include ../_includes/_mixins
+
+//- This is a placeholder. The page is rendered via the template at
+//- /_includes/_page-model.jade.
diff --git a/website/models/es.jade b/website/models/es.jade
new file mode 100644
index 000000000..7aad72e81
--- /dev/null
+++ b/website/models/es.jade
@@ -0,0 +1,6 @@
+//- π« DOCS > MODELS > ES
+
+include ../_includes/_mixins
+
+//- This is a placeholder. The page is rendered via the template at
+//- /_includes/_page-model.jade.
diff --git a/website/models/fr.jade b/website/models/fr.jade
new file mode 100644
index 000000000..1b3cc3fde
--- /dev/null
+++ b/website/models/fr.jade
@@ -0,0 +1,6 @@
+//- π« DOCS > MODELS > FR
+
+include ../_includes/_mixins
+
+//- This is a placeholder. The page is rendered via the template at
+//- /_includes/_page-model.jade.
diff --git a/website/models/index.jade b/website/models/index.jade
new file mode 100644
index 000000000..8f9aae739
--- /dev/null
+++ b/website/models/index.jade
@@ -0,0 +1,98 @@
+//- π« DOCS > MODELS
+
+include ../_includes/_mixins
+
++section("quickstart")
+ p
+ | spaCy v2.0 features new neural models for #[strong tagging],
+ | #[strong parsing] and #[strong entity recognition]. The models have
+ | been designed and implemented from scratch specifically for spaCy, to
+ | give you an unmatched balance of speed, size and accuracy. A novel
+ | bloom embedding strategy with subword features is used to support
+ | huge vocabularies in tiny tables. Convolutional layers with residual
+ | connections, layer normalization and maxout non-linearity are used,
+ | giving much better efficiency than the standard BiLSTM solution. For
+ | more details, see the notes on the
+ | #[+a("/api/#nn-models") model architecture].
+
+ p
+ | The parser and NER use an imitation learning objective to
+ | deliver #[strong accuracy in-line with the latest research systems],
+ | even when evaluated from raw text. With these innovations, spaCy
+ | v2.0's models are #[strong 10× smaller],
+ | #[strong 20% more accurate], and #[strong just as fast] as the
+ | previous generation.
+
+ include ../usage/_models/_quickstart
+
++section("install")
+ +h(2, "install") Installation & Usage
+
+ include ../usage/_models/_install-basics
+
+ +infobox
+ | For more details on how to use models with spaCy, see the
+ | #[+a("/usage/models") usage guide on models].
+
++section("conventions")
+ +h(2, "model-naming") Model naming conventions
+
+ p
+ | In general, spaCy expects all model packages to follow the naming
+ | convention of #[code [lang]_[name]]. For spaCy's models, we also
+ | chose to divide the name into three components:
+
+ +table
+ +row
+ +cell #[+label Type]
+ +cell
+ | Model capabilities (e.g. #[code core] for general-purpose
+ | model with vocabulary, syntax, entities and word vectors, or
+ | #[code depent] for only vocab, syntax and entities).
+ +row
+ +cell #[+label Genre]
+ +cell
+ | Type of text the model is trained on, e.g. #[code web] or
+ | #[code news].
+ +row
+ +cell #[+label Size]
+ +cell Model size indicator, #[code sm], #[code md] or #[code lg].
+
+ p
+ | For example, #[code en_core_web_sm] is a small English model trained
+ | on written web text (blogs, news, comments), that includes
+ | vocabulary, vectors, syntax and entities.
+
+ +h(3, "model-versioning") Model versioning
+
+ p
+ | Additionally, the model versioning reflects both the compatibility
+ | with spaCy, as well as the major and minor model version. A model
+ | version #[code a.b.c] translates to:
+
+ +table
+ +row
+ +cell #[code a]
+ +cell
+ | #[strong spaCy major version]. For example, #[code 2] for
+ | spaCy v2.x.
+ +row
+ +cell #[code b]
+ +cell
+ | #[strong Model major version]. Models with a different major
+ | version can't be loaded by the same code. For example,
+ | changing the width of the model, adding hidden layers or
+ | changing the activation changes the model major version.
+ +row
+ +cell #[code c]
+ +cell
+ | #[strong Model minor version]. Same model structure, but
+ | different parameter values, e.g. from being trained on
+ | different data, for different numbers of iterations, etc.
+
+ p
+ | For a detailed compatibility overview, see the
+ | #[+a(gh("spacy-models", "compatibility.json")) #[code compatibility.json]]
+ | in the models repository. This is also the source of spaCy's internal
+ | compatibility check, performed when you run the
+ | #[+api("cli#download") #[code download]] command.
diff --git a/website/models/xx.jade b/website/models/xx.jade
new file mode 100644
index 000000000..8967f38fa
--- /dev/null
+++ b/website/models/xx.jade
@@ -0,0 +1,6 @@
+//- π« DOCS > MODELS > XX
+
+include ../_includes/_mixins
+
+//- This is a placeholder. The page is rendered via the template at
+//- /_includes/_page-model.jade.
diff --git a/website/package.json b/website/package.json
new file mode 100644
index 000000000..c86aca222
--- /dev/null
+++ b/website/package.json
@@ -0,0 +1,20 @@
+{
+ "name": "spacy.io",
+ "private": true,
+ "version": "2.0.0",
+ "description": "spacy.io website",
+ "author": "Explosion AI",
+ "license": "MIT",
+ "devDependencies": {
+ "babel-cli": "^6.14.0",
+ "harp": "^0.24.0",
+ "uglify-js": "^2.7.3"
+ },
+ "dependencies": {},
+ "scripts": {
+ "compile": "NODE_ENV=deploy harp compile",
+ "compile_js": "babel www/assets/js/main.js --out-file www/assets/js/main.js --presets=es2015",
+ "uglify": "uglifyjs www/assets/js/main.js --output www/assets/js/main.js",
+ "build": "npm run compile && npm run compile_js && npm run uglify"
+ }
+}
diff --git a/website/styleguide.jade b/website/styleguide.jade
new file mode 100644
index 000000000..42e70ed73
--- /dev/null
+++ b/website/styleguide.jade
@@ -0,0 +1,623 @@
+//- π« STYLEGUIDE
+
+include _includes/_mixins
+
++section("intro")
+ p
+ | This styleguide is loosely based on the concept and principles of
+ | #[+a("http://bradfrost.com/blog/post/atomic-web-design/") Atomic Design].
+ | The templates consist of small elements (atoms) which are combined
+ | and connected to form larger molecules and full components. The site
+ | is compiled using #[+a("http://harpjs.com/") Harp], a static web
+ | server with built-in preprocessing. Templates are written entirely in
+ | #[+a("http://jade-lang.com") Jade] (aka. Pug), a clean,
+ | whitespace-sensitive templating language that compiles to HTML.
+ | CSS is written in #[+a("http://sass-lang.com") Sass] and preprocessed
+ | via Harp, JavaScript is written in ES6 syntax and compiled using
+ | #[+a("https://babeljs.io") Babel].
+
++section("logo")
+ +h(2, "logo", "website/assets/img/logo.svg") Logo
+
+ p
+ | If you would like to use the spaCy logo on your site, please get in
+ | touch and ask us first. However, if you want to show support and tell
+ | others that your project is using spaCy, you can grab one of our
+ | #[+a("/usage/spacy-101#faq-project-with-spacy") spaCy badges].
+
+ +grid
+ each color in [["#09a3d5", "#fff"], ["#fff", "#09a3d5"]]
+ +grid-col("half").o-box.u-text-center.u-padding-medium(style="background: #{color[1]}; color: #{color[0]}")
+ +icon("spacy", 338, 108)(style="max-width: 100%")
+
++section("colors")
+ +h(2, "colors", "website/assets/css/_variables.sass") Colors
+
+ +grid
+ each color, label in {"dark": "#1a1e23", "medium": "#45505e", "light": "#dddddd", "faint": "#f6f6f6", "blue": "#09a3d5", "dark blue": "#077ea4", "green": "#05b083", "dark green": "#047e5e"}
+ +grid-col("quarter").u-text-small.o-card
+ div(style="height: 75px; background: #{color}; border-top-left-radius: 6px; border-top-right-radius: 6px")
+ .u-text-center.u-padding-medium
+ +label=label
+ code=color
+
+ each pattern in ["blue", "green"]
+ +grid-col("half").u-text-small.o-card
+ div(style="background: url('/assets/img/pattern_#{pattern}.jpg') center/100% repeat; height: 125px; border-top-left-radius: 6px; border-top-right-radius: 6px")
+ .u-text-center.u-padding-medium
+ +label #{pattern} pattern
+ .u-text-tiny.u-color-subtle by #[+a("https://dribbble.com/kemal").u-color-dark Kemal Şanlı]
+
++section("typography")
+ +h(2, "typography") Typography
+
+ +aside-code("Usage", "jade").
+ +h(2) Headline two
+ +h(3, "some-id") Headline three
+
+ p
+ | Headlines are set in
+ | #[+a("http://cargocollective.com/hanken/HK-Grotesk-Open-Source-Font") HK Grotesk]
+ | by Hanken Design. All other body text and code uses the best-matching
+ | default system font to provide a "native" reading experience.
+
+ each heading in [0, 1, 2, 3, 4, 5]
+ .o-block-small(class="u-heading-" + heading) Heading #{heading}
+ +label Label
+
++section("elements")
+ +h(2, "elements", "website/_includes/_mixins.jade") Elements
+
+ p
+ | The site comes with a collection of simple content elements,
+ | implemented as mixins. These elements can be used individually, or as
+ | part of larger components.
+
+ +h(3, "text-links") Special text & links
+
+ +aside-code("Usage", "jade").
+ +api("token") #[code Token]
+ +src("https://github.com") GitHub source
+ +help("Help text here")
+ +fn(1, "bibliography")
+
+ p
+ | Special link styles are implemented as mixins and can be used to
+ | mark links to the API documentation, or links to source code.
+ | Additionally a "help" icon can be added to provide more information
+ | via a tooltip.
+
+ p.o-inline-list
+ +a("#") Link
+ code Inline Code
+ +api("token") #[code Token]
+ +src(gh("spacy")) Source
+ span.u-color-dark.u-nowrap Help #[+help("Help text here")]
+ span Footnote#[+fn(1, "", "This marks a footnote and can link to a section")]
+
+ +h(3, "buttons") Buttons
+
+ +aside-code("Usage", "jade").
+ +button("https://spacy.io", true, "secondary")
+ +button("https://spacy.io", true, "primary", "small")
+
+ p
+ | Link buttons come in two variants, #[code primary] and
+ | #[code secondary] and two sizes, with an optional #[code small] size
+ | modifier. Since they're mostly used as enhanced links, the buttons are
+ | implemented as styled links instead of native button elements.
+
+ p.o-inline-list
+ +button("#", false, "primary") Primary
+ +button("#", false, "secondary") Secondary
+ +button("#", false, "primary", "small") Primary small
+ +button("#", false, "secondary", "small") Secondary small
+
+ +h(3, "tags") Tags
+
+ +aside-code("Usage", "jade").
+ +tag I'm a tag
+ +tag-new(2)
+ +tag-model("Named entities")
+
+ p
+ | Tags can be used together with headlines, or next to properties
+ | across the documentation, and combined with tooltips to provide
+ | additional information. The #[code +tag-new] mixin takes a version
+ | number and can mark new features. Using the mixin, visibility of this
+ | tag can be toggled once the feature isn't considered new anymore.
+ | The #[code +tag-model] mixin takes a description of model
+ | capabilities and can be used to mark features that require a
+ | respective model to be installed.
+
+ p.o-inline-list
+ +tag I'm a tag
+ +tag-new(2)
+ +tag-model("Named entities")
+
+ +h(3, "icons", "website/_includes/_svg.jade") Icons
+
+ +aside-code("Usage", "jade").
+ +icon("github", 18)
+
+ p
+ | Icons are implemented via an SVG sprite and can be included as a
+ | mixin, using their name and an optional size value in #[code px].
+
+ +infobox.u-text-center
+ each icon in ["code", "arrow-right", "book", "circle", "chat", "star", "help", "accept", "reject", "markdown", "course", "github", "jupyter"]
+ .u-inline-block.u-padding-small.u-color-dark(data-tooltip=icon data-tooltip-style="code" aria-label=icon)
+ +icon(icon, 20)
+
++section("components")
+ +h(2, "components", "website/_includes/_mixins.jade") Components
+
+ p
+ | The site uses a collection of Jade mixins to make it easy to use
+ | complex content elements across templates and blog posts. To read
+ | more about the concept of modular markup components, check out our
+ | #[+a("https://explosion.ai/blog/modular-markup", true) blog post] on
+ | the subject.
+
+ +h(3, "grid") Grid
+
+ +aside-code("Usage", "jade").
+ +grid
+ +grid-col("half") Half
+ +grid-col("half") Half
+
+ p
+ | For now, the grid is still implemented as a standard #[code flexbox]
+ | grid, although it may be refactored to use CSS #[code grid] going
+ | forward. The grid supports up to four columns and collapses on
+ | small screens.
+
+ +grid
+ each count, label in {"full": 1, "half": 2, "third": 3, "quarter": 4}
+ each _ in Array(count)
+ +grid-col(label).o-box.u-text-center.u-text-label.u-color-dark=label
+
+ +h(3, "table") Table
+
+ +aside-code("Usage", "jade").
+ +table(["Header 1", "Header 2"])
+ +row
+ +cell Cell
+ +cell Cell
+
+ p
+ | Tables are used to present data and API documentation. If a list of
+ | headings is specified, those will be rendered as the table header.
+ | An optional #[code +row("foot")] can be used to mark a footer row
+ | with a distinct style, for example to visualise the return values
+ | of a documented function.
+
+ - var table_cols = ["Header 1", "Header 2", "Header 3"]
+ +table(table_cols)
+ each row, i in Array(4)
+ +row((i == 3) ? "foot" : null)
+ each col, j in table_cols
+ +cell
+ if i == 3 && j == 0
+ | Footer
+
+ else
+ | Row #{i + 1}, cell #{j + 1}
+
+ +h(3, "list") List
+
+ +aside-code("Usage", "jade").
+ +list("numbers", 3)
+ +item List item
+ +item List item
+
+ p
+ | Lists are available as bulleted, numbered, lettered and lower roman.
+ | Optionally, a start index can be defined as the second argument
+ | on ordered lists.
+
+ +grid
+ +list
+ +item I am a bulleted list
+ +item I have nice bullets
+ +item Lorem ipsum dolor
+ +item consectetur adipiscing elit
+
+ +list("numbers")
+ +item I am an ordered list
+ +item I have nice numbers
+ +item Lorem ipsum dolor
+ +item consectetur adipiscing elit
+
+ +list("numbers", 10)
+ +item I am a numbered list
+ +item with a custom start number
+ +item Lorem ipsum dolor
+ +item consectetur adipiscing elit
+
+ +list("letters")
+ +item I am an ordered list
+ +item I have uppercase letters
+ +item Lorem ipsum dolor
+ +item consectetur adipiscing elit
+
+ +list("letters", 18)
+ +item I am an ordered list
+ +item with a custom start letter
+ +item Lorem ipsum dolor
+ +item consectetur adipiscing elit
+
+ +list("roman")
+ +item I am an ordered list
+ +item I have roman numerals
+ +item Lorem ipsum dolor
+ +item consectetur adipiscing elit
+
+ +h(3, "code") Code
+
+ +aside-code("Usage", "jade").
+ +code("Label", "python").
+ import spacy
+ nlp = spacy.load('en')
+ doc = nlp(u"This is a sentence.")
+
+ p
+ | Code blocks use the #[+a("http://prismjs.com/") Prism] syntax
+ | highlighter with a custom theme. The language can be set individually
+ | on each block, and defaults to Python. An optional label can be
+ | added as the first argument, which is displayed above the block.
+ | When using the #[code +code] mixin, don't forget to append a period
+ | #[code .] to the mixin call. This tells Jade to interpret the
+ | indented block as plain text and preserve whitespace.
+
+ +code("Using spaCy").
+ import spacy
+ nlp = spacy.load('en')
+ doc = nlp(u"This is a sentence.")
+
+ +h(3, "aside") Aside
+
+ +aside-code("Usage", "jade").
+ +aside("Title") This is an aside
+ +aside-code("Title", "python").
+ nlp = spacy.load('en')
+
+ p
+ | Asides can be used to display additional notes and content in the
+ | right-hand column. Two mixins are available: #[code +aside] for
+ | regular text with an optional title, #[code +aside-code], which
+ | roughly mimics the #[code +code] component. Visually, asides are
+ | moved to the side on the X-axis, and displayed at the same level
+ | they were inserted. On small screens, they collapse and are rendered
+ | in their original position, in between the text.
+
+ +h(3, "infobox") Infobox
+
+ +aside-code("Usage", "jade").
+ +infobox("Label") This is text.
+        +infobox("Label", "⚠️") This is text.
+
+ p
+ | Infoboxes can be used to add notes, updates, warnings or additional
+ | information to a page or section. Semantically, they're implemented
+ | and interpreted as an #[code aside] element. Since infobox titles
+ | are especially nice with emoji, an emoji can be specified as the
+ | second argument for optimal rendering and spacing.
+
+ +infobox("Infobox label") Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque enim ante, pretium a orci eget, varius dignissim augue. Nam eu dictum mauris, id tincidunt nisi. Integer commodo pellentesque tincidunt.
+
+    +infobox("Infobox label with emoji", "⚠️") Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque enim ante, pretium a orci eget, varius dignissim augue. Nam eu dictum mauris, id tincidunt nisi. Integer commodo pellentesque tincidunt.
+
+ +h(3, "card") Card
+
+ +aside-code("Usage", "jade").
+ +grid
+ +card("Title", "https://", "Author", "github")
+ | Card content goes here
+ p
+ | Cards can be used to present external content and links, like GitHub
+ | projects, websites, books or articles. They can take an optional
+ | value for the content author and icon, which is displayed in the
+ | corner. The content supplied via an indented block can also include
+ | formatting or other elements like images. Under the hood, cards are
+ | styled grid columns and should therefore always be used as children
+ | of #[code +grid].
+
+ +grid
+ +card("spaCy", "https://github.com/explosion/spaCy", "Explosion AI", "github")
+ | An open-source library for industrial-strength Natural Language
+ | Processing in Python.
+
+ +card("Prodigy", "https://prodi.gy", "Explosion AI", "star")
+ | A new annotation tool for radically efficient machine teaching,
+ | powered by active learning.
+
++section("embeds")
+ +h(2, "embeds") Embeds
+
+ p
+ | The framework also allows embedding content from selected sites via
+ | mixins, usually styled wrappers for the respective embed codes.
+
+ +h(3, "codepen") CodePen
+
+ p
+ | #[+a("https://codepen.io") CodePen] is a platform to share and
+ | collaborate on front-end code. It comes with a powerful live editor,
+ | and is mostly used on this site to present visualizations created by
+ | spaCy's built-in visualizers. Embeds use a
+ | #[+a("https://blog.codepen.io/documentation/pro-features/unlimited-embed-themes/") custom theme]
+ | and are included using a mixin that takes the pen ID, and an optional
+ | height to prevent content reflow on load.
+
+ +aside-code("Usage", "jade").
+ +codepen("2f2ad1408ff79fc6a326ea3aedbb353b", 160)
+
+ +codepen("2f2ad1408ff79fc6a326ea3aedbb353b", 160)
+
+ +h(3, "github") GitHub
+
+ p
+ | GitHub only allows native embedding of Gists, but Gists are only
+ | available for users, not organisations. So in order to be able to
+ | embed examples from spaCy's #[+src(gh("spacy", "examples")) examples],
+ | we ended up developing our own micro library. A #[code data-gh-embed]
+ | attribute on the code block, set via the mixin, specifies the file
+ | to load. The script then fetches the raw text via the GitHub API and
+ | renders it in the container. This way, the example previews on the
+ | site are always in sync with the examples in the repository.
+
+ +aside-code("Usage", "jade").
+ +github("spacy", "examples/training/train_textcat.py")
+
+ +github("spacy", "examples/training/train_textcat.py")
+
++section("markup")
+ +h(2, "markup") Markup reference
+
+ p
+ | The spaCy website is implemented
+ | in #[+a("https://www.jade-lang.org") Jade (aka Pug)], and is built or
+        | served by #[+a("https://harpjs.com") Harp]. Jade is an extensible
+        | templating language with a readable syntax that compiles to HTML.
+ | The website source makes extensive use of Jade mixins, so that the
+ | design system is abstracted away from the content you're writing. You
+ | can read more about our approach in our blog post,
+ | #[+a("https://explosion.ai/blog/modular-markup", true) "Rebuilding a Website with Modular Markup"].
+
+ +code("Viewing the site locally", "bash").
+ sudo npm install --global harp
+ git clone #{gh("spacy")}
+ cd spacy/website
+ harp server --port 9000
+
+ +h(3, "jade") Jade conventions
+
+ p
+ | Jade/Pug is a whitespace-sensitive markup language that compiles to
+ | HTML. Indentation is used to nest elements, and for template logic,
+ | like #[code if], #[code else] or #[code for], mainly used to iterate
+ | over objects and arrays in the meta data. It also allows inline
+ | JavaScript expressions.
+
+ +grid.o-no-block
+ +grid-col("half")
+ +code("Input", "jade").
+ ul#some-id
+ for item in ['a', 'b', 'c']
+ li.test=item.toUpperCase()
+ if item == 'a'
+ | π
+
+ +grid-col("half")
+ +code("Output", "markup").
+ <ul id="some-id">
+                <li class="test">A π</li>
+                <li class="test">B</li>
+                <li class="test">C</li>
+ </ul>
+
+ p
+ | For an overview of Harp and Jade, see
+ | #[+a("https://ines.io/blog/the-ultimate-guide-static-websites-harp-jade") this blog post].
+ | For more info on the Jade/Pug syntax, check out their
+ | #[+a("https://pugjs.org") documentation]. In the spacy.io source, we
+ | use 4 spaces to indent and hard-wrap at 80 characters.
+
+ +code(false, "jade").
+ p This is a very short paragraph. It stays inline.
+
+ p
+ | This is a much longer paragraph. It's hard-wrapped at 80 characters to
+ | make it easier to read on GitHub and in editors that do not have soft
+ | wrapping enabled. To prevent Jade from interpreting each line as a new
+ | element, it's prefixed with a pipe and two spaces. This ensures that no
+            | spaces are dropped – for example, if your editor strips out trailing
+ | whitespace by default. Inline links are added using the inline syntax,
+ | like this: #[+a("https://google.com") Google].
+
+ +aside("Plain HTML elements used")
+ +list.o-no-block
+ +item #[code p]: Regular paragraph.
+ +item #[code code]: Inline #[code code].
+ +item #[code em]: #[em Italicized] text.
+ +item #[code strong]: #[strong Bold] text.
+
+ p
+ | Note that for external links, #[code +a("...")] is used instead
+        | of #[code a(href="...")] – it's a mixin that takes care of adding all
+ | required attributes. If possible, always use a mixin instead of
+ | regular HTML elements. With a few exceptions for practical reasons,
+ | class names and other HTML attributes should
+ | #[strong only live in mixins] and not in the site content.
+
+ +infobox("Mixins documentation")
+ | For a more detailed overview and API documentation of the available
+ | mixins and their arguments, see the source of the
+ | #[+src(gh("spacy", "website/_includes/_mixins.jade")) #[code _includes/_mixins.jade]]
+ | file.
+
+ +h(3, "directory-structure") Directory structure
+
+ p
+ | Each section is represented by its own subdirectory, containing a
+ | #[code _data.json] to store its meta information. All #[code .jade]
+ | files that are not prefixed with an underscore are later converted to
+ | #[code .html]. Site assets like images, styles, fonts and scripts are
+        | loaded from the #[code assets] directory. Global variables like titles,
+ | navigations, URLs and other settings are defined in the global
+ | #[code _harp.json].
+
+ +code("website", "yaml").
+        ├── _includes              # layout partials, shared mixins, functions
+        ├── api
+        |   ├── _data.json         # meta data for API section
+        |   └── ...                # other pages and partials
+        ├── assets
+        |   ├── css                # Sass styles, will be converted to CSS
+        |   ├── fonts              # web fonts
+        |   ├── img                # images and icons
+        |   └── js                 # scripts, custom and third-party
+        ├── models
+        |   ├── _data.json         # model meta data and meta for models section
+        |   └── ...                # other pages and partials
+        ├── usage
+        |   ├── _data.json         # meta data for usage section
+        |   └── ...                # other pages and partials
+        ├── _data.json             # meta data for pages in the root
+        ├── _harp.json             # global site configuration and variables
+        ├── _layout.jade           # global layout
+        ├── 404.jade               # 404 page
+        └── index.jade             # landing page
+
+ +h(3, "data-structure") Data structure
+
+ p
+ | While all page content lives in the #[code .jade] files, article meta
+ | (page titles, sidebars etc.) is stored as JSON. Each folder contains
+ | a #[code _data.json] with all required meta for its files. Meta
+ | information is keyed by the page's filename or slug, and becomes
+ | available to the templates as variables. The #[code menu] specifies
+ | the sub-navigation in the sidebar and maps titles to section IDs.
+
+ +code(false, "json").
+ "resources": {
+ "title": "Resources",
+ "teaser": "Libraries, demos, books, courses and research systems featuring spaCy.",
+ "menu": {
+ "Third-party libraries": "libraries",
+ "Demos & Visualizations": "demos",
+ "Books & Courses": "books",
+ "Jupyter Notebooks": "notebooks",
+ "Research": "research"
+ }
+ }
+
+ p
+ | Long pages with multiple sections are often split into separate
+ | partials that live in their own subdirectory. Those partials can be
+ | included on the page, and if needed, across the site to avoid content
+ | duplication. Partials and partial directories are prefixed with an
+ | underscore #[code _] to prevent Harp from building them as separate
+ | files.
+
+ +code("spacy-101.jade", "jade").
+ +section("architecture")
+ +h(2, "architecture") Architecture
+ include _spacy-101/_architecture
+
+ +h(3, "model-data", "website/models/_data.json") Model data
+
+ p
+ | The new #[+a("/models") models directory] uses the GitHub API to
+ | fetch meta information from the latest
+ | #[+a(gh("spacy-models") + "/releases") model releases]. This ensures
+ | that the website is always up to date. However, some details, like
+ | human-readable descriptions and the list of available models and
+        | languages, are stored in the static CMS and used across the site.
+        | This info only lives in one place, #[code models/_data.json].
+        | Wherever possible, the model info is generated dynamically – for
+ | example, in installation examples, quickstart widgets and even in the
+ | total model and language count on the landing page.
+
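+    p
+        | As an illustration of how that lookup works in principle (this is
+        | #[strong not] the site's actual client-side code), the meta data of
+        | the latest model release can be fetched from the public GitHub API
+        | like this:
+
+    +code("Fetching release meta (illustration)").
+        import requests
+
+        url = 'https://api.github.com/repos/explosion/spacy-models/releases/latest'
+        release = requests.get(url).json()
+        print(release['tag_name'])    # version tag of the latest model release
+        print(release['name'])        # human-readable release title
+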
+ p
+ | The following data is stored and made available in the global scope:
+
+ +table(["Variable", "Description", "Example"])
+ +row
+ +cell #[code LANGUAGES]
+ +cell All languages supported by spaCy, code mapped to name.
+ +cell
+ +code(false, "json").o-no-block "en": "English"
+
+ +row
+ +cell #[code MODELS]
+ +cell Model names (without version). Language codes mapped to list of names.
+ +cell
+ +code(false, "json").o-no-block "xx": ["xx_ent_wiki_sm"]
+
+ +row
+ +cell #[code MODEL_META]
+ +cell Description for model name components and meta data, ID mapped to string.
+ +cell
+ +code(false, "json").o-no-block "vectors": "Word vectors"
+
+ +row
+ +cell #[code MODEL_LICENSES]
+ +cell License types mapped to license URL.
+ +cell
+ +code(false, "json").o-no-block "CC BY-SA 3.0": "http://..."
+
+ +row
+ +cell #[code MODEL_ACCURACY]
+ +cell Display labels for accuracy keys.
+ +cell
+ +code(false, "json").o-no-block "ents_f": "NER F"
+
+ +row
+ +cell #[code EXAMPLE_SENTENCES]
+ +cell Example sentences for different languages.
+ +cell
+ +code(false, "json").o-no-block "es": "Esto es una frase."
+
+ +h(3, "functions", "website/_includes/_functions.jade") Template functions
+
+ p
+ | Jade allows you to implement any custom logic as inline JavaScript
+ | expressions. Reusable functions are organised in a
+ | #[code _functions.jade], which is included via the mixins file and
+ | makes them accessible on each page. However, most functions deal
+ | with internals only, e.g. prefixing class names in mixins or
+ | converting paths and links.
+
+ +h(4, "gh") gh
+ +tag function
+
+ p
+ | Since GitHub links can be long and tricky, this function takes care
+        | of generating them automatically for spaCy and all repositories owned
+ | by the #[+a(gh())=SOCIAL.github] organisation.
+
+ +aside-code("Example", "jade").
+ +a(gh("spacy", "spacy/language.py")) This is a link
+
+ +table(["Name", "Type", "Description"])
+ +row
+ +cell #[code repo]
+ +cell String
+ +cell Name of the repository, e.g. #[code "spacy"].
+
+ +row
+ +cell #[code filepath]
+ +cell String
+ +cell Logical path to the file, relative to the repository root.
+
+ +row
+ +cell #[code branch]
+ +cell String
+ +cell Optional branch. Defaults to #[code "master"].
+
+ +row("foot")
+ +cell returns
+ +cell String
+ +cell The full GitHub link to the file.
diff --git a/website/docs/usage/adding-languages.jade b/website/usage/_adding-languages/_language-data.jade
similarity index 62%
rename from website/docs/usage/adding-languages.jade
rename to website/usage/_adding-languages/_language-data.jade
index b341c9f9b..81a6d638e 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/usage/_adding-languages/_language-data.jade
@@ -1,58 +1,4 @@
-//- π« DOCS > USAGE > ADDING LANGUAGES
-
-include ../../_includes/_mixins
-
-p
- | Adding full support for a language touches many different parts of the
- | spaCy library. This guide explains how to fit everything together, and
- | points you to the specific workflows for each component.
-
-+aside("Working on spaCy's source")
- | To add a new language to spaCy, you'll need to
- | #[strong modify the library's code]. The easiest way to do this is to
- | clone the #[+src(gh("spaCy")) repository] and #[strong build spaCy from source].
- | For more information on this, see the #[+a("/docs/usage") installation guide].
- | Unlike spaCy's core, which is mostly written in Cython, all language
- | data is stored in regular Python files. This means that you won't have to
- | rebuild anything in between β you can simply make edits and reload spaCy
- | to test them.
-
-+grid.o-no-block
- +grid-col("half")
- p
- | Obviously, there are lots of ways you can organise your code when
- | you implement your own language data. This guide will focus on
- | how it's done within spaCy. For full language support, you'll
- | need to create a #[code Language] subclass, define custom
- | #[strong language data], like a stop list and tokenizer
- | exceptions and test the new tokenizer. Once the language is set
- | up, you can #[strong build the vocabulary], including word
- | frequencies, Brown clusters and word vectors. Finally, you can
- | #[strong train the tagger and parser], and save the model to a
- | directory.
-
- p
- | For some languages, you may also want to develop a solution for
- | lemmatization and morphological analysis.
-
- +table-of-contents
- +item #[+a("#101") Language data 101]
- +item #[+a("#language-subclass") The Language subclass]
- +item #[+a("#stop-words") Stop words]
- +item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
- +item #[+a("#norm-exceptions") Norm exceptions]
- +item #[+a("#lex-attrs") Lexical attributes]
- +item #[+a("#syntax-iterators") Syntax iterators]
- +item #[+a("#lemmatizer") Lemmatizer]
- +item #[+a("#tag-map") Tag map]
- +item #[+a("#morph-rules") Morph rules]
- +item #[+a("#testing") Testing the tokenizer]
- +item #[+a("#vocabulary") Building the vocabulary]
- +item #[+a("#training") Training]
-
-+h(2, "101") Language data 101
-
-include _spacy-101/_language-data
+//- 💫 DOCS > USAGE > ADDING LANGUAGES > LANGUAGE DATA
p
| The individual components #[strong expose variables] that can be imported
@@ -137,7 +83,7 @@ p
+aside("Should I ever update the global data?")
| Reuseable language data is collected as atomic pieces in the root of the
- | #[+src(gh("spaCy", "lang")) spacy.lang] package. Often, when a new
+ | #[+src(gh("spaCy", "lang")) #[code spacy.lang]] package. Often, when a new
| language is added, you'll find a pattern or symbol that's missing. Even
| if it isn't common in other languages, it might be best to add it to the
| shared language data, unless it has some conflicting interpretation. For
@@ -150,14 +96,14 @@ p
| needs to know the language's character set. If the language you're adding
| uses non-latin characters, you might need to add the required character
| classes to the global
- | #[+src(gh("spacy", "spacy/lang/char_classes.py")) char_classes.py].
+ | #[+src(gh("spacy", "spacy/lang/char_classes.py")) #[code char_classes.py]].
| spaCy uses the #[+a("https://pypi.python.org/pypi/regex/") #[code regex] library]
| to keep this simple and readable. If the language requires very specific
| punctuation rules, you should consider overwriting the default regular
| expressions with your own in the language's #[code Defaults].
-+h(2, "language-subclass") Creating a #[code Language] subclass
++h(3, "language-subclass") Creating a #[code Language] subclass
p
| Language-specific code and resources should be organised into a
@@ -250,7 +196,7 @@ p
+h(3, "tokenizer-exceptions") Tokenizer exceptions
p
- | spaCy's #[+a("/docs/usage/customizing-tokenizer#how-tokenizer-works") tokenization algorithm]
+ | spaCy's #[+a("/usage/linguistic-features#how-tokenizer-works") tokenization algorithm]
| lets you deal with whitespace-delimited chunks separately. This makes it
| easy to define special-case rules, without worrying about how they
| interact with the rest of the tokenizer. Whenever the key string is
@@ -284,7 +230,7 @@ p
| efficiently and make your data less verbose. How you do this ultimately
| depends on the language. Here's an example of how exceptions for time
| formats like "1a.m." and "1am" are generated in the English
- | #[+src(gh("spaCy", "spacy/en/lang/tokenizer_exceptions.py")) tokenizer_exceptions.py]:
+ | #[+src(gh("spaCy", "spacy/en/lang/tokenizer_exceptions.py")) #[code tokenizer_exceptions.py]]:
+code("tokenizer_exceptions.py (excerpt)").
# use short, internal variable for readability
@@ -376,7 +322,7 @@ p
p
| Norm exceptions can be provided as a simple dictionary. For more examples,
| see the English
- | #[+src(gh("spaCy", "spacy/lang/en/norm_exceptions.py")) norm_exceptions.py].
+ | #[+src(gh("spaCy", "spacy/lang/en/norm_exceptions.py")) #[code norm_exceptions.py]].
+code("Example").
NORM_EXCEPTIONS = {
@@ -428,7 +374,7 @@ p
p
| Here's an example from the English
- | #[+src(gh("spaCy", "spacy/en/lang/lex_attrs.py")) lex_attrs.py]:
+ | #[+src(gh("spaCy", "spacy/en/lang/lex_attrs.py")) #[code lex_attrs.py]]:
+code("lex_attrs.py").
_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
@@ -466,7 +412,7 @@ p
| Syntax iterators are functions that compute views of a #[code Doc]
| object based on its syntax. At the moment, this data is only used for
| extracting
- | #[+a("/docs/usage/dependency-parse#noun-chunks") noun chunks], which
+ | #[+a("/usage/linguistic-features#noun-chunks") noun chunks], which
| are available as the #[+api("doc#noun_chunks") #[code Doc.noun_chunks]]
| property. Because base noun phrases work differently across languages,
| the rules to compute them are part of the individual language's data. If
@@ -479,13 +425,14 @@ p
assert chunks[0].text == "A phrase"
assert chunks[1].text == "another phrase"
-+table(["Language", "Source"])
- for lang, lang_id in {en: "English", de: "German", es: "Spanish"}
++table(["Language", "Code", "Source"])
+ for lang in ["en", "de", "fr", "es"]
+row
- +cell=lang
+ +cell=LANGUAGES[lang]
+ +cell #[code=lang]
+cell
- +src(gh("spaCy", "spacy/lang/" + lang_id + "/syntax_iterators.py"))
- | lang/#{lang_id}/syntax_iterators.py
+ +src(gh("spaCy", "spacy/lang/" + lang + "/syntax_iterators.py"))
+ code lang/#{lang}/syntax_iterators.py
+h(3, "lemmatizer") Lemmatizer
@@ -547,7 +494,7 @@ p
| #[+a("http://universaldependencies.org/u/pos/all.html") Universal Dependencies]
| tags. Optionally, you can also include morphological features or other
| token attributes in the tag map as well. This allows you to do simple
- | #[+a("/docs/usage/pos-tagging#rule-based-morphology") rule-based morphological analysis].
+ | #[+a("/usage/linguistic-features#rule-based-morphology") rule-based morphological analysis].
+code("Example").
from ..symbols import POS, NOUN, VERB, DET
@@ -560,233 +507,62 @@ p
+h(3, "morph-rules") Morph rules
-+under-construction
+p
+ | The morphology rules let you set token attributes such as lemmas, keyed
+ | by the extended part-of-speech tag and token text. The morphological
+ | features and their possible values are language-specific and based on the
+ | #[+a("http://universaldependencies.org") Universal Dependencies scheme].
-+h(2, "testing") Testing the new language tokenizer
+
++code("Example").
+ from ..symbols import LEMMA
+
+ MORPH_RULES = {
+ "VBZ": {
+ "am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
+ "are": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"},
+ "is": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"},
+ "'re": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"},
+ "'s": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"}
+ }
+ }
p
- | Before using the new language or submitting a
- | #[+a(gh("spaCy") + "/pulls") pull request] to spaCy, you should make sure
- | it works as expected. This is especially important if you've added custom
- | regular expressions for token matching or punctuation β you don't want to
- | be causing regressions.
+ | In the example of #[code "am"], the attributes look like this:
-+aside("spaCy's test suite")
- | spaCy uses the #[+a("https://docs.pytest.org/en/latest/") pytest framework]
- | for testing. For more details on how the tests are structured and best
- | practices for writing your own tests, see our
- | #[+a(gh("spaCy", "spacy/tests")) tests documentation].
++table(["Attribute", "Description"])
+ +row
+ +cell #[code LEMMA: "be"]
+ +cell Base form, e.g. "to be".
-+h(3, "testing-tokenizer") Testing the basic tokenizer
+ +row
+ +cell #[code "VerbForm": "Fin"]
+ +cell
+ | Finite verb. Finite verbs have a subject and can be the root of
+            | an independent clause – "I am." is a valid, complete
+ | sentence.
-p
- | The easiest way to test your new tokenizer is to run the
- | language-independent "tokenizer sanity" tests located in
- | #[+src(gh("spaCy", "spacy/tests/tokenizer")) tests/tokenizer]. This will
- | test for basic behaviours like punctuation splitting, URL matching and
- | correct handling of whitespace. In the
- | #[+src(gh("spaCy", "spacy/tests/conftest.py")) conftest.py], add the new
- | language ID to the list of #[code _languages]:
+ +row
+ +cell #[code "Person": "One"]
+ +cell First person, i.e. "#[strong I] am".
-+code.
- _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
- 'nl', 'pl', 'pt', 'sv', 'xx'] # new language here
+ +row
+ +cell #[code "Tense": "Pres"]
+ +cell
+ | Present tense, i.e. actions that are happening right now or
+ | actions that usually happen.
-+aside-code("Global tokenizer test example").
- # use fixture by adding it as an argument
- def test_with_all_languages(tokenizer):
- # will be performed on ALL language tokenizers
- tokens = tokenizer(u'Some text here.')
+ +row
+ +cell #[code "Mood": "Ind"]
+ +cell
+ | Indicative, i.e. something happens, has happened or will happen
+ | (as opposed to imperative or conditional).
-p
- | The language will now be included in the #[code tokenizer] test fixture,
- | which is used by the basic tokenizer tests. If you want to add your own
- | tests that should be run over all languages, you can use this fixture as
- | an argument of your test function.
-+h(3, "testing-custom") Writing language-specific tests
-
-p
- | It's recommended to always add at least some tests with examples specific
- | to the language. Language tests should be located in
- | #[+src(gh("spaCy", "spacy/tests/lang")) tests/lang] in a directory named
- | after the language ID. You'll also need to create a fixture for your
- | tokenizer in the #[+src(gh("spaCy", "spacy/tests/conftest.py")) conftest.py].
- | Always use the #[code get_lang_class()] helper function within the fixture,
- | instead of importing the class at the top of the file. This will load the
- | language data only when it's needed. (Otherwise, #[em all data] would be
- | loaded every time you run a test.)
-
-+code.
- @pytest.fixture
- def en_tokenizer():
- return util.get_lang_class('en').Defaults.create_tokenizer()
-
-p
- | When adding test cases, always
- | #[+a(gh("spaCy", "spacy/tests#parameters")) #[code parametrize]] them β
- | this will make it easier for others to add more test cases without having
- | to modify the test itself. You can also add parameter tuples, for example,
- | a test sentence and its expected length, or a list of expected tokens.
- | Here's an example of an English tokenizer test for combinations of
- | punctuation and abbreviations:
-
-+code("Example test").
- @pytest.mark.parametrize('text,length', [
- ("The U.S. Army likes Shock and Awe.", 8),
- ("U.N. regulations are not a part of their concern.", 10),
- ("βIsn't it?β", 6)])
- def test_en_tokenizer_handles_punct_abbrev(en_tokenizer, text, length):
- tokens = en_tokenizer(text)
- assert len(tokens) == length
-
-+h(2, "vocabulary") Building the vocabulary
-
-+under-construction
-
-p
- | spaCy expects that common words will be cached in a
- | #[+api("vocab") #[code Vocab]] instance. The vocabulary caches lexical
- | features, and makes it easy to use information from unlabelled text
- | samples in your models. Specifically, you'll usually want to collect
- | word frequencies, and train two types of distributional similarity model:
- | Brown clusters, and word vectors. The Brown clusters are used as features
- | by linear models, while the word vectors are useful for lexical
- | similarity models and deep learning.
-
-+h(3, "word-frequencies") Word frequencies
-
-p
- | To generate the word frequencies from a large, raw corpus, you can use the
- | #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py]
- | script from the spaCy developer resources. Note that your corpus should
- | not be preprocessed (i.e. you need punctuation for example). The
- | #[+api("cli#model") #[code model]] command expects a tab-separated word
- | frequencies file with three columns:
-
-+list("numbers")
- +item The number of times the word occurred in your language sample.
- +item The number of distinct documents the word occurred in.
- +item The word itself.
-
-p
- | An example word frequencies file could look like this:
-
-+code("es_word_freqs.txt", "text").
- 6361109 111 Aunque
- 23598543 111 aunque
- 10097056 111 claro
- 193454 111 aro
- 7711123 111 viene
- 12812323 111 mal
- 23414636 111 momento
- 2014580 111 felicidad
- 233865 111 repleto
- 15527 111 eto
- 235565 111 deliciosos
- 17259079 111 buena
- 71155 111 AnΓmate
- 37705 111 anΓmate
- 33155 111 cuΓ©ntanos
- 2389171 111 cuΓ‘l
- 961576 111 tΓpico
-
-p
- | You should make sure you use the spaCy tokenizer for your
- | language to segment the text for your word frequencies. This will ensure
- | that the frequencies refer to the same segmentation standards you'll be
- | using at run-time. For instance, spaCy's English tokenizer segments
- | "can't" into two tokens. If we segmented the text by whitespace to
- | produce the frequency counts, we'll have incorrect frequency counts for
- | the tokens "ca" and "n't".
-
-+h(3, "brown-clusters") Training the Brown clusters
-
-p
- | spaCy's tagger, parser and entity recognizer are designed to use
- | distributional similarity features provided by the
- | #[+a("https://github.com/percyliang/brown-cluster") Brown clustering algorithm].
- | You should train a model with between 500 and 1000 clusters. A minimum
- | frequency threshold of 10 usually works well.
-
-p
- | An example clusters file could look like this:
-
-+code("es_clusters.data", "text").
- 0000 Vestigial 1
- 0000 Vesturland 1
- 0000 Veyreau 1
- 0000 Veynes 1
- 0000 VexilografΓa 1
- 0000 Vetrigne 1
- 0000 VetΓ³nica 1
- 0000 Asunden 1
- 0000 VillalambrΓΊs 1
- 0000 VichuquΓ©n 1
- 0000 Vichtis 1
- 0000 Vichigasta 1
- 0000 VAAH 1
- 0000 Viciebsk 1
- 0000 Vicovaro 1
- 0000 Villardeveyo 1
- 0000 Vidala 1
- 0000 Videoguard 1
- 0000 VedΓ‘s 1
- 0000 Videocomunicado 1
- 0000 VideoCrypt 1
-
-+h(3, "word-vectors") Training the word vectors
-
-+under-construction
-
-p
- | #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related
- | algorithms let you train useful word similarity models from unlabelled
- | text. This is a key part of using
- | #[+a("/docs/usage/deep-learning") deep learning] for NLP with limited
- | labelled data. The vectors are also useful by themselves β they power
- | the #[code .similarity()] methods in spaCy. For best results, you should
- | pre-process the text with spaCy before training the Word2vec model. This
- | ensures your tokenization will match.
-
-p
- | You can use our
- | #[+src(gh("spacy-dev-resources", "training/word_vectors.py")) word vectors training script],
- | which pre-processes the text with your language-specific tokenizer and
- | trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
- | The #[code vectors.bin] file should consist of one word and vector per line.
-
-//-+aside-code("your_data_directory", "yaml").
- βββ vocab/
- | βββ lexemes.bin
- | βββ strings.json
- | βββ oov_prob
- βββ pos/
- | βββ model
- | βββ config.json
- βββ deps/
- | βββ model
- | βββ config.json
- βββ ner/
- βββ model
- βββ config.json
-
-+h(2, "train-tagger-parser") Training the tagger and parser
-
-+under-construction
-
-p
- | You can now train the model using a corpus for your language annotated
- | with #[+a("http://universaldependencies.org/") Universal Dependencies].
- | If your corpus uses the
- | #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format,
- | i.e. files with the extension #[code .conllu], you can use the
- | #[+api("cli#convert") #[code convert]] command to convert it to spaCy's
- | #[+a("/docs/api/annotation#json-input") JSON format] for training.
-
-p
- | Once you have your UD corpus transformed into JSON, you can train your
- | model use the using spaCy's #[+api("cli#train") #[code train]] command:
-
-+code(false, "bash").
- spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
++infobox("Important note", "⚠️")
+ | The morphological attributes are currently #[strong not all used by spaCy].
+ | Full integration is still being developed. In the meantime, it can still
+ | be useful to add them, especially if the language you're adding includes
+ | important distinctions and special cases. This ensures that as soon as
+ | full support is introduced, your language will be able to assign all
+ | possible attributes.
diff --git a/website/usage/_adding-languages/_testing.jade b/website/usage/_adding-languages/_testing.jade
new file mode 100644
index 000000000..825d8db6f
--- /dev/null
+++ b/website/usage/_adding-languages/_testing.jade
@@ -0,0 +1,76 @@
+//- 💫 DOCS > USAGE > ADDING LANGUAGES > TESTING
+
+p
+ | Before using the new language or submitting a
+ | #[+a(gh("spaCy") + "/pulls") pull request] to spaCy, you should make sure
+ | it works as expected. This is especially important if you've added custom
+    | regular expressions for token matching or punctuation – you don't want to
+ | be causing regressions.
+
++infobox("spaCy's test suite")
+ | spaCy uses the #[+a("https://docs.pytest.org/en/latest/") pytest framework]
+ | for testing. For more details on how the tests are structured and best
+ | practices for writing your own tests, see our
+ | #[+a(gh("spaCy", "spacy/tests")) tests documentation].
+
+p
+ | The easiest way to test your new tokenizer is to run the
+ | language-independent "tokenizer sanity" tests located in
+ | #[+src(gh("spaCy", "spacy/tests/tokenizer")) #[code tests/tokenizer]].
+ | This will test for basic behaviours like punctuation splitting, URL
+ | matching and correct handling of whitespace. In the
+ | #[+src(gh("spaCy", "spacy/tests/conftest.py")) #[code conftest.py]], add
+ | the new language ID to the list of #[code _languages]:
+
++code.
+ _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
+ 'nl', 'pl', 'pt', 'sv', 'xx'] # new language here
+
++aside-code("Global tokenizer test example").
+ # use fixture by adding it as an argument
+ def test_with_all_languages(tokenizer):
+ # will be performed on ALL language tokenizers
+ tokens = tokenizer(u'Some text here.')
+
+p
+ | The language will now be included in the #[code tokenizer] test fixture,
+ | which is used by the basic tokenizer tests. If you want to add your own
+ | tests that should be run over all languages, you can use this fixture as
+ | an argument of your test function.
+
++h(3, "testing-custom") Writing language-specific tests
+
+p
+ | It's recommended to always add at least some tests with examples specific
+ | to the language. Language tests should be located in
+ | #[+src(gh("spaCy", "spacy/tests/lang")) #[code tests/lang]] in a
+ | directory named after the language ID. You'll also need to create a
+ | fixture for your tokenizer in the
+ | #[+src(gh("spaCy", "spacy/tests/conftest.py")) #[code conftest.py]].
+ | Always use the #[+api("util#get_lang_class") #[code get_lang_class()]]
+ | helper function within the fixture, instead of importing the class at the
+ | top of the file. This will load the language data only when it's needed.
+ | (Otherwise, #[em all data] would be loaded every time you run a test.)
+
++code.
+ @pytest.fixture
+ def en_tokenizer():
+ return util.get_lang_class('en').Defaults.create_tokenizer()
+
+p
+ | When adding test cases, always
+    | #[+a(gh("spaCy", "spacy/tests#parameters")) #[code parametrize]] them –
+ | this will make it easier for others to add more test cases without having
+ | to modify the test itself. You can also add parameter tuples, for example,
+ | a test sentence and its expected length, or a list of expected tokens.
+ | Here's an example of an English tokenizer test for combinations of
+ | punctuation and abbreviations:
+
++code("Example test").
+ @pytest.mark.parametrize('text,length', [
+ ("The U.S. Army likes Shock and Awe.", 8),
+ ("U.N. regulations are not a part of their concern.", 10),
+        ("“Isn't it?”", 6)])
+ def test_en_tokenizer_handles_punct_abbrev(en_tokenizer, text, length):
+ tokens = en_tokenizer(text)
+ assert len(tokens) == length
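+
+p
+    | The same approach works for a list of expected tokens. For example, a
+    | test along the following lines checks the exact segmentation. The text
+    | and token list here are made up for illustration:
+
++code("Example test").
+    @pytest.mark.parametrize('text,expected_tokens', [
+        ("Lorem ipsum.", ["Lorem", "ipsum", "."])])
+    def test_en_tokenizer_handles_example_tokens(en_tokenizer, text, expected_tokens):
+        tokens = en_tokenizer(text)
+        assert [token.text for token in tokens] == expected_tokens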
diff --git a/website/usage/_adding-languages/_training.jade b/website/usage/_adding-languages/_training.jade
new file mode 100644
index 000000000..054f2a460
--- /dev/null
+++ b/website/usage/_adding-languages/_training.jade
@@ -0,0 +1,93 @@
+//- 💫 DOCS > USAGE > ADDING LANGUAGES > TRAINING
+
+p
+ | spaCy expects that common words will be cached in a
+ | #[+api("vocab") #[code Vocab]] instance. The vocabulary caches lexical
+ | features, and makes it easy to use information from unlabelled text
+ | samples in your models. Specifically, you'll usually want to collect
+ | word frequencies, and train word vectors. To generate the word frequencies
+ | from a large, raw corpus, you can use the
+ | #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) #[code word_freqs.py]]
+ | script from the spaCy developer resources.
+
++github("spacy-dev-resources", "training/word_freqs.py")
+
+p
+    | Note that your corpus should not be preprocessed, i.e. it should still
+    | contain punctuation. The word frequencies should be generated as a
+ | tab-separated file with three columns:
+
++list("numbers")
+ +item The number of times the word occurred in your language sample.
+ +item The number of distinct documents the word occurred in.
+ +item The word itself.
+
++code("es_word_freqs.txt", "text").
+ 6361109 111 Aunque
+ 23598543 111 aunque
+ 10097056 111 claro
+ 193454 111 aro
+ 7711123 111 viene
+ 12812323 111 mal
+ 23414636 111 momento
+ 2014580 111 felicidad
+ 233865 111 repleto
+ 15527 111 eto
+ 235565 111 deliciosos
+ 17259079 111 buena
+    71155 111 Anímate
+    37705 111 anímate
+    33155 111 cuéntanos
+    2389171 111 cuál
+    961576 111 típico
+
++aside("Brown Clusters")
+ | Additionally, you can use distributional similarity features provided by the
+ | #[+a("https://github.com/percyliang/brown-cluster") Brown clustering algorithm].
+ | You should train a model with between 500 and 1000 clusters. A minimum
+ | frequency threshold of 10 usually works well.
+
+p
+ | You should make sure you use the spaCy tokenizer for your
+ | language to segment the text for your word frequencies. This will ensure
+ | that the frequencies refer to the same segmentation standards you'll be
+ | using at run-time. For instance, spaCy's English tokenizer segments
+ | "can't" into two tokens. If we segmented the text by whitespace to
+    | produce the frequency counts, we'd end up with incorrect counts for
+ | the tokens "ca" and "n't".
+
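+p
+    | As a minimal sketch of this step, here's how both counts could be
+    | collected with your language's tokenizer. The corpus path, its layout
+    | (one raw document per line) and the language code are placeholder
+    | assumptions, not fixed conventions:
+
++code("Counting frequencies (sketch)").
+    from collections import Counter
+    import spacy
+
+    nlp = spacy.blank('es')    # use the code of the language you're adding
+    word_counts = Counter()
+    doc_counts = Counter()
+
+    with open('corpus.txt', encoding='utf8') as file_:
+        for doc in nlp.tokenizer.pipe(file_):
+            words = [token.text for token in doc]
+            word_counts.update(words)
+            doc_counts.update(set(words))
+
+    with open('word_freqs.txt', 'w', encoding='utf8') as out_file:
+        for word, freq in word_counts.items():
+            out_file.write('{}\t{}\t{}\n'.format(freq, doc_counts[word], word))
+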
++h(4, "word-vectors") Training the word vectors
+
+p
+ | #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related
+ | algorithms let you train useful word similarity models from unlabelled
+ | text. This is a key part of using
+ | #[+a("/usage/deep-learning") deep learning] for NLP with limited
+    | labelled data. The vectors are also useful by themselves – they power
+ | the #[code .similarity()] methods in spaCy. For best results, you should
+ | pre-process the text with spaCy before training the Word2vec model. This
+ | ensures your tokenization will match. You can use our
+ | #[+src(gh("spacy-dev-resources", "training/word_vectors.py")) word vectors training script],
+ | which pre-processes the text with your language-specific tokenizer and
+ | trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
+ | The #[code vectors.bin] file should consist of one word and vector per line.
+
++github("spacy-dev-resources", "training/word_vectors.py")
+
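+p
+    | Condensed to its essentials, that workflow looks roughly like the
+    | following sketch. The corpus path is a placeholder, and Gensim's
+    | parameter names have changed between versions, so treat it as
+    | illustrative rather than canonical:
+
++code("Training vectors (sketch)").
+    import spacy
+    from gensim.models import Word2Vec
+
+    nlp = spacy.blank('es')    # use the code of the language you're adding
+
+    with open('corpus.txt', encoding='utf8') as file_:
+        sentences = [[token.text for token in doc]
+                     for doc in nlp.tokenizer.pipe(file_)]
+
+    model = Word2Vec(sentences, size=300, window=5, min_count=10, workers=4)
+    model.wv.save_word2vec_format('vectors.bin')
+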
++h(3, "train-tagger-parser") Training the tagger and parser
+
+p
+ | You can now train the model using a corpus for your language annotated
+ | with #[+a("http://universaldependencies.org/") Universal Dependencies].
+ | If your corpus uses the
+ | #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format,
+ | i.e. files with the extension #[code .conllu], you can use the
+ | #[+api("cli#convert") #[code convert]] command to convert it to spaCy's
+ | #[+a("/api/annotation#json-input") JSON format] for training.
+ | Once you have your UD corpus transformed into JSON, you can train your
+    | model using spaCy's #[+api("cli#train") #[code train]] command.
+
++infobox
+ | For more details and examples of how to
+ | #[strong train the tagger and dependency parser], see the
+ | #[+a("/usage/training#tagger-parser") usage guide on training].
diff --git a/website/usage/_data.json b/website/usage/_data.json
new file mode 100644
index 000000000..b34304ed6
--- /dev/null
+++ b/website/usage/_data.json
@@ -0,0 +1,203 @@
+{
+ "sidebar": {
+ "Get started": {
+ "Installation": "./",
+ "Models & Languages": "models",
+ "Facts & Figures": "facts-figures",
+ "spaCy 101": "spacy-101",
+ "New in v2.0": "v2"
+ },
+ "Guides": {
+ "Linguistic Features": "linguistic-features",
+ "Processing Pipelines": "processing-pipelines",
+ "Vectors & Similarity": "vectors-similarity",
+ "Text Classification": "text-classification",
+ "Deep Learning": "deep-learning",
+ "Training Models": "training",
+ "Adding Languages": "adding-languages",
+ "Visualizers": "visualizers"
+ },
+ "In-depth": {
+ "Code Examples": "examples",
+ "Resources": "resources"
+ }
+ },
+
+ "index": {
+ "title": "Install spaCy",
+ "next": "models",
+ "quickstart": true,
+ "changelog": true,
+ "menu": {
+ "Quickstart": "quickstart",
+ "Instructions": "instructions",
+ "Troubleshooting": "troubleshooting",
+ "Changelog": "changelog"
+ }
+ },
+
+ "models": {
+ "title": "Models & Languages",
+ "next": "facts-figures",
+ "quickstart": true,
+ "menu": {
+ "Quickstart": "quickstart",
+ "Available Models": "available",
+ "Installation & Usage": "install",
+ "Language Support": "languages",
+ "Production Use": "production"
+ }
+ },
+
+ "facts-figures": {
+ "title": "Facts & Figures",
+ "teaser": "The hard numbers for spaCy and how it compares to other libraries and tools.",
+ "next": "spacy-101",
+ "menu": {
+ "Feature comparison": "comparison",
+ "Benchmarks": "benchmarks",
+ "Powered by spaCy": "powered-by",
+ "Other Libraries": "other-libraries"
+ }
+ },
+
+ "spacy-101": {
+ "title": "spaCy 101: Everything you need to know",
+ "teaser": "The most important concepts, explained in simple terms.",
+ "next": "index",
+ "quickstart": true,
+ "preview": "101",
+ "menu": {
+ "Features": "features",
+ "Lightning tour": "lightning-tour",
+ "Architecture": "architecture",
+ "Community & FAQ": "community-faq"
+ }
+ },
+
+ "v2": {
+ "title": "What's New in v2.0",
+ "teaser": "New features, backwards incompatibilities and migration guide.",
+ "menu": {
+ "New features": "features",
+ "Backwards Incompatibilities": "incompat",
+ "Migrating from v1.x": "migrating",
+ "Benchmarks": "benchmarks"
+ }
+ },
+
+ "linguistic-features": {
+ "title": "Linguistic Features",
+ "teaser": "Using spaCy to extract linguistic features like part-of-speech tags, dependency labels and named entities, customising the tokenizer and working with the rule-based matcher.",
+ "next": "processing-pipelines",
+ "menu": {
+ "POS Tagging": "pos-tagging",
+ "Dependency Parse": "dependency-parse",
+ "Named Entities": "named-entities",
+ "Tokenization": "tokenization",
+ "Rule-based Matching": "rule-based-matching"
+ }
+ },
+
+ "processing-pipelines": {
+ "title": "Language Processing Pipelines",
+ "next": "vectors-similarity",
+ "menu": {
+ "How pipelines work": "pipelines",
+ "Examples": "examples",
+ "Multi-threading": "multithreading",
+ "User Hooks": "user-hooks",
+ "Serialization": "serialization"
+ }
+ },
+
+ "vectors-similarity": {
+ "title": "Word Vectors and Semantic Similarity",
+ "next": "text-classification",
+ "menu": {
+ "Basics": "basics",
+ "Similarity in Context": "in-context",
+ "Custom Vectors": "custom",
+ "GPU Usage": "gpu"
+ }
+ },
+
+ "deep-learning": {
+ "title": "Deep Learning",
+ "teaser": "Using spaCy to pre-process text for deep learning, and how to plug in your own machine learning models.",
+ "next": "training",
+ "menu": {
+ "Pre-processing Text": "pre-processing",
+ "spaCy and Thinc": "thinc",
+ "TensorFlow / Keras": "tensorflow-keras",
+ "scikit-learn": "scikit-learn",
+ "PyTorch": "pytorch",
+ "DyNet": "dynet"
+ }
+ },
+
+ "text-classification": {
+ "title": "Text Classification",
+ "next": "training"
+ },
+
+ "training": {
+ "title": "Training spaCy's Statistical Models",
+ "next": "adding-languages",
+ "menu": {
+ "Basics": "basics",
+ "NER": "ner",
+ "Tagger & Parser": "tagger-parser",
+ "Similarity": "similarity",
+ "Text Classification": "textcat",
+ "Saving & Loading": "saving-loading"
+ }
+ },
+
+ "adding-languages": {
+ "title": "Adding Languages",
+ "teaser": "Adding full support for a language touches many different parts of the spaCy library. This guide explains how to fit everything together, and points you to the specific workflows for each component.",
+ "next": "training",
+ "menu": {
+ "Language data": "language-data",
+ "Testing": "testing",
+ "Training": "training"
+ }
+ },
+
+ "visualizers": {
+ "title": "Visualizers",
+ "tag_new": 2,
+ "teaser": "Visualize dependencies and entities in your browser and notebook, or export HTML.",
+ "next": "resources",
+ "menu": {
+ "Dependencies": "dep",
+ "Entities": "ent",
+ "Jupyter Notebooks": "jupyter",
+ "Rendering HTML": "html"
+ }
+ },
+
+ "resources": {
+ "title": "Resources",
+ "teaser": "Libraries, demos, books, courses and research systems featuring spaCy.",
+ "menu": {
+ "Third-party libraries": "libraries",
+ "Demos & Visualizations": "demos",
+ "Books & Courses": "books",
+ "Jupyter Notebooks": "notebooks",
+ "Research": "research"
+ }
+ },
+
+ "examples": {
+ "title": "Code Examples",
+ "teaser": "Full code examples you can modify and run.",
+ "next": "resources",
+ "menu": {
+ "Matching": "matching",
+ "Training": "training",
+ "Deep Learning": "deep-learning"
+ }
+ }
+}
diff --git a/website/usage/_deep-learning/_dynet.jade b/website/usage/_deep-learning/_dynet.jade
new file mode 100644
index 000000000..81aa4e066
--- /dev/null
+++ b/website/usage/_deep-learning/_dynet.jade
@@ -0,0 +1,11 @@
+//- 💫 DOCS > USAGE > DEEP LEARNING > DYNET
+
++infobox
+ +infobox-logos(["dynet", 80, 34, "http://dynet.readthedocs.io/"])
+ | #[strong DyNet] is a dynamic neural network library, which can be much
+ | easier to work with for NLP. Outside of Google, there's a general shift
+    | among NLP researchers to both DyNet and PyTorch. You can use DyNet to
+ | create spaCy pipeline components, to add annotations to the #[code Doc]
+ | object.
+
++under-construction
diff --git a/website/usage/_deep-learning/_pre-processing.jade b/website/usage/_deep-learning/_pre-processing.jade
new file mode 100644
index 000000000..ca87cee7b
--- /dev/null
+++ b/website/usage/_deep-learning/_pre-processing.jade
@@ -0,0 +1,3 @@
+//- 💫 DOCS > USAGE > DEEP LEARNING > PRE-PROCESSING
+
++under-construction
diff --git a/website/usage/_deep-learning/_pytorch.jade b/website/usage/_deep-learning/_pytorch.jade
new file mode 100644
index 000000000..cf0f692f9
--- /dev/null
+++ b/website/usage/_deep-learning/_pytorch.jade
@@ -0,0 +1,91 @@
+//- 💫 DOCS > USAGE > DEEP LEARNING > PYTORCH
+
++infobox
+ +infobox-logos(["pytorch", 100, 48, "http://pytorch.org"])
+ | #[strong PyTorch] is a dynamic neural network library, which can be much
+ | easier to work with for NLP. Outside of Google, there's a general shift
+    | among NLP researchers to both PyTorch and DyNet. spaCy is the front-end
+ | of choice for PyTorch's #[code torch.text] extension. You can use PyTorch
+ | to create spaCy pipeline components, to add annotations to the
+ | #[code Doc] object.
+
++under-construction
+
+p
+ | Here's how a #[code begin_update] function that wraps an arbitrary
+ | PyTorch model would look:
+
++code.
+ class PytorchWrapper(thinc.neural.Model):
+ def __init__(self, pytorch_model):
+ self.pytorch_model = pytorch_model
+
+ def begin_update(self, x_data, drop=0.):
+            x_var = Variable(torch.Tensor(x_data), requires_grad=True)
+            # Make prediction
+            y_var = self.pytorch_model(x_var)
+            def backward(dy_data, sgd=None):
+                dy_var = Variable(torch.Tensor(dy_data))
+                # Backpropagate through the PyTorch graph and return the
+                # gradient with respect to the input
+                y_var.backward(dy_var)
+                return x_var.grad.data
+            return y_var.data, backward
+
+p
+ | PyTorch requires data to be wrapped in a container, #[code Variable],
+ | that tracks the operations performed on the data. This "tape" of
+ | operations is then used by #[code torch.autograd.backward] to compute the
+ | gradient with respect to the input. For example, the following code
+ | constructs a PyTorch Linear layer that takes a vector of shape
+    | #[code (length, 2)], multiplies it by a #[code (2, 2)] matrix of weights,
+ | adds a #[code (2,)] bias, and returns the resulting #[code (length, 2)]
+ | vector:
+
++code("PyTorch Linear").
+ from torch import autograd
+ from torch import nn
+ import torch
+ import numpy
+
+ pt_model = nn.Linear(2, 2)
+ length = 5
+
+ input_data = numpy.ones((5, 2), dtype='f')
+ input_var = autograd.Variable(torch.Tensor(input_data))
+
+ output_var = pt_model(input_var)
+ output_data = output_var.data.numpy()
+
+p
+ | Given target values we would like the output data to approximate, we can
+ | then "learn" values of the parameters within #[code pt_model], to give us
+ | output that's closer to our target. As a trivial example, let's make the
+ | linear layer compute the negative inverse of the input:
+
++code.
+ def get_target(input_data):
+ return -(1 / input_data)
+
+p
+ | To update the PyTorch model, we create an optimizer and give it
+ | references to the model's parameters. We'll then randomly generate input
+ | data and get the target result we'd like the function to produce. We then
+ | compute the #[strong gradient of the error] between the current output
+ | and the target. Using the most popular definition of "error", this is
+ | simply the average difference:
+
++code.
+ from torch import optim
+
+ optimizer = optim.SGD(pt_model.parameters(), lr = 0.01)
+ for i in range(10):
+ input_data = numpy.random.uniform(-1., 1., (length, 2))
+        target = get_target(input_data)
+
+ output_var = pt_model(autograd.Variable(torch.Tensor(input_data)))
+ output_data = output_var.data.numpy()
+
+ d_output_data = (output_data - target) / length
+ d_output_var = autograd.Variable(torch.Tensor(d_output_data))
+
+        optimizer.zero_grad()
+        output_var.backward(d_output_var)
+        optimizer.step()
diff --git a/website/usage/_deep-learning/_scikit-learn.jade b/website/usage/_deep-learning/_scikit-learn.jade
new file mode 100644
index 000000000..3d0f30397
--- /dev/null
+++ b/website/usage/_deep-learning/_scikit-learn.jade
@@ -0,0 +1,15 @@
+//- 💫 DOCS > USAGE > DEEP LEARNING > SCIKIT-LEARN
+
++infobox
+ +infobox-logos(["scikitlearn", 70, 34, "http://scikit-learn.org"])
+ | #[strong scikit-learn] features a number of useful NLP functions,
+ | especially for solving text classification problems using linear models
+ | with bag-of-words features. If you know you need exactly that, it might
+ | be better to use scikit-learn's built-in pipeline directly. However, if
+ | you want to extract more detailed features, using part-of-speech tags,
+ | named entity labels, or string transformations, you can use spaCy as a
+ | pre-process in your classification system. scikit-learn also provides a
+ | lot of experiment management and evaluation utilities that people use
+ | alongside spaCy.
+
++under-construction
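+
+p
+    | In the meantime, here's a minimal sketch of that idea: spaCy handles
+    | tokenization and lemmatization, scikit-learn handles the bag-of-words
+    | features and the classifier. The toy texts and labels are made up for
+    | illustration:
+
++code("spaCy tokenizer in a scikit-learn pipeline (sketch)").
+    import spacy
+    from sklearn.feature_extraction.text import CountVectorizer
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.pipeline import Pipeline
+
+    nlp = spacy.load('en')
+
+    def spacy_tokenizer(text):
+        # lemmatize and drop stop words and punctuation
+        return [t.lemma_ for t in nlp(text) if not t.is_stop and not t.is_punct]
+
+    pipe = Pipeline([
+        ('vectorizer', CountVectorizer(tokenizer=spacy_tokenizer)),
+        ('classifier', LogisticRegression())
+    ])
+
+    train_texts = ['I loved this movie', 'I hated this movie']
+    train_labels = [1, 0]
+    pipe.fit(train_texts, train_labels)
+    print(pipe.predict(['I really loved it']))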
diff --git a/website/usage/_deep-learning/_tensorflow-keras.jade b/website/usage/_deep-learning/_tensorflow-keras.jade
new file mode 100644
index 000000000..3efb2e2a6
--- /dev/null
+++ b/website/usage/_deep-learning/_tensorflow-keras.jade
@@ -0,0 +1,11 @@
+//- 💫 DOCS > USAGE > DEEP LEARNING > TENSORFLOW / KERAS
+
++infobox
+ +infobox-logos(["tensorflow", 35, 42, "https://www.tensorflow.org"], ["keras", 45, 45, "https://www.keras.io"])
+    | #[strong TensorFlow / Keras] is the most popular deep learning library.
+ | spaCy provides efficient and powerful feature extraction functionality,
+ | that can be used as a pre-process to any deep learning library. You can
+    | also use TensorFlow and Keras to create spaCy pipeline components, to add
+ | annotations to the #[code Doc] object.
+
++under-construction
diff --git a/website/usage/_deep-learning/_thinc.jade b/website/usage/_deep-learning/_thinc.jade
new file mode 100644
index 000000000..6c354f708
--- /dev/null
+++ b/website/usage/_deep-learning/_thinc.jade
@@ -0,0 +1,66 @@
+//- 💫 DOCS > USAGE > DEEP LEARNING > THINC
+
+p
+ | #[+a(gh("thinc")) Thinc] is the machine learning library powering spaCy.
+ | It's a practical toolkit for implementing models that follow the
+ | #[+a("https://explosion.ai/blog/deep-learning-formula-nlp", true) "Embed, encode, attend, predict"]
+ | architecture. It's designed to be easy to install, efficient for CPU
+    | usage and optimised for NLP and deep learning with text – in particular,
+ | hierarchically structured input and variable-length sequences.
+
+p
+ | spaCy's built-in pipeline components can all be powered by any object
+ | that follows Thinc's #[code Model] API. If a wrapper is not yet available
+ | for the library you're using, you should create a
+ | #[code thinc.neural.Model] subclass that implements a #[code begin_update]
+ | method. You'll also want to implement #[code to_bytes], #[code from_bytes],
+ | #[code to_disk] and #[code from_disk] methods, to save and load your
+    | model. Here's the template you'll need to fill in:
+
++code("Thinc Model API").
+    class ThincModel(thinc.neural.Model):
+        def __init__(self, *args, **kwargs):
+            pass
+
+        def begin_update(self, X, drop=0.):
+            def backprop(dY, sgd=None):
+                return dX
+            return Y, backprop
+
+        def to_disk(self, path, **exclude):
+            return None
+
+        def from_disk(self, path, **exclude):
+            return self
+
+        def to_bytes(self, **exclude):
+            return bytes
+
+        def from_bytes(self, msgpacked_bytes, **exclude):
+            return self
+
+p
+    | The #[code begin_update] method should return a callback that takes the
+ | gradient with respect to the output, and returns the gradient with
+ | respect to the input. It's usually convenient to implement the callback
+ | as a nested function, so you can refer to any intermediate variables from
+ | the forward computation in the enclosing scope.
+
++h(3, "how-thinc-works") How Thinc works
+
+p
+ | Neural networks are all about composing small functions that we know how
+ | to differentiate into larger functions that we know how to differentiate.
+ | To differentiate a function efficiently, you usually need to store
+ | intermediate results, computed during the "forward pass", to reuse them
+ | during the backward pass. Most libraries require the data passed through
+    | the network to accumulate these intermediate results. This is the "tape"
+ | in tape-based differentiation.
+
+p
+ | In Thinc, a model that computes #[code y = f(x)] is required to also
+ | return a callback that computes #[code dx = f'(dy)]. The same
+ | intermediate state needs to be tracked, but this becomes an
+    | implementation detail for the model to take care of – usually, the
+ | callback is implemented as a closure, so the intermediate results can be
+ | read from the enclosing scope.
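+
+p
+    | As a toy illustration of this pattern (plain NumPy, not Thinc's actual
+    | internals), here's a linear layer whose #[code begin_update] returns
+    | the output together with a callback that reads the forward state from
+    | the enclosing scope:
+
++code("Callback-based backprop (sketch)").
+    import numpy
+
+    def make_linear(W, b):
+        def begin_update(X, drop=0.):
+            Y = X.dot(W.T) + b
+            def backprop(dY, sgd=None):
+                # W is read from the enclosing scope, so no explicit
+                # "tape" is needed to get the gradient of the input
+                return dY.dot(W)
+            return Y, backprop
+        return begin_update
+
+    begin_update = make_linear(numpy.zeros((2, 4)), numpy.zeros((2,)))
+    Y, backprop = begin_update(numpy.ones((3, 4)))
+    dX = backprop(numpy.ones((3, 2)))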
diff --git a/website/usage/_facts-figures/_benchmarks-choi-2015.jade b/website/usage/_facts-figures/_benchmarks-choi-2015.jade
new file mode 100644
index 000000000..5c3386ce6
--- /dev/null
+++ b/website/usage/_facts-figures/_benchmarks-choi-2015.jade
@@ -0,0 +1,45 @@
+//- 💫 DOCS > USAGE > FACTS & FIGURES > BENCHMARKS > CHOI ET AL. (2015)
+
++table(["System", "Year", "Language", "Accuracy", "Speed (wps)"])
+ +row
+ +cell #[strong spaCy v2.x]
+ +cell 2017
+ +cell Python / Cython
+ +cell.u-text-right #[strong 92.6]
+ +cell.u-text-right #[em n/a]
+ | #[+help("This table shows speed as benchmarked by Choi et al. We therefore can't provide comparable figures, as we'd be running the benchmark on different hardware.").u-color-dark]
+
+ +row
+ +cell #[strong spaCy v1.x]
+ +cell 2015
+ +cell Python / Cython
+ +cell.u-text-right 91.8
+ +cell.u-text-right 13,963
+
+ +row
+ +cell ClearNLP
+ +cell 2015
+ +cell Java
+ +cell.u-text-right 91.7
+ +cell.u-text-right 10,271
+
+ +row
+ +cell CoreNLP
+ +cell 2015
+ +cell Java
+ +cell.u-text-right 89.6
+ +cell.u-text-right 8,602
+
+ +row
+ +cell MATE
+ +cell 2015
+ +cell Java
+ +cell.u-text-right 92.5
+ +cell.u-text-right 550
+
+ +row
+ +cell Turbo
+ +cell 2015
+ +cell C++
+ +cell.u-text-right 92.4
+ +cell.u-text-right 349
diff --git a/website/usage/_facts-figures/_benchmarks-models.jade b/website/usage/_facts-figures/_benchmarks-models.jade
new file mode 100644
index 000000000..208e7da48
--- /dev/null
+++ b/website/usage/_facts-figures/_benchmarks-models.jade
@@ -0,0 +1,48 @@
+//- 💫 DOCS > USAGE > FACTS & FIGURES > BENCHMARKS > MODEL COMPARISON
+
+p
+ | In this section, we provide benchmark accuracies for the pre-trained
+ | model pipelines we distribute with spaCy. Evaluations are conducted
+ | end-to-end from raw text, with no "gold standard" pre-processing, over
+ | text from a mix of genres where possible.
+
++under-construction
+
++aside("Methodology")
+ | The evaluation was conducted on raw text with no gold standard
+ | information. The parser, tagger and entity recognizer were trained on the
+ | #[+a("https://www.gabormelli.com/RKB/OntoNotes_Corpus") OntoNotes 5]
+ | corpus, the word vectors on #[+a("http://commoncrawl.org") Common Crawl].
+
++table(["Model", "spaCy", "Type", "UAS", "NER F", "POS", "WPS", "Size"])
+ +row
+ +cell #[+a("/models/en#en_core_web_sm") #[code en_core_web_sm]] 2.0.0a5
+ each data in ["2.x", "neural"]
+ +cell.u-text-right=data
+ +cell.u-text-right 91.4
+ +cell.u-text-right 85.5
+ +cell.u-text-right 97.0
+ +cell.u-text-right 8.2k
+ +cell.u-text-right #[strong 36 MB]
+
+ +row
+ +cell #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] 2.0.0a0
+ each data in ["2.x", "neural"]
+ +cell.u-text-right=data
+ +cell.u-text-right #[strong 91.9]
+ +cell.u-text-right #[strong 86.4]
+ +cell.u-text-right #[strong 97.2]
+ +cell.u-text-right #[em n/a]
+ +cell.u-text-right 667 MB
+
+ +row("divider")
+ +cell #[code en_core_web_sm] 1.2.0
+ each data in ["1.x", "linear", 86.6, 78.5, 96.6]
+ +cell.u-text-right=data
+ +cell.u-text-right #[strong 25.7k]
+ +cell.u-text-right 50 MB
+
+ +row
+ +cell #[code en_core_web_md] 1.2.1
+ each data in ["1.x", "linear", 90.6, 81.4, 96.7, "18.8k", "1 GB"]
+ +cell.u-text-right=data
diff --git a/website/usage/_facts-figures/_benchmarks.jade b/website/usage/_facts-figures/_benchmarks.jade
new file mode 100644
index 000000000..f69eb5406
--- /dev/null
+++ b/website/usage/_facts-figures/_benchmarks.jade
@@ -0,0 +1,206 @@
+//- π« DOCS > USAGE > FACTS & FIGURES > BENCHMARKS
+
+p
+ | Two peer-reviewed papers in 2015 confirm that spaCy offers the
+ | #[strong fastest syntactic parser in the world] and that
+ | #[strong its accuracy is within 1% of the best] available. The few
+ | systems that are more accurate are 20× slower or more.
+
++aside("About the evaluation")
+ | The first of the evaluations was published by #[strong Yahoo! Labs] and
+ | #[strong Emory University], as part of a survey of current parsing
+ | technologies #[+a("https://aclweb.org/anthology/P/P15/P15-1038.pdf") (Choi et al., 2015)].
+ | Their results and subsequent discussions helped us develop a novel
+ | psychologically-motivated technique to improve spaCy's accuracy, which
+ | we published in joint work with Macquarie University
+ | #[+a("https://aclweb.org/anthology/D/D15/D15-1162.pdf") (Honnibal and Johnson, 2015)].
+
+include _benchmarks-choi-2015
+
++h(3, "algorithm") Algorithm comparison
+
+p
+ | In this section, we compare spaCy's algorithms to recently published
+ | systems, using some of the most popular benchmarks. These benchmarks are
+ | designed to help isolate the contributions of specific algorithmic
+ | decisions, so they promote slightly "idealised" conditions. Specifically,
+ | the text comes pre-processed with "gold standard" token and sentence
+ | boundaries. The data sets also tend to be fairly small, to help
+ | researchers iterate quickly. These conditions mean the models trained on
+ | these data sets are not always useful for practical purposes.
+
++h(4, "parse-accuracy-penn") Parse accuracy (Penn Treebank / Wall Street Journal)
+
+p
+ | This is the "classic" evaluation, so it's the number parsing researchers
+ | are most easily able to put in context. However, it's quite far removed
+ | from actual usage: it uses sentences with gold-standard segmentation and
+ | tokenization, from a pretty specific type of text (articles from a single
+ | newspaper, 1984-1989).
+
++aside("Methodology")
+ | #[+a("http://arxiv.org/abs/1603.06042") Andor et al. (2016)] chose
+ | slightly different experimental conditions from
+ | #[+a("https://aclweb.org/anthology/P/P15/P15-1038.pdf") Choi et al. (2015)],
+ | so the two accuracy tables here do not present directly comparable
+ | figures.
+
++table(["System", "Year", "Type", "Accuracy"])
+ +row
+ +cell spaCy v2.0.0
+ +cell 2017
+ +cell neural
+ +cell.u-text-right 94.48
+
+ +row
+ +cell spaCy v1.1.0
+ +cell 2016
+ +cell linear
+ +cell.u-text-right 92.80
+
+ +row("divider")
+ +cell
+ +a("https://arxiv.org/pdf/1611.01734.pdf") Dozat and Manning
+ +cell 2017
+ +cell neural
+ +cell.u-text-right #[strong 95.75]
+
+ +row
+ +cell
+ +a("http://arxiv.org/abs/1603.06042") Andor et al.
+ +cell 2016
+ +cell neural
+ +cell.u-text-right 94.44
+
+ +row
+ +cell
+ +a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet Parsey McParseface
+ +cell 2016
+ +cell neural
+ +cell.u-text-right 94.15
+
+ +row
+ +cell
+ +a("http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43800.pdf") Weiss et al.
+ +cell 2015
+ +cell neural
+ +cell.u-text-right 93.91
+
+ +row
+ +cell
+ +a("http://research.google.com/pubs/archive/38148.pdf") Zhang and McDonald
+ +cell 2014
+ +cell linear
+ +cell.u-text-right 93.32
+
+ +row
+ +cell
+ +a("http://www.cs.cmu.edu/~ark/TurboParser/") Martins et al.
+ +cell 2013
+ +cell linear
+ +cell.u-text-right 93.10
+
++h(4, "ner-accuracy-ontonotes5") NER accuracy (OntoNotes 5, no pre-process)
+
+p
+  | This is the evaluation we use to tune spaCy's parameters and decide which
+  | algorithms are better than others. It's reasonably close to actual usage,
+ | because it requires the parses to be produced from raw text, without any
+ | pre-processing.
+
++table(["System", "Year", "Type", "Accuracy"])
+ +row
+ +cell spaCy #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] v2.0.0
+ +cell 2017
+ +cell neural
+ +cell.u-text-right 86.45
+
+ +row("divider")
+ +cell
+ +a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al.
+ +cell 2017
+ +cell neural
+ +cell.u-text-right #[strong 86.81]
+
+ +row
+ +cell
+ +a("https://www.semanticscholar.org/paper/Named-Entity-Recognition-with-Bidirectional-LSTM-C-Chiu-Nichols/10a4db59e81d26b2e0e896d3186ef81b4458b93f") Chiu and Nichols
+ +cell 2016
+ +cell neural
+ +cell.u-text-right 86.19
+
+ +row
+ +cell
+ +a("https://www.semanticscholar.org/paper/A-Joint-Model-for-Entity-Analysis-Coreference-Typi-Durrett-Klein/28eb033eee5f51c5e5389cbb6b777779203a6778") Durrett and Klein
+ +cell 2014
+ +cell neural
+ +cell.u-text-right 84.04
+
+ +row
+ +cell
+ +a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth
+ +cell 2009
+ +cell linear
+ +cell.u-text-right 83.45
+
++h(3, "spacy-models") Model comparison
+
+include _benchmarks-models
+
++h(3, "speed-comparison") Detailed speed comparison
+
+p
+ | Here we compare the per-document processing time of various spaCy
+ | functionalities against other NLP libraries. We show both absolute
+ | timings (in ms) and relative performance (normalized to spaCy). Lower is
+ | better.
+
++infobox("Important note", "β οΈ")
+ | This evaluation was conducted in 2015. We're working on benchmarks on
+ | current CPU and GPU hardware.
+
++aside("Methodology")
+  | #[strong Setup:] 100,000 plain-text documents were streamed from an
+  | SQLite3 database and processed with an NLP library, to one of three
+  | levels of detail – tokenization, tagging, or parsing. The tasks are
+  | additive: to parse the text you have to tokenize and tag it. The
+  | pre-processing was not subtracted from the times – we report the time
+  | required for the pipeline to complete. We report mean times per document,
+  | in milliseconds.#[br]#[br]
+ | #[strong Hardware]: Intel i7-3770 (2012)#[br]
+ | #[strong Implementation]: #[+src(gh("spacy-benchmarks")) #[code spacy-benchmarks]]
+
++table
+ +row.u-text-label.u-text-center
+ +head-cell
+ +head-cell(colspan="3") Absolute (ms per doc)
+ +head-cell(colspan="3") Relative (to spaCy)
+
+ +row
+ each column in ["System", "Tokenize", "Tag", "Parse", "Tokenize", "Tag", "Parse"]
+ +head-cell=column
+
+ +row
+ +cell #[strong spaCy]
+ each data in [ "0.2ms", "1ms", "19ms"]
+ +cell.u-text-right #[strong=data]
+
+ each data in ["1x", "1x", "1x"]
+ +cell.u-text-right=data
+
+ +row
+ +cell CoreNLP
+ each data in ["2ms", "10ms", "49ms", "10x", "10x", "2.6x"]
+ +cell.u-text-right=data
+ +row
+ +cell ZPar
+ each data in ["1ms", "8ms", "850ms", "5x", "8x", "44.7x"]
+ +cell.u-text-right=data
+ +row
+ +cell NLTK
+ each data in ["4ms", "443ms"]
+ +cell.u-text-right=data
+ +cell.u-text-right #[em n/a]
+ each data in ["20x", "443x"]
+ +cell.u-text-right=data
+ +cell.u-text-right #[em n/a]
diff --git a/website/usage/_facts-figures/_feature-comparison.jade b/website/usage/_facts-figures/_feature-comparison.jade
new file mode 100644
index 000000000..c8fa5ffbe
--- /dev/null
+++ b/website/usage/_facts-figures/_feature-comparison.jade
@@ -0,0 +1,58 @@
+//- π« DOCS > USAGE > FACTS & FIGURES > FEATURE COMPARISON
+
+p
+ | Here's a quick comparison of the functionalities offered by spaCy,
+ | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet],
+ | #[+a("http://www.nltk.org/py-modindex.html") NLTK] and
+ | #[+a("http://stanfordnlp.github.io/CoreNLP/") CoreNLP].
+
++table(["", "spaCy", "SyntaxNet", "NLTK", "CoreNLP"])
+ +row
+ +cell Programming language
+ each lang in ["Python", "C++", "Python", "Java"]
+ +cell.u-text-small.u-text-center=lang
+
+ +row
+ +cell Neural network models
+ each icon in ["pro", "pro", "con", "pro"]
+ +cell.u-text-center #[+procon(icon)]
+
+ +row
+ +cell Integrated word vectors
+ each icon in ["pro", "con", "con", "con"]
+ +cell.u-text-center #[+procon(icon)]
+
+ +row
+ +cell Multi-language support
+ each icon in ["pro", "pro", "pro", "pro"]
+ +cell.u-text-center #[+procon(icon)]
+
+ +row
+ +cell Tokenization
+ each icon in ["pro", "pro", "pro", "pro"]
+ +cell.u-text-center #[+procon(icon)]
+
+ +row
+ +cell Part-of-speech tagging
+ each icon in ["pro", "pro", "pro", "pro"]
+ +cell.u-text-center #[+procon(icon)]
+
+ +row
+ +cell Sentence segmentation
+ each icon in ["pro", "pro", "pro", "pro"]
+ +cell.u-text-center #[+procon(icon)]
+
+ +row
+ +cell Dependency parsing
+ each icon in ["pro", "pro", "con", "pro"]
+ +cell.u-text-center #[+procon(icon)]
+
+ +row
+ +cell Entity recognition
+ each icon in ["pro", "con", "pro", "pro"]
+ +cell.u-text-center #[+procon(icon)]
+
+ +row
+ +cell Coreference resolution
+ each icon in ["con", "con", "con", "pro"]
+ +cell.u-text-center #[+procon(icon)]
diff --git a/website/usage/_facts-figures/_other-libraries.jade b/website/usage/_facts-figures/_other-libraries.jade
new file mode 100644
index 000000000..427debb27
--- /dev/null
+++ b/website/usage/_facts-figures/_other-libraries.jade
@@ -0,0 +1,70 @@
+//- π« DOCS > USAGE > FACTS & FIGURES > OTHER LIBRARIES
+
+p
+ | Data scientists, researchers and machine learning engineers have
+ | converged on Python as the language for AI. This gives developers a rich
+ | ecosystem of NLP libraries to work with. Here's how we think the pieces
+ | fit together.
+
++aside("Using spaCy with other libraries")
+ | For details on how to use spaCy together with popular machine learning
+ | libraries like TensorFlow, Keras or PyTorch, see the
+ | #[+a("/usage/deep-learning") usage guide on deep learning].
+
++infobox
+ +infobox-logos(["nltk", 80, 25, "http://nltk.org"])
+ | #[+label-inline NLTK] offers some of the same functionality as spaCy.
+  | Although originally developed for teaching and research, its longevity
+  | and stability have resulted in a large number of industrial users. It's
+  | the main alternative to spaCy for tokenization and sentence segmentation.
+  | In comparison to spaCy, NLTK takes a much more "broad church" approach –
+ | so it has some functions that spaCy doesn't provide, at the expense of a
+ | bit more clutter to sift through. spaCy is also much more
+ | performance-focussed than NLTK: where the two libraries provide the same
+ | functionality, spaCy's implementation will usually be faster and more
+ | accurate.
+
++infobox
+ +infobox-logos(["gensim", 40, 40, "https://radimrehurek.com/gensim/"])
+ | #[+label-inline Gensim] provides unsupervised text modelling algorithms.
+ | Although Gensim isn't a runtime dependency of spaCy, we use it to train
+  | word vectors. There's almost no overlap between the libraries – the two
+ | work together.
+
++infobox
+ +infobox-logos(["tensorflow", 35, 42, "https://www.tensorflow.org"], ["keras", 45, 45, "https://www.keras.io"])
+  | #[+label-inline TensorFlow / Keras] is the most popular deep learning library.
+  | spaCy provides efficient and powerful feature extraction functionality
+  | that can be used as a pre-process to any deep learning library. You can
+  | also use TensorFlow and Keras to create spaCy pipeline components, to add
+ | annotations to the #[code Doc] object.
+
++infobox
+ +infobox-logos(["scikitlearn", 90, 44, "http://scikit-learn.org"])
+ | #[+label-inline scikit-learn] features a number of useful NLP functions,
+ | especially for solving text classification problems using linear models
+ | with bag-of-words features. If you know you need exactly that, it might
+ | be better to use scikit-learn's built-in pipeline directly. However, if
+ | you want to extract more detailed features, using part-of-speech tags,
+ | named entity labels, or string transformations, you can use spaCy as a
+ | pre-process in your classification system. scikit-learn also provides a
+ | lot of experiment management and evaluation utilities that people use
+ | alongside spaCy.
+
++infobox
+ +infobox-logos(["pytorch", 100, 48, "http://pytorch.org"], ["dynet", 80, 34, "http://dynet.readthedocs.io/"], ["chainer", 80, 43, "http://chainer.org"])
+ | #[+label-inline PyTorch, DyNet and Chainer] are dynamic neural network
+ | libraries, which can be much easier to work with for NLP. Outside of
+ | Google, there's a general shift among NLP researchers to both DyNet and
+  | PyTorch. spaCy is the front-end of choice for PyTorch's
+ | #[code torch.text] extension. You can use any of these libraries to
+ | create spaCy pipeline components, to add annotations to the #[code Doc]
+ | object.
+
++infobox
+ +infobox-logos(["allennlp", 124, 22, "http://allennlp.org"])
+ | #[+label-inline AllenNLP] is a new library designed to accelerate NLP
+ | research, by providing a framework that supports modern deep learning
+ | workflows for cutting-edge language understanding problems. AllenNLP uses
+ | spaCy as a preprocessing component. You can also use AllenNLP to develop
+ | spaCy pipeline components, to add annotations to the #[code Doc] object.
diff --git a/website/usage/_install/_changelog.jade b/website/usage/_install/_changelog.jade
new file mode 100644
index 000000000..e966b6695
--- /dev/null
+++ b/website/usage/_install/_changelog.jade
@@ -0,0 +1,31 @@
+//- π« DOCS > USAGE > INSTALL > CHANGELOG
+
++h(2, "changelog") Changelog
+ +button(gh("spacy") + "/releases", false, "secondary", "small").u-float-right.u-nowrap View releases
+
+div(data-tpl="changelog" data-tpl-key="error")
+ +infobox
+ | Unable to load changelog from GitHub. Please see the
+ | #[+a(gh("spacy") + "/releases") releases page] instead.
+
+section(data-tpl="changelog" data-tpl-key="table" style="display: none")
+ +table(["Date", "Version", "Title"])
+ tbody(data-tpl="changelog" data-tpl-key="releases")
+ +row(data-tpl="changelog" data-tpl-key="item")
+ +cell.u-nowrap
+ +label(data-changelog="date")
+ +cell(data-changelog="tag")
+ +cell.u-text-small(data-changelog="title")
+
+ +h(3) Pre-releases
+
+ +aside("About pre-releases")
+ .o-block-small
+ | Pre-releases include alpha and beta versions, as well as release
+ | candidates. They are not intended for production use. You can
+ | download spaCy pre-releases via the #[code spacy-nightly] package
+ | on pip.
+ +badge("https://img.shields.io/pypi/v/spacy-nightly.svg?style=flat-square", "https://pypi.python.org/pypi/spacy-nightly")
+
+ +table(["Date", "Version", "Title"])
+ tbody(data-tpl="changelog" data-tpl-key="prereleases")
diff --git a/website/usage/_install/_instructions.jade b/website/usage/_install/_instructions.jade
new file mode 100644
index 000000000..10132a646
--- /dev/null
+++ b/website/usage/_install/_instructions.jade
@@ -0,0 +1,185 @@
+//- π« DOCS > USAGE > INSTALL > INSTRUCTIONS
+
++h(3, "pip") pip
+ +badge("https://img.shields.io/pypi/v/spacy.svg?style=flat-square", "https://pypi.python.org/pypi/spacy")
+
+p Using pip, spaCy releases are currently only available as source packages.
+
++code(false, "bash").
+ pip install -U spacy
+
++aside("Download models")
+ | After installation you need to download a language model. For more info
+ | and available models, see the #[+a("/usage/models") docs on models].
+
+ +code.o-no-block.
+ spacy download en
+
+ >>> import spacy
+ >>> nlp = spacy.load('en')
+
+p
+ | When using pip it is generally recommended to install packages in a
+ | #[code virtualenv] to avoid modifying system state:
+
++code(false, "bash").
+ virtualenv .env
+ source .env/bin/activate
+ pip install spacy
+
++h(3, "conda") conda
+ +badge("https://anaconda.org/conda-forge/spacy/badges/version.svg", "https://anaconda.org/conda-forge/spacy")
+
+p
+ | Thanks to our great community, we've finally re-added conda support. You
+ | can now install spaCy via #[code conda-forge]:
+
++code(false, "bash").
+ conda config --add channels conda-forge
+ conda install spacy
+
+p
+ | For the feedstock including the build recipe and configuration, check out
+ | #[+a("https://github.com/conda-forge/spacy-feedstock") this repository].
+ | Improvements and pull requests to the recipe and setup are always
+ | appreciated.
+
++h(3, "gpu") Run spaCy with GPU
+
+p
+  | As of v2.0, spaCy comes with neural network models that are implemented
+ | in our machine learning library, #[+a(gh("thinc")) Thinc]. For GPU
+ | support, we've been grateful to use the work of
+ | #[+a("http://chainer.org") Chainer]'s CuPy module, which provides
+ | a NumPy-compatible interface for GPU arrays.
+
+p
+  | First, install CUDA, following the normal installation procedure. Next, set
+ | your environment variables so that the installation will be able to find
+ | CUDA. Finally, install spaCy.
+
++code(false, "bash").
+ export CUDA_HOME=/usr/local/cuda-8.0 # Or wherever your CUDA is
+ export PATH=$PATH:$CUDA_HOME/bin
+
+ pip install spacy
+ python -c "import thinc.neural.gpu_ops" # Check the GPU ops were built
+
++h(3, "source") Compile from source
+
+p
+ | The other way to install spaCy is to clone its
+ | #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
+ | the common way if you want to make changes to the code base. You'll need
+ | to make sure that you have a development environment consisting of a
+ | Python distribution including header files, a compiler,
+ | #[+a("https://pip.pypa.io/en/latest/installing/") pip],
+ | #[+a("https://virtualenv.pypa.io/") virtualenv] and
+ | #[+a("https://git-scm.com") git] installed. The compiler part is the
+ | trickiest. How to do that depends on your system. See notes on
+ | #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") OS X] and
+ | #[a(href="#source-windows") Windows] for details.
+
++code(false, "bash").
+ # make sure you are using recent pip/virtualenv versions
+ python -m pip install -U pip virtualenv
+ git clone #{gh("spaCy")}
+ cd spaCy
+
+ virtualenv .env
+ source .env/bin/activate
+ pip install -r requirements.txt
+ pip install -e .
+
+p
+ | Compared to regular install via pip,
+ | #[+a(gh("spaCy", "requirements.txt")) requirements.txt]
+ | additionally installs developer dependencies such as Cython.
+
+p
+ | Instead of the above verbose commands, you can also use the following
+ | #[+a("http://www.fabfile.org/") Fabric] commands:
+
++table(["Command", "Description"])
+ +row
+ +cell #[code fab env]
+ +cell Create #[code virtualenv] and delete previous one, if it exists.
+
+ +row
+ +cell #[code fab make]
+ +cell Compile the source.
+
+ +row
+ +cell #[code fab clean]
+ +cell Remove compiled objects, including the generated C++.
+
+ +row
+ +cell #[code fab test]
+ +cell Run basic tests, aborting after first failure.
+
+p
+ | All commands assume that your #[code virtualenv] is located in a
+ | directory #[code .env]. If you're using a different directory, you can
+ | change it via the environment variable #[code VENV_DIR], for example:
+
++code(false, "bash").
+ VENV_DIR=".custom-env" fab clean make
+
++h(4, "source-ubuntu") Ubuntu
+
+p Install system-level dependencies via #[code apt-get]:
+
++code(false, "bash").
+ sudo apt-get install build-essential python-dev git
+
++h(4, "source-osx") macOS / OS X
+
+p
+ | Install a recent version of
+ | #[+a("https://developer.apple.com/xcode/") XCode], including the
+ | so-called "Command Line Tools". macOS and OS X ship with Python and git
+ | preinstalled. To compile spaCy with multi-threading support on macOS / OS X,
+ | #[+a("https://github.com/explosion/spaCy/issues/267") see here].
+
++h(4, "source-windows") Windows
+
+p
+ | Install a version of
+ | #[+a("https://www.visualstudio.com/vs/visual-studio-express/") Visual Studio Express]
+ | that matches the version that was used to compile your Python
+ | interpreter. For official distributions these are:
+
++table([ "Distribution", "Version"])
+ +row
+ +cell Python 2.7
+ +cell Visual Studio 2008
+
+ +row
+ +cell Python 3.4
+ +cell Visual Studio 2010
+
+ +row
+ +cell Python 3.5+
+ +cell Visual Studio 2015
+
++h(3, "tests") Run tests
+
+p
+ | spaCy comes with an #[+a(gh("spacy", "spacy/tests")) extensive test suite].
+ | First, find out where spaCy is installed:
+
++code(false, "bash").
+ python -c "import os; import spacy; print(os.path.dirname(spacy.__file__))"
+
+p
+ | Then run #[code pytest] on that directory. The flags #[code --slow] and
+ | #[code --model] are optional and enable additional tests.
+
++code(false, "bash").
+ # make sure you are using recent pytest version
+ python -m pip install -U pytest
+
+ python -m pytest <spacy-directory> # basic tests
+ python -m pytest <spacy-directory> --slow # basic and slow tests
+ python -m pytest <spacy-directory> --models --all # basic and all model tests
+ python -m pytest <spacy-directory> --models --en # basic and English model tests
diff --git a/website/usage/_install/_quickstart.jade b/website/usage/_install/_quickstart.jade
new file mode 100644
index 000000000..8e581994c
--- /dev/null
+++ b/website/usage/_install/_quickstart.jade
@@ -0,0 +1,26 @@
+//- π« DOCS > USAGE > INSTALL > QUICKSTART
+
+- QUICKSTART[QUICKSTART.length - 1].options = Object.keys(MODELS).map(m => ({ id: m, title: LANGUAGES[m] }))
+
++quickstart(QUICKSTART, "Quickstart")
+ +qs({config: 'venv', python: 2}) python -m pip install -U virtualenv
+  +qs({config: 'venv', python: 2}) virtualenv .env
+  +qs({config: 'venv', python: 3}) python -m venv .env
+ +qs({config: 'venv', os: 'mac'}) source .env/bin/activate
+ +qs({config: 'venv', os: 'linux'}) source .env/bin/activate
+ +qs({config: 'venv', os: 'windows'}) .env\Scripts\activate
+
+ +qs({config: 'gpu', os: 'mac'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
+ +qs({config: 'gpu', os: 'linux'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
+
+ +qs({package: 'pip'}) pip install -U spacy
+ +qs({package: 'conda'}) conda install -c conda-forge spacy
+
+ +qs({package: 'source'}) git clone https://github.com/explosion/spaCy
+ +qs({package: 'source'}) cd spaCy
+ +qs({package: 'source'}) pip install -r requirements.txt
+ +qs({package: 'source'}) pip install -e .
+
+ for _, model in MODELS
+ +qs({model: model}) spacy download #{model}
diff --git a/website/usage/_install/_troubleshooting.jade b/website/usage/_install/_troubleshooting.jade
new file mode 100644
index 000000000..9fb92f17b
--- /dev/null
+++ b/website/usage/_install/_troubleshooting.jade
@@ -0,0 +1,147 @@
+//- π« DOCS > USAGE > INSTALL > TROUBLESHOOTING
+
+p
+ | This section collects some of the most common errors you may come
+ | across when installing, loading and using spaCy, as well as their solutions.
+
++aside("Help us improve this guide")
+ | Did you come across a problem like the ones listed here and want to
+  | share the solution? The "Suggest edits" button at the bottom of this
+  | page points you to the source. We always
+ | appreciate #[+a(gh("spaCy") + "/pulls") pull requests]!
+
++h(3, "compatible-model") No compatible model found
+
++code(false, "text").
+ No compatible model found for [lang] (spaCy v#{SPACY_VERSION}).
+
+p
+ | This usually means that the model you're trying to download does not
+ | exist, or isn't available for your version of spaCy. Check the
+ | #[+a(gh("spacy-models", "compatibility.json")) compatibility table]
+ | to see which models are available for your spaCy version. If you're using
+ | an old version, consider upgrading to the latest release. Note that while
+ | spaCy supports tokenization for
+ | #[+a("/usage/models/#languages") a variety of languages],
+ | not all of them come with statistical models. To only use the tokenizer,
+ | import the language's #[code Language] class instead, for example
+  | #[code from spacy.lang.fr import French].
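+
+p
+  | For example, a minimal sketch of using just the French tokenizer – any
+  | available language class can be used in the same way:
+
++code.
+  from spacy.lang.fr import French
+  nlp = French()                        # tokenizer only, no statistical model
+  doc = nlp(u"C'est une phrase.")
+  print([token.text for token in doc])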
+
++h(3, "symlink-privilege") Symbolic link privilege not held
+
++code(false, "text").
+ OSError: symbolic link privilege not held
+
+p
+ | To create #[+a("/usage/models/#usage") shortcut links] that let you
+ | load models by name, spaCy creates a symbolic link in the
+ | #[code spacy/data] directory. This means your user needs permission to do
+ | this. The above error mostly occurs when doing a system-wide installation,
+ | which will create the symlinks in a system directory. Run the
+ | #[code download] or #[code link] command as administrator, or use a
+ | #[code virtualenv] to install spaCy in a user directory, instead
+ | of doing a system-wide installation.
+
++h(3, "no-cache-dir") No such option: --no-cache-dir
+
++code(false, "text").
+ no such option: --no-cache-dir
+
+p
+ | The #[code download] command uses pip to install the models and sets the
+ | #[code --no-cache-dir] flag to prevent it from requiring too much memory.
+ | #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting]
+ | requires pip v6.0 or newer. Run #[code pip install -U pip] to upgrade to
+ | the latest version of pip. To see which version you have installed,
+ | run #[code pip --version].
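+
+p
+  | For example (standard pip commands, shown here as a quick sketch):
+
++code(false, "bash").
+  pip --version                  # check which pip version you have
+  python -m pip install -U pip   # upgrade pip itself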
+
++h(3, "import-error") Import error
+
++code(false, "text").
+ Import Error: No module named spacy
+
+p
+ | This error means that the spaCy module can't be located on your system, or in
+ | your environment. Make sure you have spaCy installed. If you're using a
+ | #[code virtualenv], make sure it's activated and check that spaCy is
+  | installed in that environment – otherwise, you're trying to load a system
+ | installation. You can also run #[code which python] to find out where
+ | your Python executable is located.
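+
+p
+  | A quick way to check both (standard commands, shown as a sketch – not
+  | spaCy-specific):
+
++code(false, "bash").
+  which python      # path of the Python executable you're running
+  pip show spacy    # version and install location of spaCy, if installed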
+
++h(3, "import-error-models") Import error: models
+
++code(false, "text").
+ ImportError: No module named 'en_core_web_sm'
+
+p
+ | As of spaCy v1.7, all models can be installed as Python packages. This means
+ | that they'll become importable modules of your application. When creating
+ | #[+a("/usage/models/#usage") shortcut links], spaCy will also try
+ | to import the model to load its meta data. If this fails, it's usually a
+ | sign that the package is not installed in the current environment.
+ | Run #[code pip list] or #[code pip freeze] to check which model packages
+ | you have installed, and install the
+ | #[+a("/models") correct models] if necessary. If you're
+ | importing a model manually at the top of a file, make sure to use the name
+ | of the package, not the shortcut link you've created.
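+
+p
+  | For example, assuming the #[code en_core_web_sm] package is installed
+  | (a sketch – substitute the model you're actually using):
+
++code.
+  import en_core_web_sm           # the package name, not the shortcut link
+  nlp = en_core_web_sm.load()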
+
++h(3, "vocab-strings") File not found: vocab/strings.json
+
++code(false, "text").
+ FileNotFoundError: No such file or directory: [...]/vocab/strings.json
+
+p
+ | This error may occur when using #[code spacy.load()] to load
+  | a language model – either because you haven't set up a
+ | #[+a("/usage/models/#usage") shortcut link] for it, or because it
+ | doesn't actually exist. Set up a
+ | #[+a("/usage/models/#usage") shortcut link] for the model
+ | you want to load. This can either be an installed model package, or a
+ | local directory containing the model data. If you want to use one of the
+ | #[+a("/usage/models#languages") alpha tokenizers] for
+ | languages that don't yet have a statistical model, you should import its
+ | #[code Language] class instead, for example
+ | #[code from spacy.lang.bn import Bengali].
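+
+p
+  | For example, assuming the #[code en_core_web_sm] package is installed
+  | (a sketch – substitute your own model or data directory):
+
++code(false, "bash").
+  python -m spacy link en_core_web_sm en
+
++code.
+  import spacy
+  nlp = spacy.load('en')                         # via the shortcut link
+  # nlp = spacy.load('/path/to/en_core_web_sm')  # or point at a data directory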
+
++h(3, "command-not-found") Command not found
+
++code(false, "text").
+ command not found: spacy
+
+p
+ | This error may occur when running the #[code spacy] command from the
+  | command line. spaCy does not currently add an entry to your #[code PATH]
+ | environment variable, as this can lead to unexpected results, especially
+ | when using #[code virtualenv]. Instead, spaCy adds an auto-alias that
+ | maps #[code spacy] to #[code python -m spacy]. If this is not working as
+  | expected, run the command with #[code python -m] yourself –
+ | for example #[code python -m spacy download en]. For more info on this,
+ | see #[+api("cli#download") download].
+
++h(3, "module-load") 'module' object has no attribute 'load'
+
++code(false, "text").
+ AttributeError: 'module' object has no attribute 'load'
+
+p
+ | While this could technically have many causes, including spaCy being
+ | broken, the most likely one is that your script's file or directory name
+ | is "shadowing" the module β e.g. your file is called #[code spacy.py],
+ | or a directory you're importing from is called #[code spacy]. So, when
+ | using spaCy, never call anything else #[code spacy].
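+
+p
+  | A quick way to check what you're actually importing (a sketch using
+  | standard Python only):
+
++code(false, "bash").
+  # if this prints a path to your own spacy.py, the module is being shadowed
+  python -c "import spacy; print(spacy.__file__)"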
+
++h(3, "pron-lemma") Pronoun lemma is returned as #[code -PRON-]
+
++code.
+ doc = nlp(u'They are')
+ print(doc[0].lemma_)
+ # -PRON-
+
+p
+ | This is in fact expected behaviour and not a bug.
+ | Unlike verbs and common nouns, there's no clear base form of a personal
+ | pronoun. Should the lemma of "me" be "I", or should we normalize person
+  | as well, giving "it" – or maybe "he"? spaCy's solution is to introduce a
+ | novel symbol, #[code -PRON-], which is used as the lemma for
+ | all personal pronouns. For more info on this, see the
+ | #[+api("annotation#lemmatization") annotation specs] on lemmatization.
diff --git a/website/docs/usage/dependency-parse.jade b/website/usage/_linguistic-features/_dependency-parse.jade
similarity index 93%
rename from website/docs/usage/dependency-parse.jade
rename to website/usage/_linguistic-features/_dependency-parse.jade
index beae36578..85d9179df 100644
--- a/website/docs/usage/dependency-parse.jade
+++ b/website/usage/_linguistic-features/_dependency-parse.jade
@@ -1,6 +1,4 @@
-//- π« DOCS > USAGE > DEPENDENCY PARSE
-
-include ../../_includes/_mixins
+//- π« DOCS > USAGE > LINGUISTIC FEATURES > DEPENDENCY PARSE
p
| spaCy features a fast and accurate syntactic dependency parser, and has
@@ -11,8 +9,7 @@ p
| boolean value. If this attribute is #[code False], the default sentence
| iterator will raise an exception.
-+h(2, "noun-chunks") Noun chunks
- +tag-model("dependency parse")
++h(3, "noun-chunks") Noun chunks
p
| Noun chunks are "base noun phrases" β flat phrases that have a noun as
@@ -41,7 +38,7 @@ p
+annotation-row(["insurance liability", "liability", "dobj", "shift"], style)
+annotation-row(["manufacturers", "manufacturers", "pobj", "toward"], style)
-+h(2, "navigating") Navigating the parse tree
++h(3, "navigating") Navigating the parse tree
p
| spaCy uses the terms #[strong head] and #[strong child] to describe the words
@@ -110,7 +107,7 @@ p
| attribute, which provides a sequence of #[+api("token") #[code Token]]
| objects.
-+h(3, "navigating-around") Iterating around the local tree
++h(4, "navigating-around") Iterating around the local tree
p
| A few more convenience attributes are provided for iterating around the
@@ -135,7 +132,7 @@ p
| method.
+aside("Projective vs. non-projective")
- | For the #[+a("/docs/usage/models#available") default English model], the
+ | For the #[+a("/models/en") default English model], the
| parse tree is #[strong projective], which means that there are no crossing
| brackets. The tokens returned by #[code .subtree] are therefore guaranteed
| to be contiguous. This is not true for the German model, which has many
@@ -181,7 +178,7 @@ p
+annotation-row(["their", "ADJ", "poss", "requests"], style)
+annotation-row(["requests", "NOUN", "dobj", "submit"], style)
-+h(2, "displacy") Visualizing dependencies
++h(3, "displacy") Visualizing dependencies
p
| The best way to understand spaCy's dependency parser is interactively.
@@ -201,14 +198,14 @@ p
+infobox
| For more details and examples, see the
- | #[+a("/docs/usage/visualizers") usage guide on visualizing spaCy]. You
+ | #[+a("/usage/visualizers") usage guide on visualizing spaCy]. You
| can also test displaCy in our #[+a(DEMOS_URL + "/displacy", true) online demo].
-+h(2, "disabling") Disabling the parser
++h(3, "disabling") Disabling the parser
p
- | In the #[+a("/docs/usage/models/available") default models], the parser
- | is loaded and enabled as part of the
+ | In the #[+a("/models") default models], the parser is loaded and enabled
+ | as part of the
| #[+a("docs/usage/language-processing-pipelines") standard processing pipeline].
| If you don't need any of the syntactic information, you should disable
| the parser. Disabling the parser will make spaCy load and run much faster.
@@ -225,7 +222,7 @@ p
| Since spaCy v2.0 comes with better support for customising the
| processing pipeline components, the #[code parser] keyword argument
| has been replaced with #[code disable], which takes a list of
- | #[+a("/docs/usage/language-processing-pipeline") pipeline component names].
+ | #[+a("/usage/processing-pipelines") pipeline component names].
| This lets you disable both default and custom components when loading
| a model, or initialising a Language class via
| #[+api("language-from_disk") #[code from_disk]].
diff --git a/website/docs/usage/entity-recognition.jade b/website/usage/_linguistic-features/_named-entities.jade
similarity index 74%
rename from website/docs/usage/entity-recognition.jade
rename to website/usage/_linguistic-features/_named-entities.jade
index 826de1543..f42df3342 100644
--- a/website/docs/usage/entity-recognition.jade
+++ b/website/usage/_linguistic-features/_named-entities.jade
@@ -1,6 +1,4 @@
-//- π« DOCS > USAGE > NAMED ENTITY RECOGNITION
-
-include ../../_includes/_mixins
+//- π« DOCS > USAGE > LINGUISTIC FEATURES > NAMED ENTITY RECOGNITION
p
| spaCy features an extremely fast statistical entity recognition system,
@@ -9,12 +7,11 @@ p
| locations, organizations and products. You can add arbitrary classes to
| the entity recognition system, and update the model with new examples.
-+h(2, "101") Named Entity Recognition 101
- +tag-model("named entities")
++h(3, "101") Named Entity Recognition 101
-include _spacy-101/_named-entities
+include ../_spacy-101/_named-entities
-+h(2, "accessing") Accessing entity annotations
++h(3, "accessing") Accessing entity annotations
p
| The standard way to access entity annotations is the
@@ -62,7 +59,7 @@ p
+annotation-row(["delivery", 2, "O", '""', "outside an entity"], style)
+annotation-row(["robots", 2, "O", '""', "outside an entity"], style)
-+h(2, "setting") Setting entity annotations
++h(3, "setting") Setting entity annotations
p
| To ensure that the sequence of token annotations remains consistent, you
@@ -92,7 +89,7 @@ p
| but at the document level, the entity will have the start and end
| indices #[code (0, 7)].
-+h(3, "setting-from-array") Setting entity annotations from array
++h(4, "setting-from-array") Setting entity annotations from array
p
| You can also assign entity annotations using the
@@ -114,7 +111,7 @@ p
doc.from_array(header, attr_array)
assert list(doc.ents)[0].text == u'London'
-+h(3, "setting-cython") Setting entity annotations in Cython
++h(4, "setting-cython") Setting entity annotations in Cython
p
| Finally, you can always write to the underlying struct, if you compile
@@ -137,18 +134,16 @@ p
| you'll have responsibility for ensuring that the data is left in a
| consistent state.
-+h(2, "entity-types") Built-in entity types
++h(3, "entity-types") Built-in entity types
+aside("Tip: Understanding entity types")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of an entity label. For example,
| #[code spacy.explain("LANGUAGE")] will return "any named language".
-include ../api/_annotation/_named-entities
+include ../../api/_annotation/_named-entities
-+h(2, "updating") Training and updating
-
-+under-construction
++h(3, "updating") Training and updating
p
| To provide training examples to the entity recogniser, you'll first need
@@ -166,65 +161,24 @@ p
+code.
doc = Doc(nlp.vocab, [u'rats', u'make', u'good', u'pets'])
- gold = GoldParse(doc, [u'U-ANIMAL', u'O', u'O', u'O'])
+ gold = GoldParse(doc, entities=[u'U-ANIMAL', u'O', u'O', u'O'])
+infobox
| For more details on #[strong training and updating] the named entity
- | recognizer, see the usage guides on #[+a("/docs/usage/training") training]
- | and #[+a("/docs/usage/training-ner") training the named entity recognizer],
+ | recognizer, see the usage guides on #[+a("/usage/training") training]
| or check out the runnable
| #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
| on GitHub.
-+h(3, "updating-biluo") The BILUO Scheme
++h(4, "updating-biluo") The BILUO Scheme
p
| You can also provide token-level entity annotation, using the
| following tagging scheme to describe the entity boundaries:
-+table([ "Tag", "Description" ])
- +row
- +cell #[code #[span.u-color-theme B] EGIN]
- +cell The first token of a multi-token entity.
+include ../../api/_annotation/_biluo
- +row
- +cell #[code #[span.u-color-theme I] N]
- +cell An inner token of a multi-token entity.
-
- +row
- +cell #[code #[span.u-color-theme L] AST]
- +cell The final token of a multi-token entity.
-
- +row
- +cell #[code #[span.u-color-theme U] NIT]
- +cell A single-token entity.
-
- +row
- +cell #[code #[span.u-color-theme O] UT]
- +cell A non-entity token.
-
-+aside("Why BILUO, not IOB?")
- | There are several coding schemes for encoding entity annotations as
- | token tags. These coding schemes are equally expressive, but not
- | necessarily equally learnable.
- | #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
- | showed that the minimal #[strong Begin], #[strong In], #[strong Out]
- | scheme was more difficult to learn than the #[strong BILUO] scheme that
- | we use, which explicitly marks boundary tokens.
-
-p
- | spaCy translates the character offsets into this scheme, in order to
- | decide the cost of each action given the current state of the entity
- | recogniser. The costs are then used to calculate the gradient of the
- | loss, to train the model. The exact algorithm is a pastiche of
- | well-known methods, and is not currently described in any single
- | publication. The model is a greedy transition-based parser guided by a
- | linear model whose weights are learned using the averaged perceptron
- | loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle]
- | imitation learning strategy. The transition system is equivalent to the
- | BILOU tagging scheme.
-
-+h(2, "displacy") Visualizing named entities
++h(3, "displacy") Visualizing named entities
p
| The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer]
@@ -238,7 +192,7 @@ p
p
| For more details and examples, see the
- | #[+a("/docs/usage/visualizers") usage guide on visualizing spaCy].
+ | #[+a("/usage/visualizers") usage guide on visualizing spaCy].
+code("Named Entity example").
import spacy
diff --git a/website/docs/usage/pos-tagging.jade b/website/usage/_linguistic-features/_pos-tagging.jade
similarity index 76%
rename from website/docs/usage/pos-tagging.jade
rename to website/usage/_linguistic-features/_pos-tagging.jade
index effc185e9..4e845cdaf 100644
--- a/website/docs/usage/pos-tagging.jade
+++ b/website/usage/_linguistic-features/_pos-tagging.jade
@@ -1,20 +1,10 @@
-//- π« DOCS > USAGE > PART-OF-SPEECH TAGGING
+//- π« DOCS > USAGE > LINGUISTIC FEATURES > PART-OF-SPEECH TAGGING
-include ../../_includes/_mixins
+include ../_spacy-101/_pos-deps
-p
- | Part-of-speech tags are labels like noun, verb, adjective etc that are
- | assigned to each token in the document. They're useful in rule-based
- | processes. They can also be useful features in some statistical models.
+//-+aside("Help β spaCy's output is wrong!")
-+h(2, "101") Part-of-speech tagging 101
- +tag-model("tagger", "dependency parse")
-
-include _spacy-101/_pos-deps
-
-+aside("Help β spaCy's output is wrong!")
-
-+h(2, "rule-based-morphology") Rule-based morphology
++h(3, "rule-based-morphology") Rule-based morphology
p
| Inflectional morphology is the process by which a root form of a word is
@@ -54,7 +44,7 @@ p
+list("numbers")
+item
| The tokenizer consults a
- | #[+a("/docs/usage/adding-languages#tokenizer-exceptions") mapping table]
+ | #[+a("/usage/adding-languages#tokenizer-exceptions") mapping table]
| #[code TOKENIZER_EXCEPTIONS], which allows sequences of characters
| to be mapped to multiple tokens. Each token may be assigned a part
| of speech and one or more morphological features.
@@ -68,7 +58,7 @@ p
+item
| For words whose POS is not set by a prior process, a
- | #[+a("/docs/usage/adding-languages#tag-map") mapping table]
+ | #[+a("/usage/adding-languages#tag-map") mapping table]
| #[code TAG_MAP] maps the tags to a part-of-speech and a set of
| morphological features.
@@ -80,6 +70,4 @@ p
| list-based exception files, acquired from
| #[+a("https://wordnet.princeton.edu/") WordNet].
-+h(2, "pos-schemes") Part-of-speech tag schemes
-
-include ../api/_annotation/_pos-tags
+include ../../api/_annotation/_pos-tags
diff --git a/website/docs/usage/rule-based-matching.jade b/website/usage/_linguistic-features/_rule-based-matching.jade
similarity index 95%
rename from website/docs/usage/rule-based-matching.jade
rename to website/usage/_linguistic-features/_rule-based-matching.jade
index 71400ea55..88a713ffc 100644
--- a/website/docs/usage/rule-based-matching.jade
+++ b/website/usage/_linguistic-features/_rule-based-matching.jade
@@ -1,19 +1,18 @@
//- π« DOCS > USAGE > RULE-BASED MATCHING
-include ../../_includes/_mixins
-
p
- | spaCy features a rule-matching engine that operates over tokens, similar
+ | spaCy features a rule-matching engine, the #[+api("matcher") #[code Matcher]],
+ | that operates over tokens, similar
| to regular expressions. The rules can refer to token annotations (e.g.
| the token #[code text] or #[code tag_], and flags (e.g. #[code IS_PUNCT]).
| The rule matcher also lets you pass in a custom callback
| to act on matches β for example, to merge entities and apply custom labels.
| You can also associate patterns with entity IDs, to allow some basic
- | entity linking or disambiguation.
+ | entity linking or disambiguation. To match large terminology lists,
+ | you can use the #[+api("phrasematcher") #[code PhraseMatcher]], which
+ | accepts #[code Doc] objects as match patterns.
-//-+aside("What about \"real\" regular expressions?")
-
-+h(2, "adding-patterns") Adding patterns
++h(3, "adding-patterns") Adding patterns
p
| Let's say we want to enable spaCy to find a combination of three tokens:
@@ -76,7 +75,7 @@ p
| other pattern types. You shouldn't have to create different matchers for
| each of those processes.
-+h(2, "on_match") Adding #[code on_match] rules
++h(3, "on_match") Adding #[code on_match] rules
p
| To move on to a more realistic example, let's say you're working with a
@@ -142,7 +141,7 @@ p
options={'ents': ['EVENT']})
| For more info and examples, see the usage guide on
- | #[+a("/docs/usage/visualizers") visualizing spaCy].
+ | #[+a("/usage/visualizers") visualizing spaCy].
p
| We can now call the matcher on our documents. The patterns will be
@@ -184,7 +183,7 @@ p
| A list of #[code (match_id, start, end)] tuples, describing the
| matches. A match tuple describes a span #[code doc[start:end]].
-+h(2, "quantifiers") Using operators and quantifiers
++h(3, "quantifiers") Using operators and quantifiers
p
| The matcher also lets you use quantifiers, specified as the #[code 'OP']
@@ -221,7 +220,7 @@ p
+cell match 0 or 1 times
+cell optional, max one
-+h(2, "example1") Example: Using linguistic annotations
++h(3, "example1") Example: Using linguistic annotations
p
| Let's say you're analysing user comments and you want to find out what
@@ -246,13 +245,13 @@ p
p
| To get a quick overview of the results, you could collect all sentences
| containing a match and render them with the
- | #[+a("/docs/usage/visualizers") displaCy visualizer].
+ | #[+a("/usage/visualizers") displaCy visualizer].
| In the callback function, you'll have access to the #[code start] and
| #[code end] of each match, as well as the parent #[code Doc]. This lets
| you determine the sentence containing the match,
| #[code doc[start : end].sent], and calculate the start and end of the
| matched span within the sentence. Using displaCy in
- | #[+a("/docs/usage/visualizers#manual-usage") "manual" mode] lets you
+ | #[+a("/usage/visualizers#manual-usage") "manual" mode] lets you
| pass in a list of dictionaries containing the text and entities to render.
+code.
@@ -283,7 +282,7 @@ p
# set manual=True to make displaCy render straight from a dictionary
displacy.serve(matched_sents, style='ent', manual=True)
-+h(2, "example2") Example: Phone numbers
++h(3, "example2") Example: Phone numbers
p
| Phone numbers can have many different formats and matching them is often
@@ -321,7 +320,7 @@ p
| extend, and doesn't require any training data β only a set of
| test cases.
-+h(2, "example3") Example: Hashtags and emoji on social media
++h(3, "example3") Example: Hashtags and emoji on social media
p
| Social media posts, especially tweets, can be difficult to work with.
diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/usage/_linguistic-features/_tokenization.jade
similarity index 76%
rename from website/docs/usage/customizing-tokenizer.jade
rename to website/usage/_linguistic-features/_tokenization.jade
index 0bc81771d..182bc31e9 100644
--- a/website/docs/usage/customizing-tokenizer.jade
+++ b/website/usage/_linguistic-features/_tokenization.jade
@@ -1,6 +1,4 @@
-//- π« DOCS > USAGE > TOKENIZER
-
-include ../../_includes/_mixins
+//- π« DOCS > USAGE > LINGUISTIC FEATURES > TOKENIZATION
p
| Tokenization is the task of splitting a text into meaningful segments,
@@ -11,15 +9,14 @@ p
| #[code spaces] booleans, which allow you to maintain alignment of the
| tokens into the original string.
-+h(2, "101") Tokenizer 101
+include ../_spacy-101/_tokenization
-include _spacy-101/_tokenization
-
-+h(3, "101-data") Tokenizer data
++h(4, "101-data") Tokenizer data
p
| #[strong Global] and #[strong language-specific] tokenizer data is
- | supplied via the language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang].
+ | supplied via the language data in
+ | #[+src(gh("spaCy", "spacy/lang")) #[code spacy/lang]].
| The tokenizer exceptions define special cases like "don't" in English,
| which needs to be split into two tokens: #[code {ORTH: "do"}] and
| #[code {ORTH: "n't", LEMMA: "not"}]. The prefixes, suffixes and infixes
@@ -27,16 +24,14 @@ p
| (at the end of a sentence), and when to leave token containing periods
| intact (abbreviations like "U.S.").
-+image
- include ../../assets/img/docs/language_data.svg
- .u-text-right
- +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
++graphic("/assets/img/language_data.svg")
+ include ../../assets/img/language_data.svg
+infobox
| For more details on the language-specific data, see the
- | usage guide on #[+a("/docs/usage/adding-languages") adding languages].
+ | usage guide on #[+a("/usage/adding-languages") adding languages].
-+h(2, "special-cases") Adding special case tokenization rules
++h(3, "special-cases") Adding special case tokenization rules
p
| Most domains have at least some idiosyncrasies that require custom
@@ -46,7 +41,7 @@ p
+aside("Language data vs. custom tokenization")
| Tokenization rules that are specific to one language, but can be
| #[strong generalised across that language] should ideally live in the
- | language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang] βΒ we
+  | language data in #[+src(gh("spaCy", "spacy/lang")) #[code spacy/lang]] – we
| always appreciate pull requests! Anything that's specific to a domain or
| text type β like financial trading abbreviations, or Bavarian youth slang
| β should be added as a special case rule to your tokenizer instance. If
@@ -69,9 +64,12 @@ p
special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}]
nlp.tokenizer.add_special_case(u'gimme', special_case)
assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
- assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']
+ # Pronoun lemma is returned as -PRON-!
+ assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that']
p
+ | For details on spaCy's custom pronoun lemma #[code -PRON-],
+ | #[+a("/usage/#pron-lemma") see here].
| The special case doesn't have to match an entire whitespace-delimited
| substring. The tokenizer will incrementally split off punctuation, and
| keep looking up the remaining substring:
@@ -97,7 +95,7 @@ p
| #[+api("language") #[code Language]] class itself.
-+h(2, "how-tokenizer-works") How spaCy's tokenizer works
++h(3, "how-tokenizer-works") How spaCy's tokenizer works
p
| spaCy introduces a novel tokenization algorithm, that gives a better
@@ -113,8 +111,8 @@ p
| algorithm in Python, optimized for readability rather than performance:
+code.
- def tokenizer_pseudo_code(text, find_prefix, find_suffix,
- find_infixes, special_cases):
+ def tokenizer_pseudo_code(text, special_cases,
+ find_prefix, find_suffix, find_infixes):
tokens = []
for substring in text.split(' '):
suffixes = []
@@ -162,11 +160,11 @@ p
| like hyphens etc.
+item Once we can't consume any more of the string, handle it as a single token.
-+h(2, "native-tokenizers") Customizing spaCy's Tokenizer class
++h(3, "native-tokenizers") Customizing spaCy's Tokenizer class
p
| Let's imagine you wanted to create a tokenizer for a new language or
- | specific domain. There are four things you would need to define:
+ | specific domain. There are five things you would need to define:
+list("numbers")
+item
@@ -188,6 +186,11 @@ p
| A function #[code infixes_finditer], to handle non-whitespace
| separators, such as hyphens etc.
+ +item
+ | An optional boolean function #[code token_match] matching strings
+ | that should never be split, overriding the previous rules.
+ | Useful for things like URLs or numbers.
+
p
| You shouldn't usually need to create a #[code Tokenizer] subclass.
| Standard usage is to use #[code re.compile()] to build a regular
@@ -200,10 +203,14 @@ p
prefix_re = re.compile(r'''[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']''')
+ infix_re = re.compile(r'''[-~]''')
+ simple_url_re = re.compile(r'''^https?://''')
def custom_tokenizer(nlp):
return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
- suffix_search=suffix_re.search)
+ suffix_search=suffix_re.search,
+ infix_finditer=infix_re.finditer,
+ token_match=simple_url_re.match)
nlp = spacy.load('en')
nlp.tokenizer = custom_tokenizer(nlp)
@@ -213,7 +220,7 @@ p
| specialize are #[code find_prefix], #[code find_suffix] and
| #[code find_infix].
-+h(2, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
++h(3, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
p
| The tokenizer is the first component of the processing pipeline and the
@@ -222,11 +229,8 @@ p
| it takes a text and returns a #[code Doc], whereas all other components
| expect to already receive a tokenized #[code Doc].
-+image
- include ../../assets/img/docs/pipeline.svg
- .u-text-right
- +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic
-
++graphic("/assets/img/pipeline.svg")
+ include ../../assets/img/pipeline.svg
p
| To overwrite the existing tokenizer, you need to replace
@@ -243,7 +247,7 @@ p
+cell unicode
+cell The raw text to tokenize.
- +footrow
+ +row("foot")
+cell returns
+cell #[code Doc]
+cell The tokenized document.
@@ -295,3 +299,36 @@ p
+code.
nlp = spacy.load('en')
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
+
++h(3, "own-annotations") Bringing your own annotations
+
+p
+  | spaCy generally assumes that your data is raw text. However,
+ | sometimes your data is partially annotated, e.g. with pre-existing
+ | tokenization, part-of-speech tags, etc. The most common situation is
+ | that you have pre-defined tokenization. If you have a list of strings,
+ | you can create a #[code Doc] object directly. Optionally, you can also
+ | specify a list of boolean values, indicating whether each word has a
+ | subsequent space.
+
++code.
+ doc = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False])
+
+p
+ | If provided, the spaces list must be the same length as the words list.
+ | The spaces list affects the #[code doc.text], #[code span.text],
+ | #[code token.idx], #[code span.start_char] and #[code span.end_char]
+ | attributes. If you don't provide a #[code spaces] sequence, spaCy will
+ | assume that all words are whitespace delimited.
+
++code.
+ good_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False])
+ bad_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'])
+ assert bad_spaces.text == u'Hello , world !'
+ assert good_spaces.text == u'Hello, world!'
+
+p
+ | Once you have a #[+api("doc") #[code Doc]] object, you can write to its
+ | attributes to set the part-of-speech tags, syntactic dependencies, named
+ | entities and other attributes. For details, see the respective usage
+ | pages.
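+
+p
+  | As a small sketch (the tokens and entity label here are made up), you
+  | could mark a span as a named entity by writing to #[code doc.ents]:
+
++code.
+  from spacy.tokens import Doc, Span
+  doc = Doc(nlp.vocab, words=[u'I', u'live', u'in', u'Berlin'])
+  GPE = doc.vocab.strings[u'GPE']            # hash of the entity label
+  doc.ents = [Span(doc, 3, 4, label=GPE)]    # mark "Berlin" as an entity
+  assert doc.ents[0].text == u'Berlin'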
diff --git a/website/usage/_models/_available-models.jade b/website/usage/_models/_available-models.jade
new file mode 100644
index 000000000..b4fa1fc90
--- /dev/null
+++ b/website/usage/_models/_available-models.jade
@@ -0,0 +1,22 @@
+//- π« DOCS > USAGE > MODELS > AVAILABE MODELS
+
+p
+ | Model differences are mostly statistical. In general, we do expect larger
+ | models to be "better" and more accurate overall. Ultimately, it depends on
+ | your use case and requirements, and we recommend starting with the default
+ | models (marked with a star below). For a more detailed overview, see the
+ | #[+a("/models") models directory].
+
++table(["Name", "Language", "Type"])
+ for models, lang in MODELS
+ for model, i in models
+ - var comps = getModelComponents(model)
+ +row
+ +cell #[+a("/models/" + lang + "#" + model) #[code=model]]
+ if i == 0
+ +icon("star", 16).o-icon--inline.u-color-theme
+ +cell #{LANGUAGES[comps.lang]}
+ +cell #{MODEL_META[comps.type]}
+
+.u-text-right
+ +button("/models", true, "primary", "small") View models directory
diff --git a/website/usage/_models/_install-basics.jade b/website/usage/_models/_install-basics.jade
new file mode 100644
index 000000000..a8029cc10
--- /dev/null
+++ b/website/usage/_models/_install-basics.jade
@@ -0,0 +1,33 @@
+//- π« DOCS > USAGE > MODELS > INSTALLATION BASICS
+
+p
+ | The easiest way to download a model is via spaCy's
+ | #[+api("cli#download") #[code download]] command. It takes care of
+ | finding the best-matching model compatible with your spaCy installation.
+
+- var models = Object.keys(MODELS).map(function(lang) { return "spacy download " + lang })
++code(false, "bash").
+ # out-of-the-box: download best-matching default model
+ #{Object.keys(MODELS).map(function(l) {return "spacy download " + l}).join('\n')}
+
+ # download best-matching version of specific model for your spaCy installation
+ spacy download en_core_web_sm
+
+ # download exact model version (doesn't create shortcut link)
+ spacy download en_core_web_sm-2.0.0 --direct
+
+p
+ | The download command will #[+a("/usage/models#download-pip") install the model] via
+ | pip, place the package in your #[code site-packages] directory and create
+ | a #[+a("/usage/models#usage") shortcut link] that lets you load the model by a custom
+ | name. The shortcut link will be the same as the model name used in
+ | #[code spacy download].
+
++code(false, "bash").
+ pip install spacy
+ spacy download en
+
++code.
+ import spacy
+ nlp = spacy.load('en')
+ doc = nlp(u'This is a sentence.')
diff --git a/website/docs/usage/models.jade b/website/usage/_models/_install.jade
similarity index 67%
rename from website/docs/usage/models.jade
rename to website/usage/_models/_install.jade
index 7421e8aad..1d15199a2 100644
--- a/website/docs/usage/models.jade
+++ b/website/usage/_models/_install.jade
@@ -1,38 +1,4 @@
-//- π« DOCS > USAGE > MODELS
-
-include ../../_includes/_mixins
-
-p
- | As of v1.7.0, models for spaCy can be installed as #[strong Python packages].
- | This means that they're a component of your application, just like any
- | other module. They're versioned and can be defined as a dependency in your
- | #[code requirements.txt]. Models can be installed from a download URL or
- | a local directory, manually or via #[+a("https://pypi.python.org/pypi/pip") pip].
- | Their data can be located anywhere on your file system.
-
-+aside("Important note")
- | If you're upgrading to spaCy v1.7.x or v2.x, you need to
- | #[strong download the new models]. If you've trained statistical models
- | that use spaCy's annotations, you should #[strong retrain your models]
- | after updating spaCy. If you don't retrain, you may suffer train/test
- | skew, which might decrease your accuracy.
-
-+quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.")
- for models, lang in MODELS
- - var package = (models.length == 1) ? models[0] : models.find(function(m) { return m.def })
- +qs({lang: lang}) spacy download #{lang}
- +qs({lang: lang}, "divider")
- +qs({lang: lang, load: "module"}, "python") import #{package.id}
- +qs({lang: lang, load: "module"}, "python") nlp = #{package.id}.load()
- +qs({lang: lang, load: "spacy"}, "python") nlp = spacy.load('#{lang}')
- +qs({lang: lang, config: "example"}, "python") doc = nlp(u"#{EXAMPLE_SENTENCES[lang]}")
- +qs({lang: lang, config: "example"}, "python") print([(w.text, w.pos_) for w in doc])
-
-+h(2, "available") Available models
-
-include _models-list
-
-+h(2, "download") Downloading models
+//- π« DOCS > USAGE > MODELS > INSTALLATION
+aside("Downloading models in spaCy < v1.7")
| In older versions of spaCy, you can still use the old download commands.
@@ -47,37 +13,8 @@ include _models-list
| The old models are also #[+a(gh("spacy") + "/tree/v1.6.0") attached to the v1.6.0 release].
| To download and install them manually, unpack the archive, drop the
| contained directory into #[code spacy/data].
-p
- | The easiest way to download a model is via spaCy's
- | #[+api("cli#download") #[code download]] command. It takes care of
- | finding the best-matching model compatible with your spaCy installation.
-- var models = Object.keys(MODELS).map(function(lang) { return "spacy download " + lang })
-+code(false, "bash").
- # out-of-the-box: download best-matching default model
- #{Object.keys(MODELS).map(function(l) {return "spacy download " + l}).join('\n')}
-
- # download best-matching version of specific model for your spaCy installation
- spacy download en_core_web_md
-
- # download exact model version (doesn't create shortcut link)
- spacy download en_core_web_md-1.2.0 --direct
-
-p
- | The download command will #[+a("#download-pip") install the model] via
- | pip, place the package in your #[code site-packages] directory and create
- | a #[+a("#usage") shortcut link] that lets you load the model by a custom
- | name. The shortcut link will be the same as the model name used in
- | #[code spacy.download].
-
-+code(false, "bash").
- pip install spacy
- spacy download en
-
-+code.
- import spacy
- nlp = spacy.load('en')
- doc = nlp(u'This is a sentence.')
+include _install-basics
+h(3, "download-pip") Installation via pip
@@ -107,8 +44,8 @@ p
+infobox
| You can also add the direct download link to your application's
| #[code requirements.txt]. For more details,
- | see the usage guide on
- | #[+a("/docs/usage/production-use#models") working with models in production].
+ | see the section on
+ | #[+a("/models/#production") working with models in production].
+h(3, "download-manual") Manual download and installation
@@ -135,7 +72,7 @@ p
| local file system. To use it with spaCy, simply assign it a name by
| creating a #[+a("#usage") shortcut link] for the data directory.
-+h(2, "usage") Using models with spaCy
++h(3, "usage") Using models with spaCy
p
| To load a model, use #[+api("spacy#load") #[code spacy.load()]] with the
@@ -201,7 +138,7 @@ p
| privileges, the #[code spacy link] command may fail. The easiest solution
| is to re-run the command as admin, or use a #[code virtualenv]. For more
| info on this, see the
- | #[+a("/docs/usage/#symlink-privilege") troubleshooting guide].
+ | #[+a("/usage/#symlink-privilege") troubleshooting guide].
+h(3, "usage-import") Importing models as modules
@@ -227,15 +164,15 @@ p
| #[code spacy.load()].
+infobox
- | For more details, see the usage guide on
- | #[+a("/docs/usage/production-use#models") working with models in production].
+ | For more details, see the section on
+ | #[+a("/models/#production") working with models in production].
-+h(2, "own-models") Using your own models
++h(3, "own-models") Using your own models
p
| If you've trained your own model, for example for
- | #[+a("/docs/usage/adding-languages") additional languages] or
- | #[+a("/docs/usage/train-ner") custom named entities], you can save its
+ | #[+a("/usage/adding-languages") additional languages] or
+ | #[+a("/usage/training#ner") custom named entities], you can save its
| state using the #[+api("language#to_disk") #[code Language.to_disk()]]
| method. To make the model more convenient to deploy, we recommend
| wrapping it as a Python package.
@@ -243,4 +180,4 @@ p
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
- | #[+a("/docs/usage/saving-loading#models") saving and loading models].
+ | #[+a("/usage/training#saving-loading") saving and loading models].
diff --git a/website/usage/_models/_languages.jade b/website/usage/_models/_languages.jade
new file mode 100644
index 000000000..abdad01ad
--- /dev/null
+++ b/website/usage/_models/_languages.jade
@@ -0,0 +1,72 @@
+//- π« DOCS > USAGE > MODELS > LANGUAGE SUPPORT
+
+p spaCy currently provides models for the following languages:
+
++table(["Language", "Code", "Language data", "Models"])
+ for models, code in MODELS
+ - var count = Object.keys(models).length
+ +row
+ +cell=LANGUAGES[code]
+ +cell #[code=code]
+ +cell
+ +src(gh("spaCy", "spacy/lang/" + code)) #[code lang/#{code}]
+ +cell
+ +a("/models/" + code) #{count} #{(count == 1) ? "model" : "models"}
+
++h(3, "alpha-support") Alpha tokenization support
+
+p
+ | Work has started on the following languages. You can help by
+ | #[+a("/usage/adding-languages#language-data") improving the existing language data]
+ | and extending the tokenization patterns.
+
++aside("Usage note")
+ | Note that the alpha languages don't yet come with a language model. In
+ | order to use them, you have to import them directly, or use
+ | #[+api("spacy#blank") #[code spacy.blank]]:
+
+ +code.o-no-block.
+ from spacy.lang.fi import Finnish
+ nlp = Finnish() # use directly
+ nlp = spacy.blank('fi') # blank instance
+
++table(["Language", "Code", "Language data"])
+ for lang, code in LANGUAGES
+ if !Object.keys(MODELS).includes(code)
+ +row
+ +cell #{LANGUAGES[code]}
+ +cell #[code=code]
+ +cell
+ +src(gh("spaCy", "spacy/lang/" + code)) #[code lang/#{code}]
+
++infobox("Dependencies")
+ | Some language tokenizers require external dependencies. To use #[strong Chinese],
+ | you need to have #[+a("https://github.com/fxsjy/jieba") Jieba] installed.
+ | The #[strong Japanese] tokenizer requires
+ | #[+a("https://github.com/mocobeta/janome") Janome].
+
++h(3, "multi-language") Multi-language support
+ +tag-new(2)
+
+p
+ | As of v2.0, spaCy supports models trained on more than one language. This
+ | is especially useful for named entity recognition. The language ID used
+ | for multi-language or language-neutral models is #[code xx]. The
+ | language class, a generic subclass containing only the base language data,
+ | can be found in #[+src(gh("spaCy", "spacy/lang/xx")) #[code lang/xx]].
+
+p
+ | To load your model with the neutral, multi-language class, simply set
+    | #[code "lang": "xx"] in your
+ | #[+a("/usage/training#models-generating") model package]'s
+ | meta.json. You can also import the class directly, or call
+ | #[+api("util#get_lang_class") #[code util.get_lang_class()]] for
+ | lazy-loading.
+
++code("Standard import").
+ from spacy.lang.xx import MultiLanguage
+ nlp = MultiLanguage()
+
++code("With lazy-loading").
+ from spacy.util import get_lang_class
+ nlp = get_lang_class('xx')
diff --git a/website/usage/_models/_production.jade b/website/usage/_models/_production.jade
new file mode 100644
index 000000000..43f4b1ba9
--- /dev/null
+++ b/website/usage/_models/_production.jade
@@ -0,0 +1,81 @@
+//- π« DOCS > USAGE > MODELS > PRODUCTION USE
+
+p
+ | If your application depends on one or more models,
+ | you'll usually want to integrate them into your continuous integration
+ | workflow and build process. While spaCy provides a range of useful helpers
+ | for downloading, linking and loading models, the underlying functionality
+ | is entirely based on native Python packages. This allows your application
+ | to handle a model like any other package dependency.
+
++infobox("Training models for production")
+ | For an example of an automated model training and build process, see
+ | #[+a("/usage/training#example-training-spacy") this example] of how
+ | we're training and packaging our models for spaCy.
+
++h(3, "models-download") Downloading and requiring model dependencies
+
+p
+ | spaCy's built-in #[+api("cli#download") #[code download]] command
+ | is mostly intended as a convenient, interactive wrapper. It performs
+ | compatibility checks and prints detailed error messages and warnings.
+ | However, if you're downloading models as part of an automated build
+ | process, this only adds an unnecessary layer of complexity. If you know
+ | which models your application needs, you should be specifying them directly.
+
+p
+ | Because all models are valid Python packages, you can add them to your
+ | application's #[code requirements.txt]. If you're running your own
+    | internal PyPI server, you can simply upload the models there. pip's
+    | #[+a("https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format") requirements file format]
+    | supports both package names to download via a PyPI server, and direct
+    | URLs.
+
++code("requirements.txt", "text").
+ spacy>=2.0.0,<3.0.0
+    -e #{gh("spacy-models")}/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm
+
+p
+ | Specifying #[code #egg=] with the package name tells pip
+ | which package to expect from the download URL. This way, the
+ | package won't be re-downloaded and overwritten if it's already
+    | installed - just like when you're downloading a package from PyPI.
+
+p
+ | All models are versioned and specify their spaCy dependency. This ensures
+ | cross-compatibility and lets you specify exact version requirements for
+ | each model. If you've trained your own model, you can use the
+ | #[+api("cli#package") #[code package]] command to generate the required
+ | meta data and turn it into a loadable package.
+
+
++h(3, "models-loading") Loading and testing models
+
+p
+    | Downloading models directly via pip won't call spaCy's
+    | #[+api("cli#link") #[code link]] command, which creates
+ | symlinks for model shortcuts. This means that you'll have to run this
+ | command separately, or use the native #[code import] syntax to load the
+ | models:
+
++code.
+ import en_core_web_sm
+ nlp = en_core_web_sm.load()
+
+p
+ | In general, this approach is recommended for larger code bases, as it's
+ | more "native", and doesn't depend on symlinks or rely on spaCy's loader
+ | to resolve string names to model packages. If a model can't be
+ | imported, Python will raise an #[code ImportError] immediately. And if a
+ | model is imported but not used, any linter will catch that.
+
+p
+ | Similarly, it'll give you more flexibility when writing tests that
+ | require loading models. For example, instead of writing your own
+ | #[code try] and #[code except] logic around spaCy's loader, you can use
+ | #[+a("http://pytest.readthedocs.io/en/latest/") pytest]'s
+ | #[+a("https://docs.pytest.org/en/latest/builtin.html#_pytest.outcomes.importorskip") #[code importorskip()]]
+ | method to only run a test if a specific model or model version is
+ | installed. Each model package exposes a #[code __version__] attribute
+ | which you can also use to perform your own version compatibility checks
+ | before loading a model.
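+
+p
+    | As a rough sketch of this approach, the test below uses the
+    | #[code en_core_web_sm] package from the examples above. The test name
+    | and assertions are only illustrative; adjust the model name and
+    | minimum version to your own requirements.
+
++code("Model test sketch with pytest").
+    import pytest
+
+    def test_load_model():
+        # skip this test if en_core_web_sm (>= 2.0.0) isn't installed
+        en_core_web_sm = pytest.importorskip('en_core_web_sm', minversion='2.0.0')
+        assert en_core_web_sm.__version__  # every model package exposes this
+        nlp = en_core_web_sm.load()
+        doc = nlp(u'This is a sentence.')
+        assert doc[0].text == u'This'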
diff --git a/website/usage/_models/_quickstart.jade b/website/usage/_models/_quickstart.jade
new file mode 100644
index 000000000..c8f702cb4
--- /dev/null
+++ b/website/usage/_models/_quickstart.jade
@@ -0,0 +1,17 @@
+//- π« DOCS > USAGE > MODELS > QUICKSTART
+
+- QUICKSTART_MODELS[0].options = Object.keys(MODELS).map(m => ({ id: m, title: LANGUAGES[m], checked: m == 'en'}))
++quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.")
+ for models, lang in MODELS
+ - var package = models[0]
+ +qs({lang: lang}) spacy download #{lang}
+ +qs({lang: lang}, "divider")
+ +qs({lang: lang, load: "module"}, "python") import #{package}
+ +qs({lang: lang, load: "module"}, "python") nlp = #{package}.load()
+ +qs({lang: lang, load: "spacy"}, "python") nlp = spacy.load('#{lang}')
+ +qs({lang: lang, config: "example"}, "python") doc = nlp(u"#{EXAMPLE_SENTENCES[lang]}")
+
+ if lang != "xx"
+ +qs({lang: lang, config: "example"}, "python") print([(w.text, w.pos_) for w in doc])
+ else
+            +qs({lang: lang, config: "example"}, "python") print([(ent.text, ent.label_) for ent in doc.ents])
diff --git a/website/usage/_processing-pipelines/_examples.jade b/website/usage/_processing-pipelines/_examples.jade
new file mode 100644
index 000000000..616bed32c
--- /dev/null
+++ b/website/usage/_processing-pipelines/_examples.jade
@@ -0,0 +1,126 @@
+//- π« DOCS > USAGE > PROCESSING PIPELINES > EXAMPLES
+
+p
+ | To see real-world examples of pipeline factories and components in action,
+ | you can have a look at the source of spaCy's built-in components, e.g.
+ | the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or
+    | #[+api("entityrecognizer") #[code EntityRecognizer]].
+
++h(3, "example1") Example: Custom sentence segmentation logic
+
+p
+ | Let's say you want to implement custom logic to improve spaCy's sentence
+ | boundary detection. Currently, sentence segmentation is based on the
+ | dependency parse, which doesn't always produce ideal results. The custom
+ | logic should therefore be applied #[strong after] tokenization, but
+ | #[strong before] the dependency parsing β this way, the parser can also
+ | take advantage of the sentence boundaries.
+
++code.
+ def sbd_component(doc):
+ for i, token in enumerate(doc[:-2]):
+ # define sentence start if period + titlecase token
+ if token.text == '.' and doc[i+1].is_title:
+ doc[i+1].sent_start = True
+ return doc
+
+p
+ | In this case, we simply want to add the component to the existing
+ | pipeline of the English model. We can do this by inserting it at index 0
+ | of #[code nlp.pipeline]:
+
++code.
+ nlp = spacy.load('en')
+ nlp.pipeline.insert(0, sbd_component)
+
+p
+ | When you call #[code nlp] on some text, spaCy will tokenize it to create
+ | a #[code Doc] object, and first call #[code sbd_component] on it, followed
+ | by the model's default pipeline.
+
++h(3, "example2") Example: Sentiment model
+
+p
+ | Let's say you have trained your own document sentiment model on English
+ | text. After tokenization, you want spaCy to first execute the
+ | #[strong default tensorizer], followed by a custom
+ | #[strong sentiment component] that adds a #[code .sentiment]
+    | property to the #[code Doc], containing your model's sentiment prediction.
+
+p
+ | Your component class will have a #[code from_disk()] method that spaCy
+ | calls to load the model data. When called, the component will compute
+ | the sentiment score, add it to the #[code Doc] and return the modified
+ | document. Optionally, the component can include an #[code update()] method
+ | to allow training the model.
+
++code.
+ import pickle
+ from pathlib import Path
+
+ class SentimentComponent(object):
+ def __init__(self, vocab):
+ self.weights = None
+
+ def __call__(self, doc):
+ doc.sentiment = sum(self.weights*doc.vector) # set sentiment property
+ return doc
+
+ def from_disk(self, path): # path = model path + factory ID ('sentiment')
+            self.weights = pickle.load((Path(path) / 'weights.bin').open('rb')) # load weights
+ return self
+
+ def update(self, doc, gold): # update weights β allows training!
+ prediction = sum(self.weights*doc.vector)
+ self.weights -= 0.001*doc.vector*(prediction-gold.sentiment)
+
+p
+ | The factory will initialise the component with the #[code Vocab] object.
+ | To be able to add it to your model's pipeline as #[code 'sentiment'],
+ | it also needs to be registered via
+ | #[+api("spacy#set_factory") #[code set_factory()]].
+
++code.
+ def sentiment_factory(vocab):
+ component = SentimentComponent(vocab) # initialise component
+ return component
+
+ spacy.set_factory('sentiment', sentiment_factory)
+
+p
+ | The above code should be #[strong shipped with your model]. You can use
+ | the #[+api("cli#package") #[code package]] command to create all required
+ | files and directories. The model package will include an
+ | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]]
+    | with a #[code load()] method that initialises the language class with
+    | the model's pipeline and calls the #[code from_disk()] method to load
+    | the model data.
+
+p
+ | In the model package's meta.json, specify the language class and pipeline
+ | IDs:
+
++code("meta.json (excerpt)", "json").
+ {
+ "name": "sentiment_model",
+ "lang": "en",
+ "version": "1.0.0",
+ "spacy_version": ">=2.0.0,<3.0.0",
+ "pipeline": ["tensorizer", "sentiment"]
+ }
+
+p
+ | When you load your new model, spaCy will call the model's #[code load()]
+ | method. This will return a #[code Language] object with a pipeline
+ | containing the default tensorizer, and the sentiment component returned
+ | by your custom #[code "sentiment"] factory.
+
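+p
+    | As a rough sketch (not the actual generated template), the
+    | #[code load()] method boils down to three steps: look up the language
+    | class, initialise it with the pipeline from the meta data, and load
+    | the binary data from the package directory. It assumes the
+    | #[code 'sentiment'] factory has already been registered, as shown
+    | above.
+
++code("Simplified load() sketch").
+    from pathlib import Path
+    import spacy
+
+    def load():
+        lang = 'en'                            # from meta.json
+        pipeline = ['tensorizer', 'sentiment'] # from meta.json
+        cls = spacy.util.get_lang_class(lang)  # spacy.lang.en.English
+        nlp = cls(pipeline=pipeline)           # resolves the 'sentiment' factory
+        nlp.from_disk(Path(__file__).parent)   # load the binary model data
+        return nlp
+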
++code.
+ nlp = spacy.load('en_sentiment_model')
+ doc = nlp(u'I love pizza')
+ assert doc.sentiment
+
++infobox("Saving and loading models")
+ | For more information and a detailed guide on how to package your model,
+ | see the documentation on
+ | #[+a("/usage/training#saving-loading") saving and loading models].
diff --git a/website/usage/_processing-pipelines/_multithreading.jade b/website/usage/_processing-pipelines/_multithreading.jade
new file mode 100644
index 000000000..1e08508b8
--- /dev/null
+++ b/website/usage/_processing-pipelines/_multithreading.jade
@@ -0,0 +1,40 @@
+//- π« DOCS > USAGE > PROCESSING PIPELINES > MULTI-THREADING
+
+p
+ | If you have a sequence of documents to process, you should use the
+ | #[+api("language#pipe") #[code Language.pipe()]] method. The method takes
+ | an iterator of texts, and accumulates an internal buffer,
+ | which it works on in parallel. It then yields the documents in order,
+ | one-by-one. After a long and bitter struggle, the global interpreter
+ | lock was freed around spaCy's main parsing loop in v0.100.3. This means
+ | that #[code .pipe()] will be significantly faster in most
+ | practical situations, because it allows shared memory parallelism.
+
++code.
+ for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
+ pass
+
+p
+ | To make full use of the #[code .pipe()] function, you might want to
+ | brush up on #[strong Python generators]. Here are a few quick hints:
+
++list
+ +item
+ | Generator comprehensions can be written as
+ | #[code (item for item in sequence)].
+
+ +item
+ | The
+ | #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library]
+ | and the
+ | #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package]
+ | provide a lot of handy #[strong generator tools].
+
+ +item
+ | Often you'll have an input stream that pairs text with some
+ | important meta data, e.g. a JSON document. To
+ | #[strong pair up the meta data] with the processed #[code Doc]
+ | object, you should use the #[code itertools.tee] function to split
+        | the generator in two, and then #[code zip] (or #[code izip] on
+        | Python 2) the extra stream to the document stream. Here's
+        | #[+a(gh("spacy") + "/issues/172#issuecomment-183963403") an example],
+        | and a minimal sketch is included below.
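+
+p
+    | Here's a rough sketch of that pattern. It assumes #[code records] is a
+    | generator of dicts with #[code 'text'] and #[code 'meta'] keys; adapt
+    | the field names to your own input stream.
+
++code.
+    import itertools
+
+    stream1, stream2 = itertools.tee(records)  # split the input stream in two
+    texts = (rec['text'] for rec in stream1)
+    metadata = (rec['meta'] for rec in stream2)
+
+    # zip is lazy in Python 3 (use itertools.izip on Python 2)
+    for doc, meta in zip(nlp.pipe(texts, batch_size=10000, n_threads=3), metadata):
+        print(doc.is_parsed, meta)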
diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/usage/_processing-pipelines/_pipelines.jade
similarity index 56%
rename from website/docs/usage/language-processing-pipeline.jade
rename to website/usage/_processing-pipelines/_pipelines.jade
index 03f6c28f5..d09ed4ead 100644
--- a/website/docs/usage/language-processing-pipeline.jade
+++ b/website/usage/_processing-pipelines/_pipelines.jade
@@ -1,12 +1,4 @@
-//- π« DOCS > USAGE > PIPELINE
-
-include ../../_includes/_mixins
-
-+h(2, "101") Pipelines 101
-
-include _spacy-101/_pipelines
-
-+h(2, "pipelines") How pipelines work
+//- π« DOCS > USAGE > PROCESSING PIPELINES > PIPELINES
p
| spaCy makes it very easy to create your own pipelines consisting of
@@ -15,11 +7,11 @@ p
| functions. A pipeline component can be added to an already existing
| #[code nlp] object, specified when initialising a #[code Language] class,
| or defined within a
- | #[+a("/docs/usage/saving-loading#models-generating") model package].
+ | #[+a("/usage/saving-loading#models-generating") model package].
p
| When you load a model, spaCy first consults the model's
- | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
+ | #[+a("/usage/saving-loading#models-generating") meta.json]. The
| meta typically includes the model details, the ID of a language class,
| and an optional list of pipeline components. spaCy then does the
| following:
@@ -29,7 +21,7 @@ p
"name": "example_model",
"lang": "en"
"description": "Example model for spaCy",
- "pipeline": ["token_vectors", "tagger"]
+ "pipeline": ["tensorizer", "tagger"]
}
+list("numbers")
@@ -56,24 +48,50 @@ p
p
| ... the model tells spaCy to use the pipeline
- | #[code ["tensorizer", "tagger", "parser", "ner"]]. spaCy will then look
- | up each string in its internal factories registry and initialise the
- | individual components. It'll then load #[code spacy.lang.en.English],
- | pass it the path to the model's data directory, and return it for you
- | to use as the #[code nlp] object.
+ | #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will
+ | then look up each string in its internal factories registry and
+ | initialise the individual components. It'll then load
+ | #[code spacy.lang.en.English], pass it the path to the model's data
+ | directory, and return it for you to use as the #[code nlp] object.
+
+p
+ | Fundamentally, a #[+a("/models") spaCy model] consists of three
+ | components: #[strong the weights], i.e. binary data loaded in from a
+ | directory, a #[strong pipeline] of functions called in order,
+ | and #[strong language data] like the tokenization rules and annotation
+ | scheme. All of this is specific to each model, and defined in the
+ | model's #[code meta.json] β for example, a Spanish NER model requires
+ | different weights, language data and pipeline components than an English
+ | parsing and tagging model. This is also why the pipeline state is always
+ | held by the #[code Language] class.
+ | #[+api("spacy#load") #[code spacy.load]] puts this all together and
+ | returns an instance of #[code Language] with a pipeline set and access
+ | to the binary data:
+
++code("spacy.load under the hood").
+ lang = 'en'
+ pipeline = ['tensorizer', 'tagger', 'parser', 'ner']
+ data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0'
+
+    cls = spacy.util.get_lang_class(lang) # 1. get Language class, e.g. English
+    nlp = cls(pipeline=pipeline) # 2. initialise it with the pipeline
+    nlp.from_disk(data_path) # 3. load in the binary data
p
| When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and
| then #[strong call each component] on the #[code Doc], in order.
- | Components all return the modified document, which is then processed by
- | the component next in the pipeline.
+ | Since the model data is loaded, the components can access it to assign
+ | annotations to the #[code Doc] object, and subsequently to the
+ | #[code Token] and #[code Span] which are only views of the #[code Doc],
+ | and don't own any data themselves. All components return the modified
+ | document, which is then processed by the component next in the pipeline.
+code("The pipeline under the hood").
doc = nlp.make_doc(u'This is a sentence')
for proc in nlp.pipeline:
doc = proc(doc)
-+h(2, "creating") Creating pipeline components and factories
++h(3, "creating") Creating pipeline components and factories
p
| spaCy lets you customise the pipeline with your own components. Components
@@ -82,7 +100,7 @@ p
| pipeline. You can do that by defining and registering a factory which
| receives the shared #[code Vocab] object and returns a component.
-+h(3, "creating-component") Creating a component
++h(4, "creating-component") Creating a component
p
| A component receives a #[code Doc] object and
@@ -103,7 +121,7 @@ p
+cell #[code Doc]
+cell The #[code Doc] object processed by the previous component.
- +footrow
+ +row("foot")
+cell returns
+cell #[code Doc]
+cell The #[code Doc] object processed by this pipeline component.
@@ -123,7 +141,7 @@ p
nlp = spacy.load('en')
nlp.pipeline.append(my_component)
-+h(3, "creating-factory") Creating a factory
++h(4, "creating-factory") Creating a factory
p
| A factory is a #[strong function that returns a pipeline component].
@@ -149,7 +167,7 @@ p
| Shared data between components, including strings, morphology,
| vectors etc.
- +footrow
+ +row("foot")
+cell returns
+cell callable
+cell The pipeline component.
@@ -171,148 +189,22 @@ p
| by looking it up in the available factories. The factory will then be
| initialised with the #[code Vocab]. Providing factory names instead of
| callables also makes it easy to specify them in the model's
- | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. If you're
+ | #[+a("/usage/saving-loading#models-generating") meta.json]. If you're
| training your own model and want to use one of spaCy's default components,
| you won't have to worry about finding and implementing it either β to use
| the default tagger, simply add #[code "tagger"] to the pipeline, and
| #[strong spaCy will know what to do].
-
+infobox("Important note")
| Because factories are #[strong resolved on initialisation] of the
| #[code Language] class, it's #[strong not possible] to add them to the
| pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only
| works with individual component functions. To use factories, you need to
| create a new #[code Language] object, or generate a
- | #[+a("/docs/usage/saving-loading#models-generating") model package] with
+ | #[+a("/usage/training#models-generating") model package] with
| a custom pipeline.
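+
+p
+    | As a rough sketch, assuming a factory registered under the hypothetical
+    | name #[code 'my_factory'], this means passing the factory names to a
+    | fresh #[code Language] subclass instead of appending to
+    | #[code nlp.pipeline] afterwards:
+
++code.
+    from spacy.lang.en import English
+
+    # factories are resolved when the class is initialised, so the names
+    # have to be passed in here rather than added to nlp.pipeline later
+    nlp = English(pipeline=['tensorizer', 'tagger', 'my_factory'])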
-+aside("Real-world examples")
- | To see real-world examples of pipeline factories and components in action,
- | you can have a look at the source of spaCy's built-in components, e.g.
- | the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or
- | #[+api("entityrecognizer") #[code EntityRecongnizer]].
-
-+h(2, "example1") Example: Custom sentence segmentation logic
-
-p
- | Let's say you want to implement custom logic to improve spaCy's sentence
- | boundary detection. Currently, sentence segmentation is based on the
- | dependency parse, which doesn't always produce ideal results. The custom
- | logic should therefore be applied #[strong after] tokenization, but
- | #[strong before] the dependency parsing β this way, the parser can also
- | take advantage of the sentence boundaries.
-
-+code.
- def sbd_component(doc):
- for i, token in enumerate(doc[:-2]):
- # define sentence start if period + titlecase token
- if token.text == '.' and doc[i+1].is_title:
- doc[i+1].sent_start = True
- return doc
-
-p
- | In this case, we simply want to add the component to the existing
- | pipeline of the English model. We can do this by inserting it at index 0
- | of #[code nlp.pipeline]:
-
-+code.
- nlp = spacy.load('en')
- nlp.pipeline.insert(0, sbd_component)
-
-p
- | When you call #[code nlp] on some text, spaCy will tokenize it to create
- | a #[code Doc] object, and first call #[code sbd_component] on it, followed
- | by the model's default pipeline.
-
-+h(2, "example2") Example: Sentiment model
-
-p
- | Let's say you have trained your own document sentiment model on English
- | text. After tokenization, you want spaCy to first execute the
- | #[strong default tensorizer], followed by a custom
- | #[strong sentiment component] that adds a #[code .sentiment]
- | property to the #[code Doc], containing your model's sentiment precition.
-
-p
- | Your component class will have a #[code from_disk()] method that spaCy
- | calls to load the model data. When called, the component will compute
- | the sentiment score, add it to the #[code Doc] and return the modified
- | document. Optionally, the component can include an #[code update()] method
- | to allow training the model.
-
-+code.
- import pickle
- from pathlib import Path
-
- class SentimentComponent(object):
- def __init__(self, vocab):
- self.weights = None
-
- def __call__(self, doc):
- doc.sentiment = sum(self.weights*doc.vector) # set sentiment property
- return doc
-
- def from_disk(self, path): # path = model path + factory ID ('sentiment')
- self.weights = pickle.load(Path(path) / 'weights.bin') # load weights
- return self
-
- def update(self, doc, gold): # update weights β allows training!
- prediction = sum(self.weights*doc.vector)
- self.weights -= 0.001*doc.vector*(prediction-gold.sentiment)
-
-p
- | The factory will initialise the component with the #[code Vocab] object.
- | To be able to add it to your model's pipeline as #[code 'sentiment'],
- | it also needs to be registered via
- | #[+api("spacy#set_factory") #[code set_factory()]].
-
-+code.
- def sentiment_factory(vocab):
- component = SentimentComponent(vocab) # initialise component
- return component
-
- spacy.set_factory('sentiment', sentiment_factory)
-
-p
- | The above code should be #[strong shipped with your model]. You can use
- | the #[+api("cli#package") #[code package]] command to create all required
- | files and directories. The model package will include an
- | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]
- | with a #[code load()] method, that will initialise the language class with
- | the model's pipeline and call the #[code from_disk()] method to load
- | the model data.
-
-p
- | In the model package's meta.json, specify the language class and pipeline
- | IDs:
-
-+code("meta.json (excerpt)", "json").
- {
- "name": "sentiment_model",
- "lang": "en",
- "version": "1.0.0",
- "spacy_version": ">=2.0.0,<3.0.0",
- "pipeline": ["tensorizer", "sentiment"]
- }
-
-p
- | When you load your new model, spaCy will call the model's #[code load()]
- | method. This will return a #[code Language] object with a pipeline
- | containing the default tensorizer, and the sentiment component returned
- | by your custom #[code "sentiment"] factory.
-
-+code.
- nlp = spacy.load('en_sentiment_model')
- doc = nlp(u'I love pizza')
- assert doc.sentiment
-
-+infobox("Saving and loading models")
- | For more information and a detailed guide on how to package your model,
- | see the documentation on
- | #[+a("/docs/usage/saving-loading#models") saving and loading models].
-
-+h(2, "disabling") Disabling pipeline components
++h(3, "disabling") Disabling pipeline components
p
| If you don't need a particular component of the pipeline β for
diff --git a/website/usage/_processing-pipelines/_serialization.jade b/website/usage/_processing-pipelines/_serialization.jade
new file mode 100644
index 000000000..e29cbc558
--- /dev/null
+++ b/website/usage/_processing-pipelines/_serialization.jade
@@ -0,0 +1,38 @@
+//- π« DOCS > USAGE > PROCESSING PIPELINES > SERIALIZATION
+
+include ../_spacy-101/_serialization
+
++infobox("Important note")
+ | In spaCy v2.0, the API for saving and loading has changed to only use the
+ | four methods listed above consistently across objects and classes. For an
+ | overview of the changes, see #[+a("/usage/v2#incompat") this table]
+ | and the notes on #[+a("/usage/v2#migrating-saving-loading") migrating].
+
++h(3, "example-doc") Example: Saving and loading a document
+
+p
+ | For simplicity, let's assume you've
+ | #[+a("/usage/entity-recognition#setting") added custom entities] to
+ | a #[code Doc], either manually, or by using a
+ | #[+a("/usage/rule-based-matching#on_match") match pattern]. You can
+ | save it locally by calling #[+api("doc#to_disk") #[code Doc.to_disk()]],
+ | and load it again via #[+api("doc#from_disk") #[code Doc.from_disk()]].
+ | This will overwrite the existing object and return it.
+
++code.
+ import spacy
+ from spacy.tokens import Span
+
+ text = u'Netflix is hiring a new VP of global policy'
+
+ nlp = spacy.load('en')
+ doc = nlp(text)
+ assert len(doc.ents) == 0 # Doc has no entities
+    doc.ents += (Span(doc, 0, 1, label=doc.vocab.strings[u'ORG']),) # add entity
+ doc.to_disk('/path/to/doc') # save Doc to disk
+
+ new_doc = nlp(text)
+ assert len(new_doc.ents) == 0 # new Doc has no entities
+    new_doc = new_doc.from_disk('/path/to/doc') # load from disk and overwrite
+ assert len(new_doc.ents) == 1 # entity is now recognised!
+ assert [(ent.text, ent.label_) for ent in new_doc.ents] == [(u'Netflix', u'ORG')]
diff --git a/website/usage/_processing-pipelines/_user-hooks.jade b/website/usage/_processing-pipelines/_user-hooks.jade
new file mode 100644
index 000000000..e7dce53fe
--- /dev/null
+++ b/website/usage/_processing-pipelines/_user-hooks.jade
@@ -0,0 +1,61 @@
+//- π« DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS
+
+p
+    | Hooks let you customise some of the behaviours of the #[code Doc],
+    | #[code Span] or #[code Token] objects by adding a component to the
+    | pipeline. For instance, to customise the
+ | #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
+ | component that sets a custom function to
+ | #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
+ | method will check the #[code user_hooks] dict, and delegate to your
+ | function if you've set one. Similar results can be achieved by setting
+ | functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
+
++code("Polymorphic similarity example").
+ span.similarity(doc)
+ token.similarity(span)
+ doc1.similarity(doc2)
+
+p
+ | By default, this just averages the vectors for each document, and
+ | computes their cosine. Obviously, spaCy should make it easy for you to
+ | install your own similarity model. This introduces a tricky design
+ | challenge. The current solution is to add three more dicts to the
+ | #[code Doc] object:
+
++aside("Implementation note")
+ | The hooks live on the #[code Doc] object because the #[code Span] and
+ | #[code Token] objects are created lazily, and don't own any data. They
+ | just proxy to their parent #[code Doc]. This turns out to be convenient
+ | here β we only have to worry about installing hooks in one place.
+
++table(["Name", "Description"])
+ +row
+ +cell #[code user_hooks]
+ +cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents]
+
+ +row
+ +cell #[code user_token_hooks]
+ +cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts]
+
+ +row
+ +cell #[code user_span_hooks]
+ +cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root]
+
+p
+ | To sum up, here's an example of hooking in custom #[code .similarity()]
+ | methods:
+
++code("Add custom similarity hooks").
+ class SimilarityModel(object):
+ def __init__(self, model):
+ self._model = model
+
+ def __call__(self, doc):
+ doc.user_hooks['similarity'] = self.similarity
+ doc.user_span_hooks['similarity'] = self.similarity
+ doc.user_token_hooks['similarity'] = self.similarity
+
+ def similarity(self, obj1, obj2):
+ y = self._model([obj1.vector, obj2.vector])
+ return float(y[0])
diff --git a/website/docs/usage/_spacy-101/_architecture.jade b/website/usage/_spacy-101/_architecture.jade
similarity index 83%
rename from website/docs/usage/_spacy-101/_architecture.jade
rename to website/usage/_spacy-101/_architecture.jade
index c5a85f0b0..c9b299036 100644
--- a/website/docs/usage/_spacy-101/_architecture.jade
+++ b/website/usage/_spacy-101/_architecture.jade
@@ -20,18 +20,12 @@ p
| returning an #[strong annotated document]. It also orchestrates training
| and serialization.
-+image
- include ../../../assets/img/docs/architecture.svg
- .u-text-right
- +button("/assets/img/docs/architecture.svg", false, "secondary").u-text-tag View large graphic
++graphic("/assets/img/architecture.svg")
+ include ../../assets/img/architecture.svg
+
++h(3, "architecture-containers") Container objects
+table(["Name", "Description"])
- +row
- +cell #[+api("language") #[code Language]]
- +cell
- | A text-processing pipeline. Usually you'll load this once per
- | process as #[code nlp] and pass the instance around your application.
-
+row
+cell #[+api("doc") #[code Doc]]
+cell A container for accessing linguistic annotations.
@@ -53,43 +47,25 @@ p
| opposed to a word token. It therefore has no part-of-speech tag,
| dependency parse etc.
- +row
- +cell #[+api("vocab") #[code Vocab]]
- +cell
- | A lookup table for the vocabulary that allows you to access
- | #[code Lexeme] objects.
-
- +row
- +cell #[code Morphology]
- +cell
- | Assign linguistic features like lemmas, noun case, verb tense etc.
- | based on the word and its part-of-speech tag.
-
- +row
- +cell #[+api("stringstore") #[code StringStore]]
- +cell Map strings to and from hash values.
-
- +row
- +cell #[+api("tokenizer") #[code Tokenizer]]
- +cell
- | Segment text, and create #[code Doc] objects with the discovered
- | segment boundaries.
-
- +row
- +cell #[code Lemmatizer]
- +cell
- | Determine the base forms of words.
-
- +row
- +cell #[+api("matcher") #[code Matcher]]
- +cell
- | Match sequences of tokens, based on pattern rules, similar to
- | regular expressions.
-
-
-+h(3, "architecture-pipeline") Pipeline components
++h(3, "architecture-pipeline") Processing pipeline
+table(["Name", "Description"])
+ +row
+ +cell #[+api("language") #[code Language]]
+ +cell
+ | A text-processing pipeline. Usually you'll load this once per
+ | process as #[code nlp] and pass the instance around your application.
+
+ +row
+ +cell #[+api("pipe") #[code Pipe]]
+ +cell Base class for processing pipeline components.
+
+ +row
+ +cell #[+api("tensorizer") #[code Tensorizer]]
+ +cell
+ | Add tensors with position-sensitive meaning representations to
+ | #[code Doc] objects.
+
+row
+cell #[+api("tagger") #[code Tagger]]
+cell Annotate part-of-speech tags on #[code Doc] objects.
@@ -104,16 +80,54 @@ p
| Annotate named entities, e.g. persons or products, on #[code Doc]
| objects.
+ +row
+ +cell #[+api("textcategorizer") #[code TextCategorizer]]
+        +cell Assign categories or labels to #[code Doc] objects.
+
+ +row
+ +cell #[+api("tokenizer") #[code Tokenizer]]
+ +cell
+ | Segment text, and create #[code Doc] objects with the discovered
+ | segment boundaries.
+
+ +row
+ +cell #[+api("lemmatizer") #[code Lemmatizer]]
+ +cell
+ | Determine the base forms of words.
+
+ +row
+ +cell #[code Morphology]
+ +cell
+ | Assign linguistic features like lemmas, noun case, verb tense etc.
+ | based on the word and its part-of-speech tag.
+
+ +row
+ +cell #[+api("matcher") #[code Matcher]]
+ +cell
+ | Match sequences of tokens, based on pattern rules, similar to
+ | regular expressions.
+
+ +row
+ +cell #[+api("phrasematcher") #[code PhraseMatcher]]
+ +cell Match sequences of tokens based on phrases.
+
+
+h(3, "architecture-other") Other classes
+table(["Name", "Description"])
+row
- +cell #[+api("vectors") #[code Vectors]]
- +cell Container class for vector data keyed by string.
+ +cell #[+api("vocab") #[code Vocab]]
+ +cell
+ | A lookup table for the vocabulary that allows you to access
+ | #[code Lexeme] objects.
+row
- +cell #[+api("binder") #[code Binder]]
- +cell Container class for serializing collections of #[code Doc] objects.
+ +cell #[+api("stringstore") #[code StringStore]]
+ +cell Map strings to and from hash values.
+
+ +row
+ +cell #[+api("vectors") #[code Vectors]]
+ +cell Container class for vector data keyed by string.
+row
+cell #[+api("goldparse") #[code GoldParse]]
@@ -124,3 +138,7 @@ p
+cell
| An annotated corpus, using the JSON file format. Manages
| annotations for tagging, dependency parsing and NER.
+
+ +row
+ +cell #[+api("binder") #[code Binder]]
+ +cell Container class for serializing collections of #[code Doc] objects.
diff --git a/website/usage/_spacy-101/_community-faq.jade b/website/usage/_spacy-101/_community-faq.jade
new file mode 100644
index 000000000..f91248bfd
--- /dev/null
+++ b/website/usage/_spacy-101/_community-faq.jade
@@ -0,0 +1,141 @@
+//- π« DOCS > USAGE > SPACY 101 > COMMUNITY & FAQ
+
+p
+ | We're very happy to see the spaCy community grow and include a mix of
+ | people from all kinds of different backgrounds β computational
+ | linguistics, data science, deep learning, research and more. If you'd
+ | like to get involved, below are some answers to the most important
+ | questions and resources for further reading.
+
++h(3, "faq-help-code") Help, my code isn't working!
+
+p
+ | Bugs suck, and we're doing our best to continuously improve the tests
+ | and fix bugs as soon as possible. Before you submit an issue, do a
+ | quick search and check if the problem has already been reported. If
+ | you're having installation or loading problems, make sure to also check
+ | out the #[+a("/usage/#troubleshooting") troubleshooting guide]. Help
+ | with spaCy is available via the following platforms:
+
++aside("How do I know if something is a bug?")
+ | Of course, it's always hard to know for sure, so don't worry β we're not
+ | going to be mad if a bug report turns out to be a typo in your
+ | code. As a simple rule, any C-level error without a Python traceback,
+ | like a #[strong segmentation fault] or #[strong memory error],
+ | is #[strong always] a spaCy bug.#[br]#[br]
+ | Because models are statistical, their performance will never be
+ | #[em perfect]. However, if you come across
+ | #[strong patterns that might indicate an underlying issue], please do
+ | file a report. Similarly, we also care about behaviours that
+ | #[strong contradict our docs].
+
++table(["Platform", "Purpose"])
+ +row
+ +cell #[+a("https://stackoverflow.com/questions/tagged/spacy") StackOverflow]
+ +cell
+ | #[strong Usage questions] and everything related to problems with
+ | your specific code. The StackOverflow community is much larger
+ | than ours, so if your problem can be solved by others, you'll
+ | receive help much quicker.
+
+ +row
+ +cell #[+a("https://gitter.im/" + SOCIAL.gitter) Gitter chat]
+ +cell
+ | #[strong General discussion] about spaCy, meeting other community
+ | members and exchanging #[strong tips, tricks and best practices].
+ | If we're working on experimental models and features, we usually
+ | share them on Gitter first.
+
+ +row
+ +cell #[+a(gh("spaCy") + "/issues") GitHub issue tracker]
+ +cell
+ | #[strong Bug reports] and #[strong improvement suggestions], i.e.
+ | everything that's likely spaCy's fault. This also includes
+ | problems with the models beyond statistical imprecisions, like
+ | patterns that point to a bug.
+
++infobox
+ | Please understand that we won't be able to provide individual support via
+ | email. We also believe that help is much more valuable if it's shared
+ | publicly, so that #[strong more people can benefit from it]. If you come
+ | across an issue and you think you might be able to help, consider posting
+ | a quick update with your solution. No matter how simple, it can easily
+ | save someone a lot of time and headache β and the next time you need help,
+ | they might repay the favour.
+
++h(3, "faq-contributing") How can I contribute to spaCy?
+
+p
+ | You don't have to be an NLP expert or Python pro to contribute, and we're
+ | happy to help you get started. If you're new to spaCy, a good place to
+ | start is the
+ | #[+a(gh("spaCy") + '/issues?q=is%3Aissue+is%3Aopen+label%3A"help+wanted+%28easy%29"') #[code help wanted (easy)] label]
+ | on GitHub, which we use to tag bugs and feature requests that are easy
+ | and self-contained. We also appreciate contributions to the docs β whether
+ | it's fixing a typo, improving an example or adding additional explanations.
+ | You'll find a "Suggest edits" link at the bottom of each page that points
+ | you to the source.
+
+p
+ | Another way of getting involved is to help us improve the
+ | #[+a("/usage/adding-languages#language-data") language data] β
+ | especially if you happen to speak one of the languages currently in
+ | #[+a("/usage/models#languages") alpha support]. Even
+ | adding simple tokenizer exceptions, stop words or lemmatizer data
+ | can make a big difference. It will also make it easier for us to provide
+ | a statistical model for the language in the future. Submitting a test
+ | that documents a bug or performance issue, or covers functionality that's
+ | especially important for your application is also very helpful. This way,
+ | you'll also make sure we never accidentally introduce regressions to the
+ | parts of the library that you care about the most.
+
+p
+ strong
+ | For more details on the types of contributions we're looking for, the
+ | code conventions and other useful tips, make sure to check out the
+ | #[+a(gh("spaCy", "CONTRIBUTING.md")) contributing guidelines].
+
++infobox("Code of Conduct")
+ | spaCy adheres to the
+ | #[+a("http://contributor-covenant.org/version/1/4/") Contributor Covenant Code of Conduct].
+ | By participating, you are expected to uphold this code.
+
++h(3, "faq-project-with-spacy")
+ | I've built something cool with spaCy β how can I get the word out?
+
+p
+ | First, congrats β we'd love to check it out! When you share your
+ | project on Twitter, don't forget to tag
+ | #[+a("https://twitter.com/" + SOCIAL.twitter) @#{SOCIAL.twitter}] so we
+ | don't miss it. If you think your project would be a good fit for the
+ | #[+a("/usage/resources") resources], #[strong feel free to submit it!]
+ | Tutorials are also incredibly valuable to other users and a great way to
+ | get exposure. So we strongly encourage #[strong writing up your experiences],
+ | or sharing your code and some tips and tricks on your blog. Since our
+ | website is open-source, you can add your project or tutorial by making a
+ | pull request on GitHub.
+
++aside("Contributing to spacy.io")
+ | All showcase and tutorial links are stored in a
+ | #[+a(gh("spaCy", "website/usage/_data.json")) JSON file], so you
+ | won't even have to edit any markup. For more info on how to submit
+ | your project, see the
+ | #[+a(gh("spaCy", "CONTRIBUTING.md#submitting-a-project-to-the-showcase")) contributing guidelines]
+ | and our #[+a(gh("spaCy", "website")) website docs].
+
+p
+ | If you would like to use the spaCy logo on your site, please get in touch
+ | and ask us first. However, if you want to show support and tell others
+ | that your project is using spaCy, you can grab one of our
+ | #[strong spaCy badges] here:
+
+- SPACY_BADGES = ["built%20with-spaCy-09a3d5.svg", "made%20with%20β€%20and-spaCy-09a3d5.svg", "spaCy-v2-09a3d5.svg"]
++quickstart([{id: "badge", input_style: "check", options: SPACY_BADGES.map(function(badge, i) { return {id: i, title: "", checked: (i == 0) ? true : false}}) }], false, false, true)
+ .c-code-block(data-qs-results)
+ for badge, i in SPACY_BADGES
+ - var url = "https://img.shields.io/badge/" + badge
+ +code(false, "text", false, false, "star").o-no-block(data-qs-badge=i)=url
+ +code(false, "text", false, false, "code").o-no-block(data-qs-badge=i).
+ <a href="#{SITE_URL}"><img src="#{url}" height="20"></a>
+ +code(false, "text", false, false, "markdown").o-no-block(data-qs-badge=i).
+ [![spaCy](#{url})](#{SITE_URL})
diff --git a/website/docs/usage/_spacy-101/_language-data.jade b/website/usage/_spacy-101/_language-data.jade
similarity index 86%
rename from website/docs/usage/_spacy-101/_language-data.jade
rename to website/usage/_spacy-101/_language-data.jade
index 1f75b47e8..628152524 100644
--- a/website/docs/usage/_spacy-101/_language-data.jade
+++ b/website/usage/_spacy-101/_language-data.jade
@@ -5,7 +5,7 @@ p
| #[strong exceptions and special cases], especially amongst the most
| common words. Some of these exceptions are shared across languages, while
| others are #[strong entirely specific] β usually so specific that they need
- | to be hard-coded. The #[+src(gh("spaCy", "spacy/lang")) lang] module
+ | to be hard-coded. The #[+src(gh("spaCy", "spacy/lang")) #[code lang]] module
| contains all language-specific data, organised in simple Python files.
| This makes the data easy to update and extend.
@@ -27,15 +27,13 @@ p
nlp_en = English() # includes English data
nlp_de = German() # includes German data
-+image
- include ../../../assets/img/docs/language_data.svg
- .u-text-right
- +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
++graphic("/assets/img/language_data.svg")
+ include ../../assets/img/language_data.svg
+table(["Name", "Description"])
+row
+cell #[strong Stop words]#[br]
- | #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py]
+ | #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) #[code stop_words.py]]
+cell
| List of most common words of a language that are often useful to
| filter out, for example "and" or "I". Matching tokens will
@@ -43,21 +41,21 @@ p
+row
+cell #[strong Tokenizer exceptions]#[br]
- | #[+src(gh("spacy-dev-resources", "templates/new_language/tokenizer_exceptions.py")) tokenizer_exceptions.py]
+ | #[+src(gh("spacy-dev-resources", "templates/new_language/tokenizer_exceptions.py")) #[code tokenizer_exceptions.py]]
+cell
| Special-case rules for the tokenizer, for example, contractions
| like "can't" and abbreviations with punctuation, like "U.K.".
+row
+cell #[strong Norm exceptions]
- | #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) norm_exceptions.py]
+ | #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) #[code norm_exceptions.py]]
+cell
| Special-case rules for normalising tokens to improve the model's
| predictions, for example on American vs. British spelling.
+row
+cell #[strong Punctuation rules]
- | #[+src(gh("spaCy", "spacy/lang/punctuation.py")) punctuation.py]
+ | #[+src(gh("spaCy", "spacy/lang/punctuation.py")) #[code punctuation.py]]
+cell
| Regular expressions for splitting tokens, e.g. on punctuation or
| special characters like emoji. Includes rules for prefixes,
@@ -65,14 +63,14 @@ p
+row
+cell #[strong Character classes]
- | #[+src(gh("spaCy", "spacy/lang/char_classes.py")) char_classes.py]
+ | #[+src(gh("spaCy", "spacy/lang/char_classes.py")) #[code char_classes.py]]
+cell
| Character classes to be used in regular expressions, for example,
| latin characters, quotes, hyphens or icons.
+row
+cell #[strong Lexical attributes]
- | #[+src(gh("spacy-dev-resources", "templates/new_language/lex_attrs.py")) lex_attrs.py]
+ | #[+src(gh("spacy-dev-resources", "templates/new_language/lex_attrs.py")) #[code lex_attrs.py]]
+cell
| Custom functions for setting lexical attributes on tokens, e.g.
| #[code like_num], which includes language-specific words like "ten"
@@ -80,22 +78,22 @@ p
+row
+cell #[strong Syntax iterators]
- | #[+src(gh("spaCy", "spacy/lang/en/syntax_iterators.py")) syntax_iterators.py]
+ | #[+src(gh("spaCy", "spacy/lang/en/syntax_iterators.py")) #[code syntax_iterators.py]]
+cell
| Functions that compute views of a #[code Doc] object based on its
| syntax. At the moment, only used for
- | #[+a("/docs/usage/dependency-parse#noun-chunks") noun chunks].
+ | #[+a("/usage/linguistic-features#noun-chunks") noun chunks].
+row
+cell #[strong Lemmatizer]
- | #[+src(gh("spacy-dev-resources", "templates/new_language/lemmatizer.py")) lemmatizer.py]
+ | #[+src(gh("spacy-dev-resources", "templates/new_language/lemmatizer.py")) #[code lemmatizer.py]]
+cell
| Lemmatization rules or a lookup-based lemmatization table to
| assign base forms, for example "be" for "was".
+row
+cell #[strong Tag map]#[br]
- | #[+src(gh("spacy-dev-resources", "templates/new_language/tag_map.py")) tag_map.py]
+ | #[+src(gh("spacy-dev-resources", "templates/new_language/tag_map.py")) #[code tag_map.py]]
+cell
| Dictionary mapping strings in your tag set to
| #[+a("http://universaldependencies.org/u/pos/all.html") Universal Dependencies]
@@ -103,7 +101,7 @@ p
+row
+cell #[strong Morph rules]
- | #[+src(gh("spaCy", "spacy/lang/en/morph_rules.py")) morph_rules.py]
+ | #[+src(gh("spaCy", "spacy/lang/en/morph_rules.py")) #[code morph_rules.py]]
+cell
| Exception rules for morphological analysis of irregular words like
| personal pronouns.
diff --git a/website/docs/usage/lightning-tour.jade b/website/usage/_spacy-101/_lightning-tour.jade
similarity index 82%
rename from website/docs/usage/lightning-tour.jade
rename to website/usage/_spacy-101/_lightning-tour.jade
index 2b0cf0880..061ec7758 100644
--- a/website/docs/usage/lightning-tour.jade
+++ b/website/usage/_spacy-101/_lightning-tour.jade
@@ -1,13 +1,11 @@
-//- π« DOCS > USAGE > LIGHTNING TOUR
-
-include ../../_includes/_mixins
+//- π« DOCS > USAGE > SPACY 101 > LIGHTNING TOUR
p
| The following examples and code snippets give you an overview of spaCy's
| functionality and its usage. If you're new to spaCy, make sure to check
- | out the #[+a("/docs/usage/spacy-101") spaCy 101 guide].
+ | out the #[+a("/usage/spacy-101") spaCy 101 guide].
-+h(2, "models") Install models and process text
++h(3, "lightning-tour-models") Install models and process text
+code(false, "bash").
spacy download en
@@ -23,10 +21,10 @@ p
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load()]]
- | #[strong Usage:] #[+a("/docs/usage/models") Models],
- | #[+a("/docs/usage/spacy-101") spaCy 101]
+ | #[strong Usage:] #[+a("/usage/models") Models],
+ | #[+a("/usage/spacy-101") spaCy 101]
-+h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences
++h(3, "lightning-tour-tokens-sentences") Get tokens, noun chunks & sentences
+tag-model("dependency parse")
+code.
@@ -45,9 +43,9 @@ p
+infobox
| #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
- | #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101]
+ | #[strong Usage:] #[+a("/usage/spacy-101") spaCy 101]
-+h(2, "examples-pos-tags") Get part-of-speech tags and flags
++h(3, "lightning-tour-pos-tags") Get part-of-speech tags and flags
+tag-model("tagger")
+code.
@@ -66,9 +64,9 @@ p
+infobox
| #[strong API:] #[+api("token") #[code Token]]
- | #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]
+ | #[strong Usage:] #[+a("/usage/linguistic-features#pos-tagging") Part-of-speech tagging]
-+h(2, "examples-hashes") Use hash values for any string
++h(3, "lightning-tour-hashes") Use hash values for any string
+code.
doc = nlp(u'I love coffee')
@@ -86,9 +84,9 @@ p
+infobox
| #[strong API:] #[+api("stringstore") #[code stringstore]]
- | #[strong Usage:] #[+a("/docs/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
+ | #[strong Usage:] #[+a("/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
-+h(2, "examples-entities") Recongnise and update named entities
++h(3, "lightning-tour-entities") Recognise and update named entities
+tag-model("NER")
+code.
@@ -103,9 +101,9 @@ p
assert ents == [(0, 7, u'ORG')]
+infobox
- | #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition]
+ | #[strong Usage:] #[+a("/usage/linguistic-features#named-entities") Named entity recognition]
-+h(2, "displacy") Visualize a dependency parse and named entities in your browser
++h(3, "lightning-tour-displacy") Visualize a dependency parse and named entities in your browser
+tag-model("dependency parse", "NER")
+aside
@@ -156,9 +154,9 @@ p
+infobox
| #[strong API:] #[+api("displacy") #[code displacy]]
- | #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers]
+ | #[strong Usage:] #[+a("/usage/visualizers") Visualizers]
-+h(2, "examples-word-vectors") Get word vectors and similarity
++h(3, "lightning-tour-word-vectors") Get word vectors and similarity
+tag-model("word vectors")
+code.
@@ -171,9 +169,9 @@ p
    assert apple.has_vector and banana.has_vector and pasta.has_vector and hippo.has_vector
+infobox
- | #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity]
+ | #[strong Usage:] #[+a("/usage/vectors-similarity") Word vectors and similarity]
-+h(2, "examples-serialization") Simple and efficient serialization
++h(3, "lightning-tour-serialization") Simple and efficient serialization
+code.
import spacy
@@ -190,9 +188,9 @@ p
+infobox
| #[strong API:] #[+api("language") #[code Language]],
| #[+api("doc") #[code Doc]]
- | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+ | #[strong Usage:] #[+a("/usage/models#saving-loading") Saving and loading models]
-+h(2, "rule-matcher") Match text with token rules
++h(3, "lightning-tour-rule-matcher") Match text with token rules
+code.
import spacy
@@ -212,9 +210,9 @@ p
+infobox
| #[strong API:] #[+api("matcher") #[code Matcher]]
- | #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
+ | #[strong Usage:] #[+a("/usage/linguistic-features#rule-based-matching") Rule-based matching]
-+h(2, "multi-threaded") Multi-threaded generator
++h(3, "lightning-tour-multi-threaded") Multi-threaded generator
+code.
texts = [u'One document.', u'...', u'Lots of documents']
@@ -227,9 +225,9 @@ p
+infobox
| #[strong API:] #[+api("doc") #[code Doc]]
- | #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage]
+ | #[strong Usage:] #[+a("/usage/processing-pipelines#multithreading") Processing pipelines]
-+h(2, "examples-dependencies") Get syntactic dependencies
++h(3, "lightning-tour-dependencies") Get syntactic dependencies
+tag-model("dependency parse")
+code.
@@ -243,9 +241,9 @@ p
+infobox
| #[strong API:] #[+api("token") #[code Token]]
- | #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse]
+ | #[strong Usage:] #[+a("/usage/linguistic-features#dependency-parse") Using the dependency parse]
-+h(2, "examples-numpy-arrays") Export to numpy arrays
++h(3, "lightning-tour-numpy-arrays") Export to numpy arrays
+code.
from spacy.attrs import ORTH, LIKE_URL, IS_OOV
@@ -258,7 +256,7 @@ p
assert doc[0].like_url == doc_array[0, 1]
assert list(doc_array[:, 1]) == [t.like_url for t in doc]
-+h(2, "examples-inline") Calculate inline markup on original string
++h(3, "lightning-tour-inline") Calculate inline markup on original string
+code.
def put_spans_around_tokens(doc, get_classes):
diff --git a/website/docs/usage/_spacy-101/_named-entities.jade b/website/usage/_spacy-101/_named-entities.jade
similarity index 90%
rename from website/docs/usage/_spacy-101/_named-entities.jade
rename to website/usage/_spacy-101/_named-entities.jade
index a3c539564..d9c595e6a 100644
--- a/website/docs/usage/_spacy-101/_named-entities.jade
+++ b/website/usage/_spacy-101/_named-entities.jade
@@ -3,7 +3,7 @@
p
| A named entity is a "real-world object" that's assigned a name β for
| example, a person, a country, a product or a book title. spaCy can
- | #[strong recognise] #[+a("/docs/api/annotation#named-entities") various types]
+ | #[strong recognise] #[+a("/api/annotation#named-entities") various types]
| of named entities in a document, by asking the model for a
| #[strong prediction]. Because models are statistical and strongly depend
| on the examples they were trained on, this doesn't always work
@@ -32,7 +32,7 @@ p
+annotation-row(["$1 billion", 44, 54, "MONEY", "Monetary values, including unit."], style)
p
- | Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer],
+ | Using spaCy's built-in #[+a("/usage/visualizers") displaCy visualizer],
| here's what our example sentence and its named entities look like:
+codepen("2f2ad1408ff79fc6a326ea3aedbb353b", 160)
diff --git a/website/docs/usage/_spacy-101/_pipelines.jade b/website/usage/_spacy-101/_pipelines.jade
similarity index 89%
rename from website/docs/usage/_spacy-101/_pipelines.jade
rename to website/usage/_spacy-101/_pipelines.jade
index c21c9f97c..4e9cd8aeb 100644
--- a/website/docs/usage/_spacy-101/_pipelines.jade
+++ b/website/usage/_spacy-101/_pipelines.jade
@@ -5,15 +5,13 @@ p
| produce a #[code Doc] object. The #[code Doc] is then processed in several
| different steps β this is also referred to as the
| #[strong processing pipeline]. The pipeline used by the
- | #[+a("/docs/usage/models") default models] consists of a
+ | #[+a("/models") default models] consists of a
| tensorizer, a tagger, a parser and an entity recognizer. Each pipeline
| component returns the processed #[code Doc], which is then passed on to
| the next component.
-+image
- include ../../../assets/img/docs/pipeline.svg
- .u-text-right
- +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic
++graphic("/assets/img/pipeline.svg")
+ include ../../assets/img/pipeline.svg
+aside
| #[strong Name:] ID of the pipeline component.#[br]
@@ -30,7 +28,7 @@ p
+row("divider")
+cell tensorizer
- +cell #[code TokenVectorEncoder]
+ +cell #[+api("tensorizer") Tensorizer]
+cell #[code Doc.tensor]
+cell Create feature representation tensor for #[code Doc].
@@ -54,6 +52,12 @@ p
+cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]
+cell Detect and label named entities.
+ +row
+ +cell textcat
+ +cell #[+api("textcategorizer") #[code TextCategorizer]]
+ +cell #[code Doc.cats]
+ +cell Assign document labels.
+
p
| The processing pipeline always #[strong depends on the statistical model]
| and its capabilities. For example, a pipeline can only include an entity
diff --git a/website/docs/usage/_spacy-101/_pos-deps.jade b/website/usage/_spacy-101/_pos-deps.jade
similarity index 95%
rename from website/docs/usage/_spacy-101/_pos-deps.jade
rename to website/usage/_spacy-101/_pos-deps.jade
index 52a7fdd3c..a8f7f04b5 100644
--- a/website/docs/usage/_spacy-101/_pos-deps.jade
+++ b/website/usage/_spacy-101/_pos-deps.jade
@@ -1,7 +1,7 @@
//- π« DOCS > USAGE > SPACY 101 > POS TAGGING AND DEPENDENCY PARSING
p
- | After tokenization, spaCy can also #[strong parse] and #[strong tag] a
+ | After tokenization, spaCy can #[strong parse] and #[strong tag] a
| given #[code Doc]. This is where the statistical model comes in, which
| enables spaCy to #[strong make a prediction] of which tag or label most
| likely applies in this context. A model consists of binary data and is
@@ -56,7 +56,7 @@ p
| singular present".
p
- | Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer],
+ | Using spaCy's built-in #[+a("/usage/visualizers") displaCy visualizer],
| here's what our example sentence and its dependencies look like:
+codepen("030d1e4dfa6256cad8fdd59e6aefecbe", 460)
diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/usage/_spacy-101/_serialization.jade
similarity index 100%
rename from website/docs/usage/_spacy-101/_serialization.jade
rename to website/usage/_spacy-101/_serialization.jade
diff --git a/website/docs/usage/_spacy-101/_similarity.jade b/website/usage/_spacy-101/_similarity.jade
similarity index 100%
rename from website/docs/usage/_spacy-101/_similarity.jade
rename to website/usage/_spacy-101/_similarity.jade
diff --git a/website/docs/usage/_spacy-101/_tokenization.jade b/website/usage/_spacy-101/_tokenization.jade
similarity index 90%
rename from website/docs/usage/_spacy-101/_tokenization.jade
rename to website/usage/_spacy-101/_tokenization.jade
index d6911387c..602209ec8 100644
--- a/website/docs/usage/_spacy-101/_tokenization.jade
+++ b/website/usage/_spacy-101/_tokenization.jade
@@ -49,14 +49,12 @@ p
| #[strong Infix:] Character(s) in between, e.g.
| #[code -], #[code --], #[code /], #[code β¦].#[br]
-+image
- include ../../../assets/img/docs/tokenization.svg
- .u-text-right
- +button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic
++graphic("/assets/img/tokenization.svg")
+ include ../../assets/img/tokenization.svg
p
| While punctuation rules are usually pretty general, tokenizer exceptions
| strongly depend on the specifics of the individual language. This is
- | why each #[+a("/docs/api/language-models") available language] has its
+ | why each #[+a("/models/#languages") available language] has its
| own subclass like #[code English] or #[code German], that loads in lists
| of hard-coded data and exception rules.
diff --git a/website/docs/usage/_spacy-101/_training.jade b/website/usage/_spacy-101/_training.jade
similarity index 94%
rename from website/docs/usage/_spacy-101/_training.jade
rename to website/usage/_spacy-101/_training.jade
index 9b283c0eb..5d97a86df 100644
--- a/website/docs/usage/_spacy-101/_training.jade
+++ b/website/usage/_spacy-101/_training.jade
@@ -24,10 +24,8 @@ p
| #[strong Gradient:] Gradient of the loss function calculating the
| difference between input and expected output.
-+image
- include ../../../assets/img/docs/training.svg
- .u-text-right
- +button("/assets/img/docs/training.svg", false, "secondary").u-text-tag View large graphic
++graphic("/assets/img/training.svg")
+ include ../../assets/img/training.svg
p
| When training a model, we don't just want it to memorise our examples β
diff --git a/website/docs/usage/_spacy-101/_vocab.jade b/website/usage/_spacy-101/_vocab.jade
similarity index 96%
rename from website/docs/usage/_spacy-101/_vocab.jade
rename to website/usage/_spacy-101/_vocab.jade
index 3063262d5..185e634fe 100644
--- a/website/docs/usage/_spacy-101/_vocab.jade
+++ b/website/usage/_spacy-101/_vocab.jade
@@ -19,10 +19,8 @@ p
| #[strong StringStore]: The dictionary mapping hash values to strings, for
| example #[code 3197928453018144401] → "coffee".
-+image
- include ../../../assets/img/docs/vocab_stringstore.svg
- .u-text-right
- +button("/assets/img/docs/vocab_stringstore.svg", false, "secondary").u-text-tag View large graphic
++graphic("/assets/img/vocab_stringstore.svg")
+ include ../../assets/img/vocab_stringstore.svg
p
| If you process lots of documents containing the word "coffee" in all
diff --git a/website/docs/usage/_spacy-101/_word-vectors.jade b/website/usage/_spacy-101/_word-vectors.jade
similarity index 98%
rename from website/docs/usage/_spacy-101/_word-vectors.jade
rename to website/usage/_spacy-101/_word-vectors.jade
index cbb9d06f2..bb9add8a6 100644
--- a/website/docs/usage/_spacy-101/_word-vectors.jade
+++ b/website/usage/_spacy-101/_word-vectors.jade
@@ -5,7 +5,7 @@ p
| embeddings", multi-dimensional meaning representations of a word. Word
| vectors can be generated using an algorithm like
| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
- | #[+a("/docs/usage/models") default models] come with
+ | #[+a("/models") default models] come with
| #[strong 300-dimensional vectors] that look like this:
+code("banana.vector", false, false, 250).
@@ -148,5 +148,5 @@ p
p
| If your application will benefit from a large vocabulary with more
| vectors, you should consider using one of the
- | #[+a("/docs/usage/models#available") larger models] instead of the default,
+ | #[+a("/models") larger models] instead of the default,
| smaller ones, which usually come with a clipped vocabulary.
diff --git a/website/docs/usage/training.jade b/website/usage/_training/_basics.jade
similarity index 89%
rename from website/docs/usage/training.jade
rename to website/usage/_training/_basics.jade
index c1a7c1835..05e67c2c1 100644
--- a/website/docs/usage/training.jade
+++ b/website/usage/_training/_basics.jade
@@ -1,14 +1,6 @@
-include ../../_includes/_mixins
+//- π« DOCS > USAGE > TRAINING > BASICS
-p
- | This guide describes how to train new statistical models for spaCy's
- | part-of-speech tagger, named entity recognizer and dependency parser.
- | Once the model is trained, you can then
- | #[+a("/docs/usage/saving-loading") save and load] it.
-
-+h(2, "101") Training 101
-
-include _spacy-101/_training
+include ../_spacy-101/_training
+h(3, "training-data") How do I get training data?
@@ -50,7 +42,7 @@ p
p
| Alternatively, the
- | #[+a("/docs/usage/rule-based-matching#example3") rule-based matcher]
+ | #[+a("/usage/linguistic-features#rule-based-matching") rule-based matcher]
| can be a useful tool to extract tokens or combinations of tokens, as
| well as their start and end index in a document. In this case, we'll
| extract mentions of Google and assume they're an #[code ORG].
@@ -73,7 +65,7 @@ p
| #[strong what you want the model to learn]. While there are some entity
| annotations that are more or less universally correct β like Canada being
| a geopolitical entity β your application may have its very own definition
- | of the #[+a("/docs/api/annotation#named-entities") NER annotation scheme].
+ | of the #[+a("/api/annotation#named-entities") NER annotation scheme].
+code.
train_data = [
@@ -84,7 +76,7 @@ p
("Google rebrands its business apps", [(0, 6, "ORG")]),
("look what i found on google! π", [(21, 27, "PRODUCT")])]
-+h(2) Training with annotations
++h(3, "annotations") Training with annotations
p
| The #[+api("goldparse") #[code GoldParse]] object collects the annotated
@@ -103,7 +95,7 @@ p
p
| Using the #[code Doc] and its gold-standard annotations, the model can be
| updated to learn a sentence of three words with their assigned
- | part-of-speech tags. The #[+a("/docs/usage/adding-languages#tag-map") tag map]
+ | part-of-speech tags. The #[+a("/usage/adding-languages#tag-map") tag map]
| is part of the vocabulary and defines the annotation scheme. If you're
| training a new language model, this will let you map the tags present in
| the treebank you train on to spaCy's tag scheme.
@@ -115,7 +107,7 @@ p
p
| The same goes for named entities. The letters added before the labels
| refer to the tags of the
- | #[+a("/docs/usage/entity-recognition#updating-biluo") BILUO scheme] β
+ | #[+a("/usage/linguistic-features#updating-biluo") BILUO scheme] β
| #[code O] is a token outside an entity, #[code U] a single entity unit,
| #[code B] the beginning of an entity, #[code I] a token inside an entity
| and #[code L] the last token of an entity.
@@ -130,10 +122,8 @@ p
| #[strong Update]: Update the model's weights.#[br]
| #[strong ]
-+image
- include ../../assets/img/docs/training-loop.svg
- .u-text-right
- +button("/assets/img/docs/training-loop.svg", false, "secondary").u-text-tag View large graphic
++graphic("/assets/img/training-loop.svg")
+ include ../../assets/img/training-loop.svg
p
| Of course, it's not enough to only show a model a single example once.
@@ -192,11 +182,7 @@ p
+infobox
| For the #[strong full example and more details], see the usage guide on
- | #[+a("/docs/usage/training-ner") training the named entity recognizer],
+ | #[+a("/usage/training#ner") training the named entity recognizer],
| or the runnable
| #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
| on GitHub.
-
-+h(2) Examples
-
-+under-construction
diff --git a/website/usage/_training/_ner.jade b/website/usage/_training/_ner.jade
new file mode 100644
index 000000000..ff3101c8f
--- /dev/null
+++ b/website/usage/_training/_ner.jade
@@ -0,0 +1,61 @@
+//- π« DOCS > USAGE > TRAINING > NER
+
+p
+ | All #[+a("/models") spaCy models] support online learning, so
+ | you can update a pre-trained model with new examples. To update the
+ | model, you first need to create an instance of
+ | #[+api("goldparse") #[code GoldParse]], with the entity labels
+ | you want to learn. You'll usually need to provide many examples to
+ | meaningfully improve the system β a few hundred is a good start, although
+ | more is better.
+
+p
+ | You should avoid iterating over the same few examples multiple times, or
+ | the model is likely to "forget" how to annotate other examples. If you
+ | iterate over the same few examples, you're effectively changing the loss
+ | function. The optimizer will find a way to minimize the loss on your
+ | examples, without regard for the consequences on the examples it's no
+ | longer paying attention to. One way to avoid this
+ | #[+a("https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting", true) "catastrophic forgetting" problem]
+ | is to "remind"
+ | the model of other examples by augmenting your annotations with sentences
+ | annotated with entities automatically recognised by the original model.
+ | Ultimately, this is an empirical process: you'll need to
+ | #[strong experiment on your own data] to find a solution that works best
+ | for you.
+
++h(3, "example-new-entity-type") Example: Training an additional entity type
+
+p
+ | This script shows how to add a new entity type to an existing pre-trained
+ | NER model. To keep the example short and simple, only a few sentences are
+ | provided as examples. In practice, you'll need many more β a few hundred
+ | would be a good start. You will also likely need to mix in examples of
+ | other entity types, which might be obtained by running the entity
+ | recognizer over unlabelled sentences, and adding their annotations to the
+ | training set.
+
+p
+ | The actual training is performed by looping over the examples, and
+ | calling #[+api("language#update") #[code nlp.update()]]. The
+ | #[code update] method steps through the words of the input. At each word,
+ | it makes a prediction. It then consults the annotations provided on the
+ | #[+api("goldparse") #[code GoldParse]] instance, to see whether it was
+ | right. If it was wrong, it adjusts its weights so that the correct
+ | action will score higher next time.
+
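+p
+    | As a rough, simplified sketch of that loop (the runnable script below
+    | is the authoritative version; the example texts, iteration count and
+    | dropout value here are illustrative assumptions):
+
++code.
+    import random
+    import spacy
+    from spacy.gold import GoldParse
+
+    nlp = spacy.load('en')
+    # toy examples with character-offset entity annotations
+    train_data = [(u'Uber blew through $1 million a week', [(0, 4, u'ORG')]),
+                  (u'Google rebrands its business apps', [(0, 6, u'ORG')])]
+
+    optimizer = nlp.begin_training()
+    for i in range(20):
+        random.shuffle(train_data)
+        for text, entity_offsets in train_data:
+            # pair each text with its gold-standard entity offsets
+            doc = nlp.make_doc(text)
+            gold = GoldParse(doc, entities=entity_offsets)
+            # update the model's weights, with a little dropout
+            nlp.update([doc], [gold], drop=0.35, sgd=optimizer)
+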
++github("spacy", "examples/training/train_new_entity_type.py")
+
++h(3, "example-ner-from-scratch") Example: Training an NER system from scratch
+
+p
+ | This example is written to be self-contained and reasonably transparent.
+ | To achieve that, it duplicates some of spaCy's internal functionality.
+ | Specifically, in this example, we don't use spaCy's built-in
+ | #[+api("language") #[code Language]] class to wire together the
+ | #[+api("vocab") #[code Vocab]], #[+api("tokenizer") #[code Tokenizer]]
+ | and #[+api("entityrecognizer") #[code EntityRecognizer]]. Instead, we
+    | write our own simple #[code Pipeline] class, so that it's easier to see
+ | how the pieces interact.
+
++github("spacy", "examples/training/train_ner_standalone.py")
diff --git a/website/docs/usage/saving-loading.jade b/website/usage/_training/_saving-loading.jade
similarity index 70%
rename from website/docs/usage/saving-loading.jade
rename to website/usage/_training/_saving-loading.jade
index de7e4ed33..e6e54385c 100644
--- a/website/docs/usage/saving-loading.jade
+++ b/website/usage/_training/_saving-loading.jade
@@ -1,45 +1,4 @@
-include ../../_includes/_mixins
-
-+h(2, "101") Serialization 101
-
-include _spacy-101/_serialization
-
-+infobox("Important note")
- | In spaCy v2.0, the API for saving and loading has changed to only use the
- | four methods listed above consistently across objects and classes. For an
- | overview of the changes, see #[+a("/docs/usage/v2#incompat") this table]
- | and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating].
-
-+h(3, "example-doc") Example: Saving and loading a document
-
-p
- | For simplicity, let's assume you've
- | #[+a("/docs/usage/entity-recognition#setting") added custom entities] to
- | a #[code Doc], either manually, or by using a
- | #[+a("/docs/usage/rule-based-matching#on_match") match pattern]. You can
- | save it locally by calling #[+api("doc#to_disk") #[code Doc.to_disk()]],
- | and load it again via #[+api("doc#from_disk") #[code Doc.from_disk()]].
- | This will overwrite the existing object and return it.
-
-+code.
- import spacy
- from spacy.tokens import Span
-
- text = u'Netflix is hiring a new VP of global policy'
-
- nlp = spacy.load('en')
- doc = nlp(text)
- assert len(doc.ents) == 0 # Doc has no entities
- doc.ents += ((Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])) # add entity
- doc.to_disk('/path/to/doc') # save Doc to disk
-
- new_doc = nlp(text)
- assert len(new_doc.ents) == 0 # new Doc has no entities
- new_doc = new_doc.from_disk('path/to/doc') # load from disk and overwrite
- assert len(new_doc.ents) == 1 # entity is now recognised!
- assert [(ent.text, ent.label_) for ent in new_doc.ents] == [(u'Netflix', u'ORG')]
-
-+h(2, "models") Saving models
+//- π« DOCS > USAGE > TRAINING > SAVING & LOADING
p
| After training your model, you'll usually want to save its state, and load
@@ -55,6 +14,7 @@ p
| will be written out. To make the model more convenient to deploy, we
| recommend wrapping it as a Python package.
+
+h(3, "models-generating") Generating a model package
+infobox("Important note")
@@ -105,13 +65,14 @@ p
| need to be named according to the naming conventions of
| #[code lang_name] and #[code lang_name-version].
+
+h(3, "models-custom") Customising the model setup
p
| The meta.json includes the model details, like name, requirements and
| license, and lets you customise how the model should be initialised and
| loaded. You can define the language data to be loaded and the
- | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
+ | #[+a("/usage/processing-pipelines") processing pipeline] to
| execute.
+table(["Setting", "Type", "Description"])
@@ -126,7 +87,7 @@ p
+cell
| A list of strings mapping to the IDs of pipeline factories to
| apply in that order. If not set, spaCy's
- | #[+a("/docs/usage/language-processing/pipelines") default pipeline]
+ | #[+a("/usage/processing-pipelines") default pipeline]
| will be used.
p
@@ -135,7 +96,7 @@ p
| #[code Language] object with the loaded pipeline and data. If your model
| requires custom pipeline components, you should
| #[strong ship them with your model] and register their
- | #[+a("/docs/usage/language-processing-pipeline#creating-factory") factories]
+ | #[+a("/usage/processing-pipelines#creating-factory") factories]
| via #[+api("spacy#set_factory") #[code set_factory()]].
+aside-code("Factory example").
@@ -152,7 +113,7 @@ p
+infobox("Custom models with pipeline components")
| For more details and an example of how to package a sentiment model
| with a custom pipeline component, see the usage guide on
- | #[+a("/docs/usage/language-processing-pipeline#example2") language processing pipelines].
+ | #[+a("/usage/processing-pipelines#example2") language processing pipelines].
+h(3, "models-building") Building the model package
@@ -176,7 +137,7 @@ p
| You can then load the model via its name, #[code en_example_model], or
| import it directly as a module and then call its #[code load()] method.
-+h(2, "loading") Loading a custom model package
++h(3, "loading") Loading a custom model package
p
| To load a model from a data directory, you can use
@@ -209,3 +170,38 @@ p
+code-new nlp = English().from_disk('/path/to/data')
+code-old nlp = spacy.load('en', path='/path/to/data')
+
++h(3, "example-training-spacy") Example: How we're training and packaging models for spaCy
+
+p
+ | Publishing a new version of spaCy often means re-training all available
+ | models β currently, that's #{MODEL_COUNT} models for #{MODEL_LANG_COUNT}
+ | languages. To make this run smoothly, we're using an automated build
+ | process and a #[+api("cli#train") #[code spacy train]] template that
+ | looks like this:
+
++code(false, "bash", "$", false, false, true).
+ spacy train {lang} {models_dir}/{name} {train_data} {dev_data} -m meta/{name}.json -V {version} -g {gpu_id} -n {n_epoch} -ns {n_sents}
+
++aside-code("meta.json template", "json").
+ {
+ "lang": "en",
+ "name": "core_web_sm",
+ "license":"CC BY-SA 3.0",
+ "author":"Explosion AI",
+ "url":"https://explosion.ai",
+ "email":"contact@explosion.ai",
+ "sources": ["OntoNotes 5", "Common Crawl"],
+ "description":"English multi-task CNN trained on OntoNotes, with GloVe vectors trained on common crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities."
+ }
+
+p
+    | In a directory #[code meta], we keep #[code meta.json] templates for
+    | the individual models, containing all relevant information that
+    | doesn't change across versions, like the name, description, author
+    | info and training data sources. When we train the model, we pass in
+    | the meta template file as the #[code --meta] argument, and specify the
+    | current model version as the #[code --version] argument.
+
+p
+    | On each epoch, the model is saved out with a #[code meta.json] using
+    | our template and added properties, like the #[code pipeline],
+    | #[code accuracy] scores and the #[code spacy_version] used to train
+    | the model. After training completion, the best model is selected
+    | automatically and packaged using the
+    | #[+api("cli#package") #[code package]] command. Since a full meta file
+    | is already present on the trained model, no further setup is required
+    | to build a valid model package.
+
++code(false, "bash").
+ spacy package -f {best_model} dist/
+ cd dist/{model_name}
+ python setup.py sdist
+
+p
+    | This process allows us to quickly trigger the model training and build
+    | process for all available models and languages, and generate the
+    | correct meta data automatically.
diff --git a/website/usage/_training/_similarity.jade b/website/usage/_training/_similarity.jade
new file mode 100644
index 000000000..eb7991c37
--- /dev/null
+++ b/website/usage/_training/_similarity.jade
@@ -0,0 +1,3 @@
+//- π« DOCS > USAGE > TRAINING > SIMILARITY
+
++under-construction
diff --git a/website/usage/_training/_tagger-parser.jade b/website/usage/_training/_tagger-parser.jade
new file mode 100644
index 000000000..4011464c7
--- /dev/null
+++ b/website/usage/_training/_tagger-parser.jade
@@ -0,0 +1,3 @@
+//- π« DOCS > USAGE > TRAINING > TAGGER & PARSER
+
++under-construction
diff --git a/website/usage/_training/_textcat.jade b/website/usage/_training/_textcat.jade
new file mode 100644
index 000000000..5c90519db
--- /dev/null
+++ b/website/usage/_training/_textcat.jade
@@ -0,0 +1,13 @@
+//- π« DOCS > USAGE > TRAINING > TEXT CLASSIFICATION
+
++under-construction
+
++h(3, "example-textcat") Example: Training spaCy's text classifier
+ +tag-new(2)
+
+p
+ | This example shows how to use and train spaCy's new
+ | #[+api("textcategorizer") #[code TextCategorizer]] pipeline component
+ | on IMDB movie reviews.
+
++github("spacy", "examples/training/train_textcat.py")
diff --git a/website/usage/_vectors-similarity/_basics.jade b/website/usage/_vectors-similarity/_basics.jade
new file mode 100644
index 000000000..b8f8d834c
--- /dev/null
+++ b/website/usage/_vectors-similarity/_basics.jade
@@ -0,0 +1,15 @@
+//- π« DOCS > USAGE > VECTORS & SIMILARITY > BASICS
+
++aside("Training word vectors")
+ | Dense, real valued vectors representing distributional similarity
+ | information are now a cornerstone of practical NLP. The most common way
+ | to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
+ | family of algorithms. The default
+ | #[+a("/models/en") English model] installs
+ | 300-dimensional vectors trained on the
+ | #[+a("http://commoncrawl.org") Common Crawl] corpus.
+ | If you need to train a word2vec model, we recommend the implementation in
+ | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
+
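+p
+    | For reference, a minimal Gensim training sketch might look like the
+    | following (the toy corpus, vector size and parameter names are
+    | assumptions and not part of spaCy):
+
++code.
+    from gensim.models import Word2Vec
+
+    # each "sentence" is a list of pre-tokenised strings
+    sentences = [[u'dogs', u'chase', u'cats'],
+                 [u'cats', u'chase', u'mice']]
+    model = Word2Vec(sentences, size=300, min_count=1)
+    vector = model.wv[u'dogs']  # a 300-dimensional numpy array
+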
+include ../_spacy-101/_similarity
+include ../_spacy-101/_word-vectors
diff --git a/website/usage/_vectors-similarity/_custom.jade b/website/usage/_vectors-similarity/_custom.jade
new file mode 100644
index 000000000..da4be39fd
--- /dev/null
+++ b/website/usage/_vectors-similarity/_custom.jade
@@ -0,0 +1,91 @@
+//- π« DOCS > USAGE > VECTORS & SIMILARITY > CUSTOM VECTORS
+
+p
+ | By default, #[+api("token#vector") #[code Token.vector]] returns the
+ | vector for its underlying #[+api("lexeme") #[code Lexeme]], while
+ | #[+api("doc#vector") #[code Doc.vector]] and
+ | #[+api("span#vector") #[code Span.vector]] return an average of the
+ | vectors of their tokens. You can customize these
+ | behaviours by modifying the #[code doc.user_hooks],
+ | #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
+ | dictionaries.
+
++infobox
+ | For more details on #[strong adding hooks] and #[strong overwriting] the
+ | built-in #[code Doc], #[code Span] and #[code Token] methods, see the
+ | usage guide on #[+a("/usage/processing-pipelines#user-hooks") user hooks].
+
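+p
+    | As a small sketch of what such a vector hook could look like (the
+    | stop-word filtering is purely an illustrative assumption, not a
+    | built-in behaviour):
+
++code.
+    import numpy
+    import spacy
+
+    nlp = spacy.load('en')
+
+    def vector_ignoring_stop_words(doc):
+        # average only the vectors of non-stop tokens
+        vectors = [token.vector for token in doc if not token.is_stop]
+        return numpy.mean(vectors, axis=0)
+
+    doc = nlp(u'The quick brown fox jumps over the lazy dog')
+    doc.user_hooks['vector'] = vector_ignoring_stop_words
+    doc.vector  # now computed by the custom hook
+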
++h(3, "custom-vectors-add") Adding vectors
+ +tag-new(2)
+
+p
+ | The new #[+api("vectors") #[code Vectors]] class makes it easy to add
+ | your own vectors to spaCy. Just like the #[+api("vocab") #[code Vocab]],
+ | it is initialised with a #[+api("stringstore") #[code StringStore]] or
+ | a list of strings.
+
++code("Adding vectors one-by-one").
+    import numpy
+    from spacy.strings import StringStore
+    from spacy.vectors import Vectors
+
+ vector_data = {'dog': numpy.random.uniform(-1, 1, (300,)),
+ 'cat': numpy.random.uniform(-1, 1, (300,)),
+ 'orange': numpy.random.uniform(-1, 1, (300,))}
+
+ vectors = Vectors(StringStore(), 300)
+ for word, vector in vector_data.items():
+ vectors.add(word, vector)
+
+p
+ | You can also add the vector values directly on initialisation:
+
++code("Adding vectors on initialisation").
+    import numpy
+    from spacy.vectors import Vectors
+
+ vector_table = numpy.zeros((3, 300), dtype='f')
+ vectors = Vectors([u'dog', u'cat', u'orange'], vector_table)
+
++h(3, "custom-loading-glove") Loading GloVe vectors
+ +tag-new(2)
+
+p
+ | spaCy comes with built-in support for loading
+ | #[+a("https://nlp.stanford.edu/projects/glove/") GloVe] vectors from
+ | a directory. The #[+api("vectors#from_glove") #[code Vectors.from_glove]]
+ | method assumes a binary format, the vocab provided in a
+ | #[code vocab.txt], and the naming scheme of
+ | #[code vectors.{size}.[fd].bin]. For example:
+
++aside-code("Directory structure", "yaml").
+    vectors
+    ├── vectors.128.f.bin   # vectors file
+    └── vocab.txt           # vocabulary
+
++table(["File name", "Dimensions", "Data type"])
+ +row
+ +cell #[code vectors.128.f.bin]
+ +cell 128
+ +cell float32
+
+ +row
+ +cell #[code vectors.300.d.bin]
+ +cell 300
+ +cell float64 (double)
+
++code.
+ from spacy.vectors import Vectors
+
+ vectors = Vectors([], 128)
+ vectors.from_glove('/path/to/vectors')
+
++h(3, "custom-loading-other") Loading other vectors
+ +tag-new(2)
+
+p
+ | You can also choose to load in vectors from other sources, like the
+ | #[+a("https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md") fastText vectors]
+ | for 294 languages, trained on Wikipedia. After reading in the file,
+ | the vectors are added to the #[code Vocab] using the
+ | #[+api("vocab#set_vector") #[code set_vector]] method.
+
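+p
+    | A condensed sketch of that idea (the file name is an assumed fastText
+    | download; the full example below handles the details more carefully):
+
++code.
+    import numpy
+    import spacy
+
+    nlp = spacy.load('en')
+
+    with open('wiki.en.vec', 'r', encoding='utf8') as file_:
+        file_.readline()  # skip the header row (vector count and width)
+        for line in file_:
+            pieces = line.rstrip().split(' ')
+            word = pieces[0]
+            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
+            nlp.vocab.set_vector(word, vector)
+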
++github("spacy", "examples/vectors_fast_text.py")
diff --git a/website/usage/_vectors-similarity/_gpu.jade b/website/usage/_vectors-similarity/_gpu.jade
new file mode 100644
index 000000000..9f1201da9
--- /dev/null
+++ b/website/usage/_vectors-similarity/_gpu.jade
@@ -0,0 +1,30 @@
+//- π« DOCS > USAGE > VECTORS & SIMILARITY > GPU
+
+p
+ | If you're using a GPU, it's much more efficient to keep the word vectors
+ | on the device. You can do that by setting the
+ | #[+api("vectors#attributes") #[code Vectors.data]] attribute to a
+ | #[code cupy.ndarray] object if you're using spaCy
+ | or #[+a("https://chainer.org") Chainer], or a
+ | #[code torch.Tensor] object if you're using
+ | #[+a("http://pytorch.org") PyTorch]. The #[code data] object just needs
+ | to support #[code __iter__] and #[code __getitem__], so if you're using
+ | another library such as #[+a("https://www.tensorflow.org") TensorFlow],
+ | you could also create a wrapper for your vectors data.
+
++code("spaCy, Thinc or Chainer").
+    import cupy.cuda
+    import numpy
+    from spacy.vectors import Vectors
+
+ vector_table = numpy.zeros((3, 300), dtype='f')
+ vectors = Vectors([u'dog', u'cat', u'orange'], vector_table)
+ with cupy.cuda.Device(0):
+ vectors.data = cupy.asarray(vectors.data)
+
++code("PyTorch").
+    import numpy
+    import torch
+    from spacy.vectors import Vectors
+
+ vector_table = numpy.zeros((3, 300), dtype='f')
+ vectors = Vectors([u'dog', u'cat', u'orange'], vector_table)
+ vectors.data = torch.Tensor(vectors.data).cuda(0)
diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/usage/_vectors-similarity/_in-context.jade
similarity index 72%
rename from website/docs/usage/word-vectors-similarities.jade
rename to website/usage/_vectors-similarity/_in-context.jade
index 937fbfbd0..d8e864d9d 100644
--- a/website/docs/usage/word-vectors-similarities.jade
+++ b/website/usage/_vectors-similarity/_in-context.jade
@@ -1,34 +1,11 @@
-//- π« DOCS > USAGE > WORD VECTORS & SIMILARITIES
-
-include ../../_includes/_mixins
-
-p
- | Dense, real valued vectors representing distributional similarity
- | information are now a cornerstone of practical NLP. The most common way
- | to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
- | family of algorithms. The default
- | #[+a("/docs/usage/models#available") English model] installs
- | 300-dimensional vectors trained on the
- | #[+a("http://commoncrawl.org") Common Crawl] corpus.
-
-+aside("Tip: Training a word2vec model")
- | If you need to train a word2vec model, we recommend the implementation in
- | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
-
-+h(2, "101") Similarity and word vectors 101
- +tag-model("vectors")
-
-include _spacy-101/_similarity
-include _spacy-101/_word-vectors
-
-+h(2, "similarity-context") Similarities in context
+//- π« DOCS > USAGE > VECTORS & SIMILARITY > IN CONTEXT
p
| Aside from spaCy's built-in word vectors, which were trained on a lot of
| text with a wide vocabulary, the parsing, tagging and NER models also
| rely on vector representations of the #[strong meanings of words in context].
| As the first component of the
- | #[+a("/docs/usage/language-processing-pipeline") processing pipeline], the
+ | #[+a("/usage/processing-pipelines") processing pipeline], the
| tensorizer encodes a document's internal meaning representations as an
| array of floats, also called a tensor. This allows spaCy to make a
| reasonable guess at a word's meaning, based on its surrounding words.
@@ -117,8 +94,8 @@ p
nlp(u"man dog bites"), nlp(u"dog man bites")]
for doc in docs:
- for other_doc in docs:
- print(doc.similarity(other_doc))
+ for other_doc in docs:
+ print(doc.similarity(other_doc))
p
| Interestingly, "man bites dog" and "man dog bites" are seen as slightly
@@ -143,17 +120,3 @@ p
+cell.u-text-center #[code=cell.toFixed(2)]
| #[+procon(cell < 0.7 ? "con" : cell != 1 ? "pro" : "neutral")]
- counter++
-
-+h(2, "custom") Customising word vectors
-
-+under-construction
-
-p
- | By default, #[+api("token#vector") #[code Token.vector]] returns the
- | vector for its underlying #[+api("lexeme") #[code Lexeme]], while
- | #[+api("doc#vector") #[code Doc.vector]] and
- | #[+api("span#vector") #[code Span.vector]] return an average of the
- | vectors of their tokens. You can customize these
- | behaviours by modifying the #[code doc.user_hooks],
- | #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
- | dictionaries.
diff --git a/website/usage/_visualizers/_dep.jade b/website/usage/_visualizers/_dep.jade
new file mode 100644
index 000000000..b028ba4cf
--- /dev/null
+++ b/website/usage/_visualizers/_dep.jade
@@ -0,0 +1,62 @@
+//- π« DOCS > USAGE > VISUALIZERS > DEPENDENCIES
+
+p
+ | The dependency visualizer, #[code dep], shows part-of-speech tags
+ | and syntactic dependencies.
+
++code("Dependency example").
+ import spacy
+ from spacy import displacy
+
+ nlp = spacy.load('en')
+ doc = nlp(u'This is a sentence.')
+ displacy.serve(doc, style='dep')
+
++codepen("f0e85b64d469d6617251d8241716d55f", 370)
+
+p
+ | The argument #[code options] lets you specify a dictionary of settings
+ | to customise the layout, for example:
+
++aside("Important note")
+ | There's currently a known issue with the #[code compact] mode for
+ | sentences with short arrows and long dependency labels, that causes labels
+ | longer than the arrow to wrap. So if you come across this problem,
+ | especially when using custom labels, you'll have to increase the
+ | #[code distance] setting in the #[code options] to allow longer arcs.
+
++table(["Name", "Type", "Description", "Default"])
+ +row
+ +cell #[code compact]
+ +cell bool
+ +cell "Compact mode" with square arrows that takes up less space.
+ +cell #[code False]
+
+ +row
+ +cell #[code color]
+ +cell unicode
+ +cell Text color (HEX, RGB or color names).
+ +cell #[code '#000000']
+
+ +row
+ +cell #[code bg]
+ +cell unicode
+ +cell Background color (HEX, RGB or color names).
+ +cell #[code '#ffffff']
+
+ +row
+ +cell #[code font]
+ +cell unicode
+ +cell Font name or font family for all text.
+ +cell #[code 'Arial']
+
+p
+ | For a list of all available options, see the
+ | #[+api("displacy#options") #[code displacy] API documentation].
+
++aside-code("Options example").
+ options = {'compact': True, 'bg': '#09a3d5',
+ 'color': 'white', 'font': 'Source Sans Pro'}
+ displacy.serve(doc, style='dep', options=options)
+
++codepen("39c02c893a84794353de77a605d817fd", 360)
diff --git a/website/usage/_visualizers/_ent.jade b/website/usage/_visualizers/_ent.jade
new file mode 100644
index 000000000..e9174cc55
--- /dev/null
+++ b/website/usage/_visualizers/_ent.jade
@@ -0,0 +1,80 @@
+//- π« DOCS > USAGE > VISUALIZERS > ENTITIES
+
+p
+ | The entity visualizer, #[code ent], highlights named entities and
+ | their labels in a text.
+
++code("Named Entity example").
+ import spacy
+ from spacy import displacy
+
+ text = """But Google is starting from behind. The company made a late push
+ into hardware, and Appleβs Siri, available on iPhones, and Amazonβs Alexa
+ software, which runs on its Echo and Dot devices, have clear leads in
+ consumer adoption."""
+
+ nlp = spacy.load('custom_ner_model')
+ doc = nlp(text)
+ displacy.serve(doc, style='ent')
+
++codepen("a73f8b68f9af3157855962b283b364e4", 345)
+
+p The entity visualizer lets you customise the following #[code options]:
+
++table(["Name", "Type", "Description", "Default"])
+ +row
+ +cell #[code ents]
+ +cell list
+ +cell
+ | Entity types to highlight (#[code None] for all types).
+ +cell #[code None]
+
+ +row
+ +cell #[code colors]
+ +cell dict
+ +cell
+ | Color overrides. Entity types in lowercase should be mapped to
+ | color names or values.
+ +cell #[code {}]
+
+p
+ | If you specify a list of #[code ents], only those entity types will be
+ | rendered β for example, you can choose to display #[code PERSON] entities.
+ | Internally, the visualizer knows nothing about available entity types and
+ | will render whichever spans and labels it receives. This makes it
+ | especially easy to work with custom entity types. By default, displaCy
+ | comes with colours for all
+ | #[+a("/api/annotation#named-entities") entity types supported by spaCy].
+ | If you're using custom entity types, you can use the #[code colors]
+ | setting to add your own colours for them.
+
++aside-code("Options example").
+ colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
+ options = {'ents': ['ORG'], 'colors': colors}
+ displacy.serve(doc, style='ent', options=options)
+
++codepen("f42ec690762b6f007022a7acd6d0c7d4", 300)
+
+p
+ | The above example uses a little trick: Since the background colour values
+ | are added as the #[code background] style attribute, you can use any
+ | #[+a("https://tympanus.net/codrops/css_reference/background/") valid background value]
+ | or shorthand β including gradients and even images!
+
++h(3, "ent-titles") Adding titles to documents
+
+p
+ | Rendering several large documents on one page can easily become confusing.
+ | To add a headline to each visualization, you can add a #[code title] to
+ | its #[code user_data]. User data is never touched or modified by spaCy.
+
++code.
+ doc = nlp(u'This is a sentence about Google.')
+ doc.user_data['title'] = 'This is a title'
+ displacy.serve(doc, style='ent')
+
+p
+    | This feature is especially handy if you're using displaCy to compare
+ | performance at different stages of a process, e.g. during training. Here
+ | you could use the title for a brief description of the text example and
+ | the number of iterations.
diff --git a/website/usage/_visualizers/_html.jade b/website/usage/_visualizers/_html.jade
new file mode 100644
index 000000000..595192442
--- /dev/null
+++ b/website/usage/_visualizers/_html.jade
@@ -0,0 +1,162 @@
+//- π« DOCS > USAGE > VISUALIZERS > HTML
+
+p
+ | If you don't need the web server and just want to generate the markup
+ | β for example, to export it to a file or serve it in a custom
+ | way β you can use #[+api("displacy#render") #[code displacy.render]].
+ | It works the same way, but returns a string containing the markup.
+
++code("Example").
+ import spacy
+ from spacy import displacy
+
+ nlp = spacy.load('en')
+ doc1 = nlp(u'This is a sentence.')
+ doc2 = nlp(u'This is another sentence.')
+ html = displacy.render([doc1, doc2], style='dep', page=True)
+
+p
+ | #[code page=True] renders the markup wrapped as a full HTML page.
+ | For minified and more compact HTML markup, you can set #[code minify=True].
+ | If you're rendering a dependency parse, you can also export it as an
+ | #[code .svg] file.
+
++aside("What's SVG?")
+ | Unlike other image formats, the SVG (Scalable Vector Graphics) uses XML
+ | markup that's easy to manipulate
+ | #[+a("https://www.smashingmagazine.com/2014/11/styling-and-animating-svgs-with-css/") using CSS] or
+ | #[+a("https://css-tricks.com/smil-is-dead-long-live-smil-a-guide-to-alternatives-to-smil-features/") JavaScript].
+ | Essentially, SVG lets you design with code, which makes it a perfect fit
+ | for visualizing dependency trees. SVGs can be embedded online in an
+ | #[code <img>] tag, or inlined in an HTML document. They're also
+ | pretty easy to #[+a("https://convertio.co/image-converter/") convert].
+
++code.
+    from pathlib import Path
+
+    svg = displacy.render(doc, style='dep')
+ output_path = Path('/images/sentence.svg')
+ output_path.open('w', encoding='utf-8').write(svg)
+
++infobox("Important note")
+ | Since each visualization is generated as a separate SVG, exporting
+ | #[code .svg] files only works if you're rendering #[strong one single doc]
+ | at a time. (This makes sense β after all, each visualization should be
+    | a standalone graphic.) So instead of rendering all #[code Doc]s at once,
+ | loop over them and export them separately.
+
+
++h(3, "examples-export-svg") Example: Export SVG graphics of dependency parses
+
++code("Example").
+ import spacy
+ from spacy import displacy
+ from pathlib import Path
+
+ nlp = spacy.load('en')
+ sentences = ["This is an example.", "This is another one."]
+ for sent in sentences:
+        doc = nlp(sent)
+ svg = displacy.render(doc, style='dep')
+ file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
+ output_path = Path('/images/' + file_name)
+ output_path.open('w', encoding='utf-8').write(svg)
+
+p
+    | The above code will generate the dependency visualizations and write them to
+ | two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
+
+
++h(3, "manual-usage") Rendering data manually
+
+p
+ | You can also use displaCy to manually render data. This can be useful if
+ | you want to visualize output from other libraries, like
+ | #[+a("http://www.nltk.org") NLTK] or
+ | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet].
+ | Simply convert the dependency parse or recognised entities to displaCy's
+ | format and set #[code manual=True] on either #[code render()] or
+ | #[code serve()].
+
++aside-code("Example").
+ ex = [{'text': 'But Google is starting from behind.',
+ 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
+ 'title': None}]
+ html = displacy.render(ex, style='ent', manual=True)
+
++code("DEP input").
+ {
+ 'words': [
+ {'text': 'This', 'tag': 'DT'},
+ {'text': 'is', 'tag': 'VBZ'},
+ {'text': 'a', 'tag': 'DT'},
+ {'text': 'sentence', 'tag': 'NN'}],
+ 'arcs': [
+ {'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
+ {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
+ {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
+ }
+
++code("ENT input").
+ {
+ 'text': 'But Google is starting from behind.',
+ 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
+ 'title': None
+ }
+
++h(3, "webapp") Using displaCy in a web application
+
+p
+ | If you want to use the visualizers as part of a web application, for
+ | example to create something like our
+ | #[+a(DEMOS_URL + "/displacy") online demo], it's not recommended to
+ | simply wrap and serve the displaCy renderer. Instead, you should only
+ | rely on the server to perform spaCy's processing capabilities, and use
+ | #[+a(gh("displacy")) displaCy.js] to render the JSON-formatted output.
+
++aside("Why not return the HTML by the server?")
+ | It's certainly possible to just have your server return the markup.
+ | But outputting raw, unsanitised HTML is risky and makes your app vulnerable to
+ | #[+a("https://en.wikipedia.org/wiki/Cross-site_scripting") cross-site scripting]
+ | (XSS). All your user needs to do is find a way to make spaCy return text
+    | like #[code &lt;script src="malicious-code.js"&gt;&lt;/script&gt;], which
+ | is pretty easy in NER mode. Instead of relying on the server to render
+ | and sanitise HTML, you can do this on the client in JavaScript.
+ | displaCy.js creates the markup as DOM nodes and will never insert raw
+ | HTML.
+
+p
+ | The #[code parse_deps] function takes a #[code Doc] object and returns
+ | a dictionary in a format that can be rendered by displaCy.
+
++code("Example").
+ import spacy
+ from spacy import displacy
+
+ nlp = spacy.load('en')
+
+ def displacy_service(text):
+ doc = nlp(text)
+ return displacy.parse_deps(doc)
+
+p
+ | Using a library like #[+a("https://falconframework.org/") Falcon] or
+ | #[+a("http://www.hug.rest/") Hug], you can easily turn the above code
+ | into a simple REST API that receives a text and returns a JSON-formatted
+ | parse. In your front-end, include #[+a(gh("displacy")) displacy.js] and
+ | initialise it with the API URL and the ID or query selector of the
+ | container to render the visualisation in, e.g. #[code '#displacy'] for
+ | #[code <div id="displacy">].
+
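+p
+    | As a rough illustration, a #[+a("http://www.hug.rest/") Hug] endpoint
+    | wrapping the function above might look like this (the route name is an
+    | assumption, not part of spaCy):
+
++code("Hypothetical Hug endpoint").
+    import hug
+
+    @hug.get('/dep')
+    def dep(text: str):
+        # returns a dict, which Hug serialises to JSON
+        return displacy_service(text)
+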
++code("script.js", "javascript").
+ var displacy = new displaCy('http://localhost:8080', {
+ container: '#displacy'
+ })
+
+ function parse(text) {
+ displacy.parse(text);
+ }
+
+p
+ | When you call #[code parse()], it will make a request to your API,
+ | receive the JSON-formatted parse and render it in your container. To
+ | create an interactive experience, you could trigger this function by
+ | a button and read the text from an #[code <input>] field.
diff --git a/website/usage/_visualizers/_jupyter.jade b/website/usage/_visualizers/_jupyter.jade
new file mode 100644
index 000000000..f7227e4d1
--- /dev/null
+++ b/website/usage/_visualizers/_jupyter.jade
@@ -0,0 +1,36 @@
+//- π« DOCS > USAGE > VISUALIZERS > JUPYTER
+
+p
+ | displaCy is able to detect whether you're working in a
+ | #[+a("https://jupyter.org") Jupyter] notebook, and will return markup
+ | that can be rendered in a cell straight away. When you export your
+ | notebook, the visualizations will be included as HTML.
+
++code("Jupyter Example").
+ # don't forget to install a model, e.g.: spacy download en
+    import spacy
+    from spacy import displacy
+
+    nlp = spacy.load('en')
+
+ doc = nlp(u'Rats are various medium-sized, long-tailed rodents.')
+ displacy.render(doc, style='dep')
+
+ doc2 = nlp(LONG_NEWS_ARTICLE)
+ displacy.render(doc2, style='ent')
+
++aside("Enabling or disabling Jupyter mode")
+ | To explicitly enable or disable "Jupyter mode", you can use the
+ | #[code jupyter] keyword argument β e.g. to return raw HTML in a notebook,
+ | or to force Jupyter rendering if auto-detection fails.
+
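+p
+    | For example, to get the raw markup inside a notebook instead of the
+    | rendered visualization (a minimal sketch):
+
++code.
+    html = displacy.render(doc, style='dep', jupyter=False)
+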
++image("/assets/img/displacy_jupyter.jpg", 700, false, "Example of using the displaCy dependency and named entity visualizer in a Jupyter notebook")
+
+p
+ | Internally, displaCy imports #[code display] and #[code HTML] from
+ | #[code IPython.core.display] and returns a Jupyter HTML object. If you
+ | were doing it manually, it'd look like this:
+
++code.
+ from IPython.core.display import display, HTML
+
+ html = displacy.render(doc, style='dep')
+ return display(HTML(html))
diff --git a/website/usage/adding-languages.jade b/website/usage/adding-languages.jade
new file mode 100644
index 000000000..0690c8738
--- /dev/null
+++ b/website/usage/adding-languages.jade
@@ -0,0 +1,59 @@
+//- π« DOCS > USAGE > ADDING LANGUAGES
+
+include ../_includes/_mixins
+
++aside("Working on spaCy's source")
+ | To add a new language to spaCy, you'll need to
+ | #[strong modify the library's code]. The easiest way to do this is to
+ | clone the #[+src(gh("spaCy")) repository] and #[strong build spaCy from source].
+ | For more information on this, see the #[+a("/usage") installation guide].
+ | Unlike spaCy's core, which is mostly written in Cython, all language
+ | data is stored in regular Python files. This means that you won't have to
+ | rebuild anything in between β you can simply make edits and reload spaCy
+ | to test them.
+
++grid.o-no-block
+ +grid-col("half")
+ p
+ | Obviously, there are lots of ways you can organise your code when
+ | you implement your own language data. This guide will focus on
+ | how it's done within spaCy. For full language support, you'll
+ | need to create a #[code Language] subclass, define custom
+ | #[strong language data], like a stop list and tokenizer
+ | exceptions and test the new tokenizer. Once the language is set
+ | up, you can #[strong build the vocabulary], including word
+ | frequencies, Brown clusters and word vectors. Finally, you can
+ | #[strong train the tagger and parser], and save the model to a
+ | directory.
+
+ p
+ | For some languages, you may also want to develop a solution for
+ | lemmatization and morphological analysis.
+
+ +table-of-contents
+ +item #[+a("#101") Language data 101]
+ +item #[+a("#language-subclass") The Language subclass]
+ +item #[+a("#stop-words") Stop words]
+ +item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
+ +item #[+a("#norm-exceptions") Norm exceptions]
+ +item #[+a("#lex-attrs") Lexical attributes]
+ +item #[+a("#syntax-iterators") Syntax iterators]
+ +item #[+a("#lemmatizer") Lemmatizer]
+ +item #[+a("#tag-map") Tag map]
+ +item #[+a("#morph-rules") Morph rules]
+ +item #[+a("#testing") Testing the language]
+ +item #[+a("#vocabulary") Building the vocabulary]
+ +item #[+a("#training") Training]
+
++section("language-data")
+ +h(2, "language-data") Language data
+ include _spacy-101/_language-data
+ include _adding-languages/_language-data
+
++section("testing")
+ +h(2, "testing") Testing the new language
+ include _adding-languages/_testing
+
++section("training")
+ +h(2, "training") Training a language model
+ include _adding-languages/_training
diff --git a/website/usage/deep-learning.jade b/website/usage/deep-learning.jade
new file mode 100644
index 000000000..4c33c0572
--- /dev/null
+++ b/website/usage/deep-learning.jade
@@ -0,0 +1,29 @@
+//- π« DOCS > USAGE > DEEP LEARNING
+
+include ../_includes/_mixins
++section
+ +under-construction
+
++section("pre-processing")
+ +h(2, "pre-processing") Pre-processing text for deep learning
+ include _deep-learning/_pre-processing
+
++section("thinc")
+ +h(2, "thinc") spaCy and Thinc
+ include _deep-learning/_thinc
+
++section("tensorflow-keras")
+ +h(2, "tensorflow-keras") Using spaCy with TensorFlow / Keras
+ include _deep-learning/_tensorflow-keras
+
++section("scikit-learn")
+ +h(2, "scikit-learn") Using spaCy with scikit-learn
+ include _deep-learning/_scikit-learn
+
++section("pytorch")
+ +h(2, "pytorch") Using spaCy with PyTorch
+ include _deep-learning/_pytorch
+
++section("dynet")
+ +h(2, "dynet") Using spaCy with DyNet
+ include _deep-learning/_dynet
diff --git a/website/usage/examples.jade b/website/usage/examples.jade
new file mode 100644
index 000000000..75d05e339
--- /dev/null
+++ b/website/usage/examples.jade
@@ -0,0 +1,73 @@
+//- π« DOCS > USAGE > EXAMPLES
+
+include ../_includes/_mixins
+
++section("matching")
+ +h(3, "matcher") Using spaCy's rule-based matcher
+
+ p
+ | This example shows how to use spaCy's rule-based
+ | #[+api("matcher") #[code Matcher]] to find and label entities across
+ | documents.
+
+ +github("spacy", "examples/matcher_example.py")
+
+ +h(3, "phrase-matcher") Using spaCy's phrase matcher
+ +tag-new(2)
+
+ p
+ | This example shows how to use the new
+ | #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find
+ | entities from a large terminology list.
+
+ +github("spacy", "examples/phrase_matcher.py")
+
++section("training")
+ +h(3, "new-entity-type") Training an additional entity type
+
+ p
+ | This script shows how to add a new entity type to an existing
+ | pre-trained NER model. To keep the example short and simple, only
+ | four sentences are provided as examples. In practice, you'll need
+ | many more β a few hundred would be a good start.
+
+ +github("spacy", "examples/training/train_new_entity_type.py")
+
+ +h(3, "ner-standalone") Training an NER system from scratch
+
+ p
+ | This example is written to be self-contained and reasonably
+ | transparent. To achieve that, it duplicates some of spaCy's internal
+ | functionality.
+
+ +github("spacy", "examples/training/train_ner_standalone.py")
+
+ +h(3, "textcat") Training spaCy's text classifier
+ +tag-new(2)
+
+ p
+ | This example shows how to use and train spaCy's new
+ | #[+api("textcategorizer") #[code TextCategorizer]] pipeline component
+ | on IMDB movie reviews.
+
+ +github("spacy", "examples/training/train_textcat.py")
+
++section("deep-learning")
+ +h(3, "keras") Text classification with Keras
+
+ p
+ | In this example, we're using spaCy to pre-process text for use with
+ | a #[+a("https://keras.io") Keras] text classification model.
+
+ +github("spacy", "examples/deep_learning_keras.py")
+
+ +h(3, "keras-parikh-entailment") A decomposable attention model for Natural Language Inference
+
+ p
+ | This example contains an implementation of the entailment prediction
+ | model described by #[+a("https://arxiv.org/pdf/1606.01933.pdf") Parikh et al. (2016)].
+ | The model is notable for its competitive performance with very few
+ | parameters, and was implemented using #[+a("https://keras.io") Keras]
+ | and spaCy.
+
+ +github("spacy", "examples/keras_parikh_entailment/__main__.py", "examples/keras_parikh_entailment")
diff --git a/website/usage/facts-figures.jade b/website/usage/facts-figures.jade
new file mode 100644
index 000000000..b6a548121
--- /dev/null
+++ b/website/usage/facts-figures.jade
@@ -0,0 +1,32 @@
+//- π« DOCS > USAGE > FACTS & FIGURES
+
+include ../_includes/_mixins
+
++section("comparison")
+ +h(2, "comparison") Feature comparison
+ include _facts-figures/_feature-comparison
+
++section("benchmarks")
+ +h(2, "benchmarks") Benchmarks
+ include _facts-figures/_benchmarks
+
+
++section("powered-by")
+ +h(2, "powered-by") Powered by spaCy
+
+ p
+ | Here's an overview of other tools and libraries that are using spaCy
+ | behind the scenes.
+
+ +grid
+ +card("torchtext", "https://github.com/pytorch/text", "PyTorch", "github")
+ | PyTorch's NLP datasets and loaders use spaCy for pre-processing
+ | and tokenization.
+
+ +card("allennlp", "https://github.com/allenai/allennlp", "Allen Institute for Artificial Intelligence", "github")
+ | The open-source NLP research library based on PyTorch uses spaCy
+ | for pre-processing and tokenization.
+
++section("other-libraries")
+ +h(2, "other-libraries") spaCy and other libraries
+ include _facts-figures/_other-libraries
diff --git a/website/usage/index.jade b/website/usage/index.jade
new file mode 100644
index 000000000..495a9863b
--- /dev/null
+++ b/website/usage/index.jade
@@ -0,0 +1,27 @@
+//- π« DOCS > USAGE
+
+include ../_includes/_mixins
+
+p
+    | spaCy is compatible with #[strong 64-bit CPython 2.6+/3.3+] and
+ | runs on #[strong Unix/Linux], #[strong macOS/OS X] and
+ | #[strong Windows]. The latest spaCy releases are
+ | available over #[+a("https://pypi.python.org/pypi/spacy") pip] (source
+ | packages only) and #[+a("https://anaconda.org/conda-forge/spacy") conda].
+ | Installation requires a working build environment. See notes on
+ | #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X]
+ | and #[a(href="#source-windows") Windows] for details.
+
++section("quickstart")
+ include _install/_quickstart
+
++section("instructions")
+ +h(2, "installation") Installation instructions
+ include _install/_instructions
+
++section("troubleshooting")
+ +h(2, "troubleshooting") Troubleshooting guide
+ include _install/_troubleshooting
+
++section("changelog")
+ include _install/_changelog
diff --git a/website/usage/linguistic-features.jade b/website/usage/linguistic-features.jade
new file mode 100644
index 000000000..ef8783471
--- /dev/null
+++ b/website/usage/linguistic-features.jade
@@ -0,0 +1,38 @@
+//- π« DOCS > USAGE > LINGUISTIC FEATURES
+
+include ../_includes/_mixins
+
+p
+ | Processing raw text intelligently is difficult: most words are rare, and
+ | it's common for words that look completely different to mean almost the
+ | same thing. The same words in a different order can mean something
+ | completely different. Even splitting text into useful word-like units can
+ | be difficult in many languages. While it's possible to solve some
+ | problems starting from only the raw characters, it's usually better to
+ | use linguistic knowledge to add useful information. That's exactly what
+ | spaCy is designed to do: you put in raw text, and get back a
+    | #[+api("doc") #[code Doc]] object that comes with a variety of
+ | annotations.
+
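+p
+    | For example (a minimal sketch, assuming the default English model is
+    | installed):
+
++code.
+    import spacy
+
+    nlp = spacy.load('en')
+    doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
+    for token in doc:
+        # each token comes with its own linguistic annotations
+        print(token.text, token.pos_, token.dep_, token.ent_type_)
+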
++section("pos-tagging")
+ +h(2, "pos-tagging") Part-of-speech tagging
+ +tag-model("tagger", "dependency parse")
+ include _linguistic-features/_pos-tagging
+
++section("dependency-parse")
+ +h(2, "dependency-parse") Dependency parsing
+ +tag-model("dependency parse")
+ include _linguistic-features/_dependency-parse
+
++section("named-entities")
+ +h(2, "named-entities") Named Entities
+ +tag-model("named entities")
+ include _linguistic-features/_named-entities
+
++section("tokenization")
+ +h(2, "tokenization") Tokenization
+ include _linguistic-features/_tokenization
+
++section("rule-based-matching")
+ +h(2, "rule-based-matching") Rule-based matching
+ include _linguistic-features/_rule-based-matching
diff --git a/website/usage/models.jade b/website/usage/models.jade
new file mode 100644
index 000000000..11a0901f4
--- /dev/null
+++ b/website/usage/models.jade
@@ -0,0 +1,37 @@
+//- 💫 DOCS > USAGE > MODELS
+
+include ../_includes/_mixins
+
+p
+ | As of v1.7.0, models for spaCy can be installed as #[strong Python packages].
+ | This means that they're a component of your application, just like any
+ | other module. They're versioned and can be defined as a dependency in your
+ | #[code requirements.txt]. Models can be installed from a download URL or
+ | a local directory, manually or via #[+a("https://pypi.python.org/pypi/pip") pip].
+ | Their data can be located anywhere on your file system.
+
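+p
+    | For example (a small sketch; the package name below is only an
+    | illustration and assumed to be installed), loading a model package works
+    | just like loading its data from a path on your file system:
+
++code.
+    import spacy
+
+    # load an installed model package or shortcut link by name ...
+    nlp = spacy.load('en_core_web_sm')
+    # ... or point spacy.load() at the model data on your file system
+    nlp = spacy.load('/path/to/en_core_web_sm')
+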
++aside("Important note")
+ | If you're upgrading to spaCy v1.7.x or v2.x, you need to
+ | #[strong download the new models]. If you've trained statistical models
+ | that use spaCy's annotations, you should #[strong retrain your models]
+ | after updating spaCy. If you don't retrain, you may suffer train/test
+ | skew, which might decrease your accuracy.
+
++section("quickstart")
+ include _models/_quickstart
+
++section("available")
+ +h(2, "available") Available models
+ include _models/_available-models
+
++section("install")
+ +h(2, "download") Installing and using models
+ include _models/_install
+
++section("languages")
+ +h(2, "languages") Language support
+ include _models/_languages
+
++section("production")
+ +h(2, "production") Using models in production
+ include _models/_production
diff --git a/website/usage/processing-pipelines.jade b/website/usage/processing-pipelines.jade
new file mode 100644
index 000000000..0bb96780e
--- /dev/null
+++ b/website/usage/processing-pipelines.jade
@@ -0,0 +1,25 @@
+//- 💫 DOCS > USAGE > PIPELINE
+
+include ../_includes/_mixins
+
+include _spacy-101/_pipelines
+
++section("pipelines")
+ +h(2, "pipelines") How pipelines work
+ include _processing-pipelines/_pipelines
+
++section("examples")
+ +h(2, "examples") Examples
+ include _processing-pipelines/_examples
+
++section("multithreading")
+ +h(2, "multithreading") Multi-threading
+ include _processing-pipelines/_multithreading
+
++section("user-hooks")
+ +h(2, "user-hooks") User hooks
+ include _processing-pipelines/_user-hooks
+
++section("serialization")
+ +h(2, "serialization") Serialization
+ include _processing-pipelines/_serialization
diff --git a/website/usage/resources.jade b/website/usage/resources.jade
new file mode 100644
index 000000000..33a2a45aa
--- /dev/null
+++ b/website/usage/resources.jade
@@ -0,0 +1,125 @@
+//- 💫 DOCS > USAGE > RESOURCES
+
+include ../_includes/_mixins
+
++aside("Contribute to this page")
+ | Have you built something cool with spaCy or come across a paper, book or
+ | course that should be featured here?
+ | #[a(href="mailto:#{EMAIL}") Let us know!]
+
++section("libraries")
+ +h(2, "libraries") Third-party libraries
+
+ +grid
+ +card("neuralcoref", "https://github.com/huggingface/neuralcoref", "Hugging Face", "github")
+ | State-of-the-art coreference resolution based on neural nets
+            | and spaCy.
+
+ +card("rasa_nlu", "https://github.com/golastmile/rasa_nlu", "LastMile", "github")
+ | High level APIs for building your own language parser using
+ | existing NLP and ML libraries.
+
+ +card("textacy", "https://github.com/chartbeat-labs/textacy", "Burton DeWilde", "github")
+ | Higher-level NLP built on spaCy.
+
+ +card("spacyr", "https://github.com/kbenoit/spacyr", "Kenneth Benoit", "github")
+ | An R wrapper for spaCy.
+
+ +card("spacy_api", "https://github.com/kootenpv/spacy_api", "Pascal van Kooten", "github")
+ | Server/client to load models in a separate, dedicated process.
+
+ +card("spacy-api-docker", "https://github.com/jgontrum/spacy-api-docker", "Johannes Gontrum", "github")
+ | spaCy accessed by a REST API, wrapped in a Docker container.
+
+ +card("spacy-nlp-zeromq", "https://github.com/pasupulaphani/spacy-nlp-docker", "Phaninder Pasupula", "github")
+ | Docker image exposing spaCy with ZeroMQ bindings.
+
+ +card("spacy-nlp", "https://github.com/kengz/spacy-nlp", "Wah Loon Keng", "github")
+ | Expose spaCy NLP text parsing to Node.js (and other languages)
+ | via Socket.IO.
+
+ .u-text-right
+ +button("https://github.com/search?o=desc&q=spacy&s=stars&type=Repositories&utf8=%E2%9C%93", false, "primary", "small") See more projects on GitHub
+
++section("demos")
+ +h(2, "demos") Demos & Visualizations
+
+ +grid
+ +card("Neural coref", "https://huggingface.co/coref/", "Hugging Face")
+ +image("/assets/img/resources/neuralcoref.jpg").o-block-small
+ | State-of-the-art coreference resolution based on neural nets
+ | and spaCy.
+
+ +card("sense2vec", "https://demos.explosion.ai/sense2vec", "Matthew Honnibal and Ines Montani")
+ +image("/assets/img/resources/sense2vec.jpg").o-block-small
+ | Semantic analysis of the Reddit hivemind using sense2vec and spaCy.
+
+ +card("displaCy", "https://demos.explosion.ai/displacy", "Ines Montani")
+ +image("/assets/img/resources/displacy.jpg").o-block-small
+ | An open-source NLP visualiser for the modern web.
+
+ +card("displaCy ENT", "https://demos.explosion.ai/displacy-ent", "Ines Montani")
+ +image("/assets/img/resources/displacy-ent.jpg").o-block-small
+ | An open-source named entity visualiser for the modern web.
+
++section("books")
+ +h(2, "books") Books & Courses
+
+ +grid
+ +card("Natural Language Processing Fundamentals in Python", "https://www.datacamp.com/courses/natural-language-processing-fundamentals-in-python", "Katharine Jarmul (Datacamp, 2017)", "course")
+ | An interactive online course on everything you need to know about
+ | Natural Language Processing in Python, featuring spaCy and NLTK.
+
+ +card("Introduction to Machine Learning with Python: A Guide for Data Scientists", "https://books.google.com/books?id=vbQlDQAAQBAJ", "Andreas C. MΓΌller and Sarah Guido (O'Reilly, 2016)", "book")
+ | Andreas is a lead developer of Scikit-Learn, and Sarah is a lead
+ | data scientist at Mashable. We're proud to get a mention.
+
+ +card("Text Analytics with Python", "https://www.amazon.com/Text-Analytics-Python-Real-World-Actionable/dp/148422387X", "Dipanjan Sarkar (Apress / Springer, 2016)", "book")
+ | A Practical Real-World Approach to Gaining Actionable Insights
+ | from your Data
+
++section("notebooks")
+ +h(2, "notebooks") Jupyter notebooks
+
+ +grid
+ +card("Modern NLP in Python", gh("spacy-notebooks", "notebooks/conference_notebooks/modern_nlp_in_python.ipynb"), "Patrick Harrison", "jupyter")
+ | Introduction to NLP in Python using spaCy and Gensim. Presented
+ | at PyData DC 2016.
+
+ +card("Advanced Text Analysis", gh("spacy-notebooks", "notebooks/conference_notebooks/advanced_text_analysis.ipynb"), "Jonathan Reeve", "jupyter")
+ | Advanced Text Analysis with spaCy and Scikit-Learn. Presented at
+ | NYU during NYCDH Week 2017.
+
+ .u-text-right
+ +button(gh("spacy-notebooks"), false, "primary", "small") See more notebooks on GitHub
+
++section("research")
+ +h(2, "research") Research systems
+
+    p
+        | Researchers are using spaCy to build ambitious, next-generation text
+        | processing technologies. spaCy is particularly popular amongst the
+        | biomedical NLP community, who are working on extracting knowledge from
+        | the huge volume of literature in their field.
+
+ +grid
+ +card(false, "https://www.semanticscholar.org/paper/Choosing-an-NLP-Library-for-Analyzing-Software-Doc-Omran-Treude/72f280e47e91b30af24205fa24d53247605aa591", "Fouad Nasser A. Al Omran et al. (2017)", "book", "third")
+ | Choosing an NLP Library for Analyzing Software Documentation: A
+ | Systematic Literature Review and a Series of Experiments
+
+ +card(false, "https://www.semanticscholar.org/paper/Mixing-Dirichlet-Topic-Models-and-Word-Embeddings-Moody/bf8116e06f7b498c6abfbf97aeb67d0838c08609", "Christopher E. Moody (2016)", "book", "third")
+ | Mixing Dirichlet Topic Models and Word Embeddings to Make lda2vec
+
+ +card(false, "https://www.semanticscholar.org/paper/Refactoring-the-Genia-Event-Extraction-Shared-Task-Kim-Wang/06d94b64a7bd2d3433f57caddad5084435d6a91f", "Jin-Dong Kim et al. (2016)", "book", "third")
+ | Refactoring the Genia Event Extraction Shared Task Toward a
+ | General Framework for IE-Driven KB Development
+
+ +card(false, "https://www.semanticscholar.org/paper/Predicting-Pre-click-Quality-for-Native-Zhou-Redi/564985430ff2fbc3a9daa9c2af8997b7f5046da8", "Ke Zhou et al. (2016)", "book", "third")
+ | Predicting Pre-click Quality for Native Advertisements
+
+ +card(false, "https://www.semanticscholar.org/paper/Threat-detection-in-online-discussions-Wester-%C3%98vrelid/f4150e2fb4d8646ebc2ea84f1a86afa1b593239b", "Aksel Wester et al. (2016)", "book", "third")
+ | Threat detection in online discussions
+
+ +card(false, "https://www.semanticscholar.org/paper/Distributional-semantics-for-understanding-spoken-Korpusik-Huang/5f55c5535e80d3e5ed7f1f0b89531e32725faff5", "Mandy Korpusik et al. (2016)", "book", "third")
+ | Distributional semantics for understanding spoken meal
+ | descriptions
+
+ .u-text-right
+ +button("https://scholar.google.com/scholar?scisbd=2&q=spacy&hl=en&as_sdt=1,5&as_vis=1", false, "primary", "small")
+ | See 200+ papers on Google Scholar
diff --git a/website/usage/spacy-101.jade b/website/usage/spacy-101.jade
new file mode 100644
index 000000000..3b75202f7
--- /dev/null
+++ b/website/usage/spacy-101.jade
@@ -0,0 +1,300 @@
+//- 💫 DOCS > USAGE > SPACY 101
+
+include ../_includes/_mixins
+
+p
+ | Whether you're new to spaCy, or just want to brush up on some
+    | NLP basics and implementation details – this page should have you covered.
+ | Each section will explain one of spaCy's features in simple terms and
+ | with examples or illustrations. Some sections will also reappear across
+ | the usage guides as a quick introduction.
+
++aside("Help us improve the docs")
+ | Did you spot a mistake or come across explanations that
+ | are unclear? We always appreciate improvement
+ | #[+a(gh("spaCy") + "/issues") suggestions] or
+ | #[+a(gh("spaCy") + "/pulls") pull requests]. You can find a "Suggest
+ | edits" link at the bottom of each page that points you to the source.
+
++h(2, "whats-spacy") What's spaCy?
+
++grid.o-no-block
+ +grid-col("half")
+ p
+ | spaCy is a #[strong free, open-source library] for advanced
+ | #[strong Natural Language Processing] (NLP) in Python.
+
+ p
+ | If you're working with a lot of text, you'll eventually want to
+ | know more about it. For example, what's it about? What do the
+ | words mean in context? Who is doing what to whom? What companies
+ | and products are mentioned? Which texts are similar to each other?
+
+ p
+ | spaCy is designed specifically for #[strong production use] and
+ | helps you build applications that process and "understand"
+ | large volumes of text. It can be used to build
+ | #[strong information extraction] or
+ | #[strong natural language understanding] systems, or to
+ | pre-process text for #[strong deep learning].
+
+ +table-of-contents
+ +item #[+a("#features") Features]
+ +item #[+a("#annotations") Linguistic annotations]
+ +item #[+a("#annotations-token") Tokenization]
+ +item #[+a("#annotations-pos-deps") POS tags and dependencies]
+ +item #[+a("#annotations-ner") Named entities]
+ +item #[+a("#vectors-similarity") Word vectors and similarity]
+ +item #[+a("#pipelines") Pipelines]
+ +item #[+a("#vocab") Vocab, hashes and lexemes]
+ +item #[+a("#serialization") Serialization]
+ +item #[+a("#training") Training]
+ +item #[+a("#language-data") Language data]
+ +item #[+a("#lightning-tour") Lightning tour]
+ +item #[+a("#architecture") Architecture]
+ +item #[+a("#community") Community & FAQ]
+
++h(3, "what-spacy-isnt") What spaCy isn't
+
++list
+ +item #[strong spaCy is not a platform or "an API"].
+        | Unlike a platform, spaCy does not provide software as a service, or
+ | a web application. It's an open-source library designed to help you
+ | build NLP applications, not a consumable service.
+ +item #[strong spaCy is not an out-of-the-box chat bot engine].
+ | While spaCy can be used to power conversational applications, it's
+ | not designed specifically for chat bots, and only provides the
+ | underlying text processing capabilities.
+ +item #[strong spaCy is not research software].
+ | It's built on the latest research, but it's designed to get
+ | things done. This leads to fairly different design decisions than
+        | #[+a("https://github.com/nltk/nltk") NLTK]
+ | or #[+a("https://stanfordnlp.github.io/CoreNLP/") CoreNLP], which were
+ | created as platforms for teaching and research. The main difference
+ | is that spaCy is integrated and opinionated. spaCy tries to avoid asking
+ | the user to choose between multiple algorithms that deliver equivalent
+ | functionality. Keeping the menu small lets spaCy deliver generally better
+ | performance and developer experience.
+ +item #[strong spaCy is not a company].
+ | It's an open-source library. Our company publishing spaCy and other
+ | software is called #[+a(COMPANY_URL, true) Explosion AI].
+
++section("features")
+ +h(2, "features") Features
+
+ p
+ | In the documentation, you'll come across mentions of spaCy's
+ | features and capabilities. Some of them refer to linguistic concepts,
+ | while others are related to more general machine learning
+ | functionality.
+
+ +aside
+ | If one of spaCy's functionalities #[strong needs a model], it means
+ | that you need to have one of the available
+ | #[+a("/models") statistical models] installed. Models are used
+        | to #[strong predict] linguistic annotations – for example, if a word
+ | is a verb or a noun.
+
+ +table(["Name", "Description", "Needs model"])
+ +row
+ +cell #[strong Tokenization]
+            +cell Segmenting text into words, punctuation marks etc.
+ +cell #[+procon("con")]
+
+ +row
+ +cell #[strong Part-of-speech] (POS) #[strong Tagging]
+ +cell Assigning word types to tokens, like verb or noun.
+ +cell #[+procon("pro")]
+
+ +row
+ +cell #[strong Dependency Parsing]
+ +cell
+ | Assigning syntactic dependency labels, describing the
+ | relations between individual tokens, like subject or object.
+ +cell #[+procon("pro")]
+
+ +row
+ +cell #[strong Lemmatization]
+ +cell
+ | Assigning the base forms of words. For example, the lemma of
+ | "was" is "be", and the lemma of "rats" is "rat".
+ +cell #[+procon("pro")]
+
+ +row
+ +cell #[strong Sentence Boundary Detection] (SBD)
+ +cell Finding and segmenting individual sentences.
+ +cell #[+procon("pro")]
+
+ +row
+            +cell #[strong Named Entity Recognition] (NER)
+ +cell
+ | Labelling named "real-world" objects, like persons, companies
+ | or locations.
+ +cell #[+procon("pro")]
+
+ +row
+ +cell #[strong Similarity]
+ +cell
+ | Comparing words, text spans and documents and how similar
+ | they are to each other.
+ +cell #[+procon("pro")]
+
+ +row
+ +cell #[strong Text Classification]
+ +cell
+ | Assigning categories or labels to a whole document, or parts
+ | of a document.
+ +cell #[+procon("pro")]
+
+ +row
+ +cell #[strong Rule-based Matching]
+ +cell
+ | Finding sequences of tokens based on their texts and
+ | linguistic annotations, similar to regular expressions.
+ +cell #[+procon("con")]
+
+ +row
+ +cell #[strong Training]
+ +cell Updating and improving a statistical model's predictions.
+ +cell #[+procon("neutral")]
+
+ +row
+ +cell #[strong Serialization]
+ +cell Saving objects to files or byte strings.
+ +cell #[+procon("neutral")]
+
+ +h(2, "annotations") Linguistic annotations
+
+ p
+ | spaCy provides a variety of linguistic annotations to give you
+ | #[strong insights into a text's grammatical structure]. This
+ | includes the word types, like the parts of speech, and how the words
+ | are related to each other. For example, if you're analysing text, it
+ | makes a huge difference whether a noun is the subject of a sentence,
+        | or the object – or whether "google" is used as a verb, or refers to
+ | the website or company in a specific context.
+
+ p
+ | Once you've downloaded and installed a #[+a("/usage/models") model],
+ | you can load it via #[+api("spacy#load") #[code spacy.load()]]. This will
+        | return a #[code Language] object containing all components and data needed
+ | to process text. We usually call it #[code nlp]. Calling the #[code nlp]
+ | object on a string of text will return a processed #[code Doc]:
+
+ +code.
+ import spacy
+
+ nlp = spacy.load('en')
+ doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
+
+ p
+        | Even though a #[code Doc] is processed – e.g. split into individual words
+        | and annotated – it still holds #[strong all information of the original text],
+ | like whitespace characters. You can always get the offset of a token into the
+ | original string, or reconstruct the original by joining the tokens and their
+ | trailing whitespace. This way, you'll never lose any information
+ | when processing text with spaCy.
+
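+    p
+        | For example, the following small sketch (assuming the #[code nlp]
+        | object from above) shows each token's character offset and
+        | reconstructs the original text from the tokens and their trailing
+        | whitespace:
+
+    +code.
+        doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
+        # each token knows its character offset into the original string
+        print([(token.text, token.idx) for token in doc])
+        # joining each token's text_with_ws restores the original text exactly
+        assert ''.join(token.text_with_ws for token in doc) == doc.text
+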
+ +h(3, "annotations-token") Tokenization
+
+ include _spacy-101/_tokenization
+
+ +infobox
+ | To learn more about how spaCy's tokenization rules work in detail,
+ | how to #[strong customise and replace] the default tokenizer and how to
+ | #[strong add language-specific data], see the usage guides on
+ | #[+a("/usage/adding-languages") adding languages] and
+ | #[+a("/usage/linguistic-features#tokenization") customising the tokenizer].
+
+ +h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
+ +tag-model("dependency parse")
+
+ include _spacy-101/_pos-deps
+
+ +infobox
+ | To learn more about #[strong part-of-speech tagging] and rule-based
+ | morphology, and how to #[strong navigate and use the parse tree]
+ | effectively, see the usage guides on
+ | #[+a("/usage/linguistic-features#pos-tagging") part-of-speech tagging] and
+ | #[+a("/usage/linguistic-features#dependency-parse") using the dependency parse].
+
+ +h(3, "annotations-ner") Named Entities
+ +tag-model("named entities")
+
+ include _spacy-101/_named-entities
+
+ +infobox
+ | To learn more about entity recognition in spaCy, how to
+ | #[strong add your own entities] to a document and how to
+ | #[strong train and update] the entity predictions of a model, see the
+ | usage guides on
+ | #[+a("/usage/linguistic-features#named-entities") named entity recognition] and
+ | #[+a("/usage/training#ner") training the named entity recognizer].
+
+ +h(2, "vectors-similarity") Word vectors and similarity
+ +tag-model("vectors")
+
+ include _spacy-101/_similarity
+
+ include _spacy-101/_word-vectors
+
+ +infobox
+ | To learn more about word vectors, how to #[strong customise them] and
+ | how to load #[strong your own vectors] into spaCy, see the usage
+ | guide on
+ | #[+a("/usage/vectors-similarity") using word vectors and semantic similarities].
+
+ +h(2, "pipelines") Pipelines
+
+ include _spacy-101/_pipelines
+
+ +infobox
+ | To learn more about #[strong how processing pipelines work] in detail,
+ | how to enable and disable their components, and how to
+ | #[strong create your own], see the usage guide on
+ | #[+a("/usage/processing-pipelines") language processing pipelines].
+
+ +h(2, "vocab") Vocab, hashes and lexemes
+
+ include _spacy-101/_vocab
+
+ +h(2, "serialization") Serialization
+
+ include _spacy-101/_serialization
+
+ +infobox
+ | To learn more about how to #[strong save and load your own models],
+ | see the usage guide on
+ | #[+a("/usage/training#saving-loading") saving and loading].
+
+ +h(2, "training") Training
+
+ include _spacy-101/_training
+
+ +infobox
+ | To learn more about #[strong training and updating] models, how to create
+ | training data and how to improve spaCy's named entity recognition models,
+ | see the usage guides on #[+a("/usage/training") training].
+
+ +h(2, "language-data") Language data
+
+ include _spacy-101/_language-data
+
+ +infobox
+ | To learn more about the individual components of the language data and
+ | how to #[strong add a new language] to spaCy in preparation for training
+ | a language model, see the usage guide on
+ | #[+a("/usage/adding-languages") adding languages].
+
+
++section("lightning-tour")
+ +h(2, "lightning-tour") Lightning tour
+ include _spacy-101/_lightning-tour
+
++section("architecture")
+ +h(2, "architecture") Architecture
+ include _spacy-101/_architecture
+
++section("community-faq")
+ +h(2, "community") Community & FAQ
+ include _spacy-101/_community-faq
diff --git a/website/usage/text-classification.jade b/website/usage/text-classification.jade
new file mode 100644
index 000000000..8a0e93450
--- /dev/null
+++ b/website/usage/text-classification.jade
@@ -0,0 +1,9 @@
+//- 💫 DOCS > USAGE > TEXT CLASSIFICATION
+
+include ../_includes/_mixins
+
++under-construction
+
++h(2, "example") Example
+
++github("spacy", "examples/training/train_textcat.py")
diff --git a/website/usage/training.jade b/website/usage/training.jade
new file mode 100644
index 000000000..8f15668c4
--- /dev/null
+++ b/website/usage/training.jade
@@ -0,0 +1,33 @@
+//- 💫 DOCS > USAGE > TRAINING
+
+include ../_includes/_mixins
+
+p
+ | This guide describes how to train new statistical models for spaCy's
+ | part-of-speech tagger, named entity recognizer and dependency parser.
+ | Once the model is trained, you can then
+ | #[+a("/usage/models#saving-loading") save and load] it.
+
++section("basics")
+ +h(2, "basics") Training basics
+ include _training/_basics
+
++section("ner")
+ +h(2, "ner") Training the named entity recognizer
+ include _training/_ner
+
++section("tagger-parser")
+ +h(2, "tagger-parser") Training the tagger and parser
+ include _training/_tagger-parser
+
++section("similarity")
+ +h(2, "similarity") Training a similarity model
+ include _training/_similarity
+
++section("textcat")
+ +h(2, "textcat") Training a text classification model
+ include _training/_textcat
+
++section("saving-loading")
+ +h(2, "saving-loading") Saving and loading models
+ include _training/_saving-loading
diff --git a/website/usage/v2.jade b/website/usage/v2.jade
new file mode 100644
index 000000000..8737c0b76
--- /dev/null
+++ b/website/usage/v2.jade
@@ -0,0 +1,520 @@
+//- 💫 DOCS > USAGE > WHAT'S NEW IN V2.0
+
+include ../_includes/_mixins
+
+p
+ | We're very excited to finally introduce spaCy v2.0! On this page, you'll
+    | find a summary of the new features and information on the backwards
+ | incompatibilities, including a handy overview of what's been renamed or
+ | deprecated. To help you make the most of v2.0, we also
+ | #[strong re-wrote almost all of the usage guides and API docs], and added
+ | more real-world examples. If you're new to spaCy, or just want to brush
+ | up on some NLP basics and the details of the library, check out
+ | the #[+a("/usage/spacy-101") spaCy 101 guide] that explains the most
+ | important concepts with examples and illustrations.
+
++h(2, "summary") Summary
+
++grid.o-no-block
+ +grid-col("half")
+
+        p
+            | This release features entirely new
+            | #[strong deep learning-powered models] for spaCy's tagger,
+            | parser and entity recognizer. The new models are
+            | #[strong 20x smaller] than the linear models that have powered
+            | spaCy until now: from 300 MB to only 15 MB.
+
+ p
+ | We've also made several usability improvements that are
+ | particularly helpful for #[strong production deployments]. spaCy
+ | v2 now fully supports the Pickle protocol, making it easy to use
+ | spaCy with #[+a("https://spark.apache.org/") Apache Spark]. The
+ | string-to-integer mapping is #[strong no longer stateful], making
+ | it easy to reconcile annotations made in different processes.
+ | Models are smaller and use less memory, and the APIs for serialization
+ | are now much more consistent.
+
+ +table-of-contents
+ +item #[+a("#summary") Summary]
+ +item #[+a("#features") New features]
+ +item #[+a("#features-models") Neural network models]
+ +item #[+a("#features-pipelines") Improved processing pipelines]
+ +item #[+a("#features-text-classification") Text classification]
+ +item #[+a("#features-hash-ids") Hash values instead of integer IDs]
+ +item #[+a("#features-serializer") Saving, loading and serialization]
+ +item #[+a("#features-displacy") displaCy visualizer]
+ +item #[+a("#features-language") Language data and lazy loading]
+ +item #[+a("#features-matcher") Revised matcher API and phrase matcher]
+ +item #[+a("#incompat") Backwards incompatibilities]
+ +item #[+a("#migrating") Migrating from spaCy v1.x]
+ +item #[+a("#benchmarks") Benchmarks]
+
+p
+ | The main usability improvements you'll notice in spaCy v2.0 are around
+ | #[strong defining, training and loading your own models] and components.
+ | The new neural network models make it much easier to train a model from
+ | scratch, or update an existing model with a few examples. In v1.x, the
+ | statistical models depended on the state of the #[code Vocab]. If you
+ | taught the model a new word, you would have to save and load a lot of
+    | data – otherwise the model wouldn't correctly recall the features of your
+ | new example. That's no longer the case.
+
+p
+ | Due to some clever use of hashing, the statistical models
+ | #[strong never change size], even as they learn new vocabulary items.
+ | The whole pipeline is also now fully differentiable. Even if you don't
+ | have explicitly annotated data, you can update spaCy using all the
+ | #[strong latest deep learning tricks] like adversarial training, noise
+ | contrastive estimation or reinforcement learning.
+
++section("features")
+ +h(2, "features") New features
+
+ p
+ | This section contains an overview of the most important
+ | #[strong new features and improvements]. The #[+a("/api") API docs]
+ | include additional deprecation notes. New methods and functions that
+ | were introduced in this version are marked with a #[+tag-new(2)] tag.
+
+ +h(3, "features-models") Convolutional neural network models
+
+ +aside-code("Example", "bash").
+ spacy download en # default English model
+ spacy download de # default German model
+ spacy download fr # default French model
+ spacy download es # default Spanish model
+ spacy download xx_ent_wiki_sm # multi-language NER
+
+ p
+ | spaCy v2.0 features new neural models for tagging,
+ | parsing and entity recognition. The models have
+ | been designed and implemented from scratch specifically for spaCy, to
+ | give you an unmatched balance of speed, size and accuracy. The new
+ | models are #[strong 10× smaller], #[strong 20% more accurate],
+ | and #[strong just as fast] as the previous generation.
+ | #[strong GPU usage] is now supported via
+ | #[+a("http://chainer.org") Chainer]'s CuPy module.
+
+ +infobox
+ | #[+label-inline Usage:] #[+a("/models") Models directory],
+ | #[+a("/usage/#gpu") Using spaCy with GPU]
+
+ +h(3, "features-pipelines") Improved processing pipelines
+
+ +aside-code("Example").
+ # Modify an existing pipeline
+ nlp = spacy.load('en')
+ nlp.pipeline.append(my_component)
+
+ # Register a factory to create a component
+ spacy.set_factory('my_factory', my_factory)
+        nlp = Language(pipeline=['my_factory', my_component])
+
+ p
+        | It's now much easier to #[strong customise the pipeline] with your own
+        | components: functions that receive a #[code Doc] object, modify it and
+        | return it. If your component is stateful, you can define and register a
+        | factory which receives the shared #[code Vocab] object and returns a
+        | component. spaCy's default components can be added to your pipeline by
+        | using their string IDs. This way, you won't have to worry about finding
+        | and implementing them – simply add #[code "tagger"] to the pipeline,
+ | and spaCy will know what to do.
+
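+    p
+        | As a small sketch, a custom component is just a function that
+        | receives the #[code Doc], can modify it and returns it. The component
+        | below is hypothetical and only reports the number of tokens:
+
+    +code.
+        import spacy
+
+        def my_component(doc):
+            # a component receives the Doc, may modify it and must return it
+            print('Processing:', len(doc), 'tokens')
+            return doc
+
+        nlp = spacy.load('en')
+        nlp.pipeline.append(my_component)
+        doc = nlp(u'This text is processed by the custom component, too.')
+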
+ +image
+ include ../assets/img/pipeline.svg
+
+ +infobox
+ | #[+label-inline API:] #[+api("language") #[code Language]]
+        | #[+label-inline Usage:] #[+a("/usage/processing-pipelines") Processing text]
+
+ +h(3, "features-text-classification") Text classification
+
+ +aside-code("Example").
+ from spacy.lang.en import English
+ nlp = English(pipeline=['tensorizer', 'tagger', 'textcat'])
+
+ p
+ | spaCy v2.0 lets you add text categorization models to spaCy pipelines.
+ | The model supports classification with multiple, non-mutually exclusive
+        | labels – so multiple labels can apply at once. You can change the model
+ | architecture rather easily, but by default, the #[code TextCategorizer]
+ | class uses a convolutional neural network to assign position-sensitive
+ | vectors to each word in the document.
+
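+    p
+        | As a quick sketch, assuming a pipeline that includes a trained text
+        | classifier, the predicted scores per label are available via the
+        | #[code Doc.cats] attribute:
+
+    +code.
+        doc = nlp(u'This is a text about politics and the economy.')
+        # e.g. {'POLITICS': 0.9, 'ECONOMY': 0.2} for a hypothetical model
+        # trained with these two labels
+        print(doc.cats)
+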
+ +infobox
+ | #[+label-inline API:] #[+api("textcategorizer") #[code TextCategorizer]],
+ | #[+api("doc#attributes") #[code Doc.cats]],
+ | #[+api("goldparse#attributes") #[code GoldParse.cats]]#[br]
+ | #[+label-inline Usage:] #[+a("/usage/text-classification") Text classification]
+
+ +h(3, "features-hash-ids") Hash values instead of integer IDs
+
+ +aside-code("Example").
+ doc = nlp(u'I love coffee')
+ assert doc.vocab.strings[u'coffee'] == 3197928453018144401
+ assert doc.vocab.strings[3197928453018144401] == u'coffee'
+
+ beer_hash = doc.vocab.strings.add(u'beer')
+ assert doc.vocab.strings[u'beer'] == beer_hash
+ assert doc.vocab.strings[beer_hash] == u'beer'
+
+ p
+ | The #[+api("stringstore") #[code StringStore]] now resolves all strings
+ | to hash values instead of integer IDs. This means that the string-to-int
+ | mapping #[strong no longer depends on the vocabulary state], making a lot
+ | of workflows much simpler, especially during training. Unlike integer IDs
+        | in spaCy v1.x, hash values will #[strong always match] – even across
+        | models. Strings can now be added explicitly using the new
+        | #[+api("stringstore#add") #[code StringStore.add]] method. A token's hash
+ | is available via #[code token.orth].
+
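+    p
+        | For example, a token's #[code orth] attribute is the hash of its
+        | text, and the underscore variant #[code orth_] gives you back the
+        | string:
+
+    +code.
+        doc = nlp(u'I love coffee')
+        coffee = doc[2]
+        assert coffee.orth == nlp.vocab.strings[u'coffee']
+        assert coffee.orth_ == u'coffee'
+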
+ +infobox
+ | #[+label-inline API:] #[+api("stringstore") #[code StringStore]]
+ | #[+label-inline Usage:] #[+a("/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
+
+ +h(3, "features-serializer") Saving, loading and serialization
+
+ +aside-code("Example").
+ nlp = spacy.load('en') # shortcut link
+ nlp = spacy.load('en_core_web_sm') # package
+ nlp = spacy.load('/path/to/en') # unicode path
+ nlp = spacy.load(Path('/path/to/en')) # pathlib Path
+
+ nlp.to_disk('/path/to/nlp')
+ nlp = English().from_disk('/path/to/nlp')
+
+ p
+        | spaCy's serialization API has been made consistent across classes and
+        | objects. All container classes, i.e. #[code Language], #[code Doc],
+        | #[code Vocab] and #[code StringStore], now have #[code to_bytes()],
+        | #[code from_bytes()], #[code to_disk()] and #[code from_disk()]
+        | methods, and support the Pickle protocol.
+
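+    p
+        | For example (a minimal sketch), a #[code Doc] can be serialized to a
+        | byte string and restored from it in another process that shares the
+        | same #[code Vocab]:
+
+    +code.
+        from spacy.tokens import Doc
+
+        doc = nlp(u'Give it back! He pleaded.')
+        doc_bytes = doc.to_bytes()
+        # restore the Doc from the byte string, using the shared vocab
+        new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
+        assert new_doc.text == doc.text
+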
+ p
+ | The improved #[code spacy.load] makes loading models easier and more
+ | transparent. You can load a model by supplying its
+ | #[+a("/usage/models#usage") shortcut link], the name of an installed
+ | #[+a("/usage/saving-loading#generating") model package] or a path.
+ | The #[code Language] class to initialise will be determined based on the
+ | model's settings. For a blank language, you can import the class directly,
+ | e.g. #[code from spacy.lang.en import English].
+
+ +infobox
+ | #[+label-inline API:] #[+api("spacy#load") #[code spacy.load]], #[+api("binder") #[code Binder]]
+ | #[+label-inline Usage:] #[+a("/usage/saving-loading") Saving and loading]
+
+ +h(3, "features-displacy") displaCy visualizer with Jupyter support
+
+ +aside-code("Example").
+ from spacy import displacy
+ doc = nlp(u'This is a sentence about Facebook.')
+ displacy.serve(doc, style='dep') # run the web server
+ html = displacy.render(doc, style='ent') # generate HTML
+
+ p
+ | Our popular dependency and named entity visualizers are now an official
+ | part of the spaCy library. displaCy can run a simple web server, or
+ | generate raw HTML markup or SVG files to be exported. You can pass in one
+ | or more docs, and customise the style. displaCy also auto-detects whether
+ | you're running #[+a("https://jupyter.org") Jupyter] and will render the
+ | visualizations in your notebook.
+
+ +infobox
+ | #[+label-inline API:] #[+api("displacy") #[code displacy]]
+ | #[+label-inline Usage:] #[+a("/usage/visualizers") Visualizing spaCy]
+
+ +h(3, "features-language") Improved language data and lazy loading
+
+ p
+        | Language-specific data now lives in its own submodule, #[code spacy.lang].
+ | Languages are lazy-loaded, i.e. only loaded when you import a
+ | #[code Language] class, or load a model that initialises one. This allows
+ | languages to contain more custom data, e.g. lemmatizer lookup tables, or
+ | complex regular expressions. The language data has also been tidied up
+ | and simplified. spaCy now also supports simple lookup-based lemmatization.
+
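+    p
+        | For example, importing a #[code Language] class only loads the data
+        | for that language:
+
+    +code.
+        from spacy.lang.en import English
+        from spacy.lang.de import German
+
+        nlp_en = English()  # includes the English language data
+        nlp_de = German()   # includes the German language data
+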
+ +infobox
+ | #[+label-inline API:] #[+api("language") #[code Language]]
+ | #[+label-inline Code:] #[+src(gh("spaCy", "spacy/lang")) #[code spacy/lang]]
+ | #[+label-inline Usage:] #[+a("/usage/adding-languages") Adding languages]
+
+ +h(3, "features-matcher") Revised matcher API and phrase matcher
+
+ +aside-code("Example").
+ from spacy.matcher import Matcher, PhraseMatcher
+
+ matcher = Matcher(nlp.vocab)
+        matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
+
+ phrasematcher = PhraseMatcher(nlp.vocab)
+ phrasematcher.add('OBAMA', None, nlp(u"Barack Obama"))
+
+ p
+ | Patterns can now be added to the matcher by calling
+        | #[+api("matcher#add") #[code matcher.add()]] with a match ID, an optional
+ | callback function to be invoked on each match, and one or more patterns.
+ | This allows you to write powerful, pattern-specific logic using only one
+ | matcher. For example, you might only want to merge some entity types,
+ | and set custom flags for other matched patterns. The new
+ | #[+api("phrasematcher") #[code PhraseMatcher]] lets you efficiently
+ | match very large terminology lists using #[code Doc] objects as match
+ | patterns.
+
+ +infobox
+ | #[+label-inline API:] #[+api("matcher") #[code Matcher]],
+ | #[+api("phrasematcher") #[code PhraseMatcher]]
+ | #[+label-inline Usage:] #[+a("/usage/rule-based-matching") Rule-based matching]
+
++section("incompat")
+ +h(2, "incompat") Backwards incompatibilities
+
+ +table(["Old", "New"])
+ +row
+ +cell
+ | #[code spacy.en]
+ | #[code spacy.xx]
+ +cell
+ | #[code spacy.lang.en]
+ | #[code spacy.lang.xx]
+
+ +row
+ +cell #[code orth]
+ +cell #[code lang.xx.lex_attrs]
+
+ +row
+ +cell #[code syntax.iterators]
+ +cell #[code lang.xx.syntax_iterators]
+
+ +row
+ +cell #[code Language.save_to_directory]
+ +cell #[+api("language#to_disk") #[code Language.to_disk]]
+
+ +row
+ +cell #[code Language.create_make_doc]
+ +cell #[+api("language#attributes") #[code Language.tokenizer]]
+
+ +row
+ +cell
+ | #[code Vocab.load]
+ | #[code Vocab.load_lexemes]
+ +cell
+ | #[+api("vocab#from_disk") #[code Vocab.from_disk]]
+ | #[+api("vocab#from_bytes") #[code Vocab.from_bytes]]
+
+ +row
+ +cell
+ | #[code Vocab.dump]
+ +cell
+ | #[+api("vocab#to_disk") #[code Vocab.to_disk]]#[br]
+ | #[+api("vocab#to_bytes") #[code Vocab.to_bytes]]
+
+ +row
+ +cell
+ | #[code Vocab.load_vectors]
+ | #[code Vocab.load_vectors_from_bin_loc]
+ +cell
+ | #[+api("vectors#from_disk") #[code Vectors.from_disk]]
+ | #[+api("vectors#from_bytes") #[code Vectors.from_bytes]]
+
+ +row
+ +cell
+ | #[code Vocab.dump_vectors]
+ +cell
+ | #[+api("vectors#to_disk") #[code Vectors.to_disk]]
+ | #[+api("vectors#to_bytes") #[code Vectors.to_bytes]]
+
+ +row
+ +cell
+ | #[code StringStore.load]
+ +cell
+ | #[+api("stringstore#from_disk") #[code StringStore.from_disk]]
+ | #[+api("stringstore#from_bytes") #[code StringStore.from_bytes]]
+
+ +row
+ +cell
+ | #[code StringStore.dump]
+ +cell
+ | #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
+ | #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
+
+ +row
+ +cell #[code Tokenizer.load]
+ +cell
+ | #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
+ | #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
+
+ +row
+ +cell #[code Tagger.load]
+ +cell
+ | #[+api("tagger#from_disk") #[code Tagger.from_disk]]
+ | #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+
+ +row
+ +cell #[code DependencyParser.load]
+ +cell
+ | #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
+ | #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+
+ +row
+ +cell #[code EntityRecognizer.load]
+ +cell
+ | #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
+ | #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+
+ +row
+ +cell #[code Matcher.load]
+ +cell -
+
+ +row
+ +cell
+ | #[code Matcher.add_pattern]
+ | #[code Matcher.add_entity]
+ +cell #[+api("matcher#add") #[code Matcher.add]]
+
+ +row
+ +cell #[code Matcher.get_entity]
+ +cell #[+api("matcher#get") #[code Matcher.get]]
+
+ +row
+ +cell #[code Matcher.has_entity]
+ +cell #[+api("matcher#contains") #[code Matcher.__contains__]]
+
+ +row
+ +cell #[code Doc.read_bytes]
+ +cell #[+api("binder") #[code Binder]]
+
+ +row
+ +cell #[code Token.is_ancestor_of]
+ +cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
+
+ +row
+ +cell #[code cli.model]
+ +cell -
+
++section("migrating")
+ +h(2, "migrating") Migrating from spaCy 1.x
+
+ p
+        | Because we've made so many architectural changes to the library, we've
+ | tried to #[strong keep breaking changes to a minimum]. A lot of projects
+ | follow the philosophy that if you're going to break anything, you may as
+ | well break everything. We think migration is easier if there's a logic to
+ | what has changed.
+
+ p
+ | We've therefore followed a policy of avoiding breaking changes to the
+ | #[code Doc], #[code Span] and #[code Token] objects. This way, you can
+ | focus on only migrating the code that does training, loading and
+        | serialization – in other words, code that works with the #[code nlp]
+ | object directly. Code that uses the annotations should continue to work.
+
+ +infobox("Important note")
+ | If you've trained your own models, keep in mind that your train and
+ | runtime inputs must match. This means you'll have to
+ | #[strong retrain your models] with spaCy v2.0.
+
+ +h(3, "migrating-saving-loading") Saving, loading and serialization
+
+ p
+ | Double-check all calls to #[code spacy.load()] and make sure they don't
+ | use the #[code path] keyword argument. If you're only loading in binary
+ | data and not a model package that can construct its own #[code Language]
+ | class and pipeline, you should now use the
+ | #[+api("language#from_disk") #[code Language.from_disk()]] method.
+
+ +code-new.
+ nlp = spacy.load('/model')
+ nlp = English().from_disk('/model/data')
+ +code-old nlp = spacy.load('en', path='/model')
+
+ p
+ | Review all other code that writes state to disk or bytes.
+        | All containers now share the same consistent API for saving and
+ | loading. Replace saving with #[code to_disk()] or #[code to_bytes()], and
+ | loading with #[code from_disk()] and #[code from_bytes()].
+
+ +code-new.
+ nlp.to_disk('/model')
+ nlp.vocab.to_disk('/vocab')
+
+ +code-old.
+ nlp.save_to_directory('/model')
+ nlp.vocab.dump('/vocab')
+
+ p
+ | If you've trained models with input from v1.x, you'll need to
+        | #[strong retrain them] with spaCy v2.0. None of the previous models
+        | will be compatible with the new version.
+
+ +h(3, "migrating-strings") Strings and hash values
+
+ p
+ | The change from integer IDs to hash values may not actually affect your
+ | code very much. However, if you're adding strings to the vocab manually,
+ | you now need to call #[+api("stringstore#add") #[code StringStore.add()]]
+ | explicitly. You can also now be sure that the string-to-hash mapping will
+ | always match across vocabularies.
+
+ +code-new.
+ nlp.vocab.strings.add(u'coffee')
+ nlp.vocab.strings[u'coffee'] # 3197928453018144401
+ other_nlp.vocab.strings[u'coffee'] # 3197928453018144401
+
+ +code-old.
+ nlp.vocab.strings[u'coffee'] # 3672
+ other_nlp.vocab.strings[u'coffee'] # 40259
+
+ +h(3, "migrating-languages") Processing pipelines and language data
+
+ p
+ | If you're importing language data or #[code Language] classes, make sure
+ | to change your import statements to import from #[code spacy.lang]. If
+ | you've added your own custom language, it needs to be moved to
+ | #[code spacy/lang/xx] and adjusted accordingly.
+
+ +code-new from spacy.lang.en import English
+ +code-old from spacy.en import English
+
+ p
+ | If you've been using custom pipeline components, check out the new
+        | guide on #[+a("/usage/processing-pipelines") processing pipelines].
+        | Appending functions to the pipeline still works – but you might be able
+ | to make this more convenient by registering "component factories".
+ | Components of the processing pipeline can now be disabled by passing a
+ | list of their names to the #[code disable] keyword argument on loading
+ | or processing.
+
+ +code-new.
+ nlp = spacy.load('en', disable=['tagger', 'ner'])
+ doc = nlp(u"I don't want parsed", disable=['parser'])
+ +code-old.
+ nlp = spacy.load('en', tagger=False, entity=False)
+ doc = nlp(u"I don't want parsed", parse=False)
+
+ +h(3, "migrating-matcher") Adding patterns and callbacks to the matcher
+
+ p
+ | If you're using the matcher, you can now add patterns in one step. This
+        | should be easy to update – simply merge the ID, callback and patterns
+ | into one call to #[+api("matcher#add") #[code Matcher.add()]].
+
+ +code-new.
+ matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
+
+ +code-old.
+ matcher.add_entity('GoogleNow', on_match=merge_phrases)
+ matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
+
+ p
+ | If you've been using #[strong acceptor functions], you'll need to move
+ | this logic into the
+ | #[+a("/usage/rule-based-matching#on_match") #[code on_match] callbacks].
+        | The callback function is invoked on every match and gives you access
+        | to the doc, the index of the current match and the list of all
+        | matches. This lets you accept or reject a match, and define the
+        | actions to be triggered.
+
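+    p
+        | For example, the #[code merge_phrases] callback from the snippet
+        | above could be a small function like this hypothetical sketch, which
+        | merges each matched span into a single token:
+
+    +code.
+        from spacy.matcher import Matcher
+
+        def merge_phrases(matcher, doc, i, matches):
+            # the callback receives the matcher, the doc, the index of the
+            # current match and the list of all matches
+            match_id, start, end = matches[i]
+            doc[start:end].merge()
+
+        matcher = Matcher(nlp.vocab)
+        matcher.add('GoogleNow', merge_phrases,
+                    [{'ORTH': 'Google'}, {'ORTH': 'Now'}])
+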
++section("benchmarks")
+ +h(2, "benchmarks") Benchmarks
+
+ include _facts-figures/_benchmarks-models
diff --git a/website/usage/vectors-similarity.jade b/website/usage/vectors-similarity.jade
new file mode 100644
index 000000000..1e1139b20
--- /dev/null
+++ b/website/usage/vectors-similarity.jade
@@ -0,0 +1,18 @@
+//- 💫 DOCS > USAGE > WORD VECTORS & SIMILARITIES
+
+include ../_includes/_mixins
+
++section("basics")
+ include _vectors-similarity/_basics
+
++section("in-context")
+ +h(2, "in-context") Similarities in context
+ include _vectors-similarity/_in-context
+
++section("custom")
+ +h(2, "custom") Customising word vectors
+ include _vectors-similarity/_custom
+
++section("gpu")
+ +h(2, "gpu") Storing vectors on a GPU
+ include _vectors-similarity/_gpu
diff --git a/website/usage/visualizers.jade b/website/usage/visualizers.jade
new file mode 100644
index 000000000..a092404ac
--- /dev/null
+++ b/website/usage/visualizers.jade
@@ -0,0 +1,48 @@
+//- 💫 DOCS > USAGE > VISUALIZERS
+
+include ../_includes/_mixins
+
++section
+ p
+ | As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy]
+ | and #[+a(DEMOS_URL + "/displacy-ent") displaCy #[sup ENT]] are finally an
+ | official part of the library. Visualizing a dependency parse or named
+        | entities in a text is not only a fun NLP demo – it can also be incredibly
+ | helpful in speeding up development and debugging your code and training
+ | process. If you're running a #[+a("https://jupyter.org") Jupyter] notebook,
+ | displaCy will detect this and return the markup in a format
+ | #[+a("#jupyter") ready to be rendered and exported].
+
+ +aside("What about the old visualizers?")
+ | Our JavaScript-based visualizers #[+src(gh("displacy")) #[code displacy.js]] and
+ | #[+src(gh("displacy-ent")) #[code displacy-ent.js]] will still be available on
+ | GitHub. If you're looking to implement web-based visualizations, we
+ | generally recommend using those instead of spaCy's built-in
+ | #[code displacy] module. It'll allow your application to perform all
+ | rendering on the client and only rely on the server for the text
+ | processing. The generated markup is also more compatible with modern web
+ | standards.
+
+ p
+        | The quickest way to visualize a #[code Doc] is to use
+ | #[+api("displacy#serve") #[code displacy.serve]]. This will spin up a
+ | simple web server and let you view the result straight from your browser.
+ | displaCy can either take a single #[code Doc] or a list of #[code Doc]
+        | objects as its first argument. This lets you construct them however
+        | you like – using any model or modifications you need.
+
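+    p
+        | For example (a minimal sketch, assuming an English model is
+        | installed), serving the dependency visualization for two docs looks
+        | like this:
+
+    +code.
+        import spacy
+        from spacy import displacy
+
+        nlp = spacy.load('en')
+        doc1 = nlp(u'This is a sentence.')
+        doc2 = nlp(u'And here is another one.')
+        # pass a single Doc or a list of Docs and view the result in your browser
+        displacy.serve([doc1, doc2], style='dep')
+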
++section("dep")
+ +h(2, "dep") Visualizing the dependency parse
+ include _visualizers/_dep
+
++section("ent")
+ +h(2, "ent") Visualizing the entity recognizer
+ include _visualizers/_ent
+
++section("jupyter")
+ +h(2, "jupyter") Using displaCy in Jupyter notebooks
+ include _visualizers/_jupyter
+
++section("html")
+ +h(2, "html") Rendering HTML
+ include _visualizers/_html