From 1220fd3e6ccfb20662b107dce1cf35650fe7e0cf Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 1 Jul 2020 14:50:38 +0200 Subject: [PATCH 01/10] Handle robots.txt for nightly/special deploys [ci skip] --- website/gatsby-config.js | 17 ++ website/package-lock.json | 453 ++++++++++++++++++++++++++++++++++---- website/package.json | 1 + 3 files changed, 430 insertions(+), 41 deletions(-) diff --git a/website/gatsby-config.js b/website/gatsby-config.js index f0e97cf5c..d08c574c6 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -148,6 +148,23 @@ module.exports = { respectDNT: true, }, }, + { + resolve: 'gatsby-plugin-robots-txt', + options: { + host: siteUrl, + sitemap: `${siteUrl}/sitemap.xml`, + // If we're in a special state (nightly, legacy) prevent indexing + resolveEnv: () => (isNightly ? 'development' : 'production'), + env: { + production: { + policy: [{ userAgent: '*', allow: '/' }], + }, + development: { + policy: [{ userAgent: '*', disallow: ['/'] }], + }, + }, + }, + }, `gatsby-plugin-offline`, ], } diff --git a/website/package-lock.json b/website/package-lock.json index 0058b2423..96a10a8af 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -4150,7 +4150,7 @@ }, "@types/debug": { "version": "0.0.29", - "resolved": "https://registry.npmjs.org/@types/debug/-/debug-0.0.29.tgz", + "resolved": "http://registry.npmjs.org/@types/debug/-/debug-0.0.29.tgz", "integrity": "sha1-oeUUrfvZLwOiJLpU1pMRHb8fN1Q=" }, "@types/events": { @@ -4160,7 +4160,7 @@ }, "@types/get-port": { "version": "0.0.4", - "resolved": "https://registry.npmjs.org/@types/get-port/-/get-port-0.0.4.tgz", + "resolved": "http://registry.npmjs.org/@types/get-port/-/get-port-0.0.4.tgz", "integrity": "sha1-62u3Qj2fiItjJmDcfS/T5po1ZD4=" }, "@types/glob": { @@ -4223,9 +4223,14 @@ "resolved": "https://registry.npmjs.org/@types/minimatch/-/minimatch-3.0.3.tgz", "integrity": "sha512-tHq6qdbT9U1IRSGf14CL0pUlULksvY9OZ+5eEgl1N7t+OA3tGvNpxJCzuKQlsNgCVwbAs670L1vcVQi8j9HjnA==" }, + "@types/minimist": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@types/minimist/-/minimist-1.2.0.tgz", + "integrity": "sha1-aaI6OtKcrwCX8G7aWbNh7i8GOfY=" + }, "@types/mkdirp": { "version": "0.3.29", - "resolved": "https://registry.npmjs.org/@types/mkdirp/-/mkdirp-0.3.29.tgz", + "resolved": "http://registry.npmjs.org/@types/mkdirp/-/mkdirp-0.3.29.tgz", "integrity": "sha1-fyrX7FX5FEgvybHsS7GuYCjUYGY=" }, "@types/node": { @@ -4233,6 +4238,11 @@ "resolved": "https://registry.npmjs.org/@types/node/-/node-7.10.2.tgz", "integrity": "sha512-RO4ig5taKmcrU4Rex8ojG1gpwFkjddzug9iPQSDvbewHN9vDpcFewevkaOK+KT+w1LeZnxbgOyfXwV4pxsQ4GQ==" }, + "@types/normalize-package-data": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/@types/normalize-package-data/-/normalize-package-data-2.4.0.tgz", + "integrity": "sha512-f5j5b/Gf71L+dbqxIpQ4Z2WlmI/mPJ0fOkGGmFgtb6sAu97EPczzbS3/tJKxmcYDj55OX6ssqwDAWOHIYDRDGA==" + }, "@types/parse-json": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/@types/parse-json/-/parse-json-4.0.0.tgz", @@ -4268,7 +4278,7 @@ }, "@types/tmp": { "version": "0.0.32", - "resolved": "https://registry.npmjs.org/@types/tmp/-/tmp-0.0.32.tgz", + "resolved": "http://registry.npmjs.org/@types/tmp/-/tmp-0.0.32.tgz", "integrity": "sha1-DTyzECL4Qn6ljACK8yuA2hJspOM=" }, "@types/unist": { @@ -4823,7 +4833,7 @@ }, "array-flatten": { "version": "1.1.1", - "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", + "resolved": 
"http://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", "integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI=" }, "array-includes": { @@ -5040,7 +5050,7 @@ }, "util": { "version": "0.10.3", - "resolved": "https://registry.npmjs.org/util/-/util-0.10.3.tgz", + "resolved": "http://registry.npmjs.org/util/-/util-0.10.3.tgz", "integrity": "sha1-evsa/lCAUkZInj23/g7TeTNqwPk=", "requires": { "inherits": "2.0.1" @@ -5070,7 +5080,7 @@ }, "async": { "version": "1.5.2", - "resolved": "https://registry.npmjs.org/async/-/async-1.5.2.tgz", + "resolved": "http://registry.npmjs.org/async/-/async-1.5.2.tgz", "integrity": "sha1-7GphrlZIDAw8skHJVhjiCJL5Zyo=" }, "async-each": { @@ -5226,7 +5236,7 @@ }, "chalk": { "version": "1.1.3", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", + "resolved": "http://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", "requires": { "ansi-styles": "^2.2.1", @@ -5243,7 +5253,7 @@ }, "supports-color": { "version": "2.0.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", + "resolved": "http://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=" } } @@ -5333,7 +5343,7 @@ }, "babel-plugin-add-module-exports": { "version": "0.2.1", - "resolved": "https://registry.npmjs.org/babel-plugin-add-module-exports/-/babel-plugin-add-module-exports-0.2.1.tgz", + "resolved": "http://registry.npmjs.org/babel-plugin-add-module-exports/-/babel-plugin-add-module-exports-0.2.1.tgz", "integrity": "sha1-mumh9KjcZ/DN7E9K7aHkOl/2XiU=" }, "babel-plugin-apply-mdx-type-prop": { @@ -5457,7 +5467,7 @@ }, "babel-plugin-syntax-dynamic-import": { "version": "6.18.0", - "resolved": "https://registry.npmjs.org/babel-plugin-syntax-dynamic-import/-/babel-plugin-syntax-dynamic-import-6.18.0.tgz", + "resolved": "http://registry.npmjs.org/babel-plugin-syntax-dynamic-import/-/babel-plugin-syntax-dynamic-import-6.18.0.tgz", "integrity": "sha1-jWomIpyDdFqZgqRBBRVyyqF5sdo=" }, "babel-plugin-syntax-object-rest-spread": { @@ -6924,7 +6934,7 @@ }, "browserify-aes": { "version": "1.2.0", - "resolved": "https://registry.npmjs.org/browserify-aes/-/browserify-aes-1.2.0.tgz", + "resolved": "http://registry.npmjs.org/browserify-aes/-/browserify-aes-1.2.0.tgz", "integrity": "sha512-+7CHXqGuspUn/Sl5aO7Ea0xWGAtETPXNSAjHo48JfLdPWcMng33Xe4znFvQweqc/uzk5zSOI3H52CYnjCfb5hA==", "requires": { "buffer-xor": "^1.0.3", @@ -6958,7 +6968,7 @@ }, "browserify-rsa": { "version": "4.0.1", - "resolved": "https://registry.npmjs.org/browserify-rsa/-/browserify-rsa-4.0.1.tgz", + "resolved": "http://registry.npmjs.org/browserify-rsa/-/browserify-rsa-4.0.1.tgz", "integrity": "sha1-IeCr+vbyApzy+vsTNWenAdQTVSQ=", "requires": { "bn.js": "^4.1.0", @@ -8596,7 +8606,7 @@ }, "concat-stream": { "version": "1.6.2", - "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", + "resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", "requires": { "buffer-from": "^1.0.0", @@ -8845,7 +8855,7 @@ }, "create-hash": { "version": "1.2.0", - "resolved": "https://registry.npmjs.org/create-hash/-/create-hash-1.2.0.tgz", + "resolved": "http://registry.npmjs.org/create-hash/-/create-hash-1.2.0.tgz", "integrity": "sha512-z00bCGNHDG8mHAkP7CtT1qVu+bFQUPjYq/4Iv3C3kWjTFV10zIjfSoeqXo9Asws8gwSHDGj/hl2u4OGIjapeCg==", "requires": { "cipher-base": "^1.0.1", @@ 
-8857,7 +8867,7 @@ }, "create-hmac": { "version": "1.1.7", - "resolved": "https://registry.npmjs.org/create-hmac/-/create-hmac-1.1.7.tgz", + "resolved": "http://registry.npmjs.org/create-hmac/-/create-hmac-1.1.7.tgz", "integrity": "sha512-MJG9liiZ+ogc4TzUwuvbER1JRdgvUFSB5+VR/g5h82fGaIRWMWddtKBHi7/sVhfjQZ6SehlyhvQYrcYkaUIpLg==", "requires": { "cipher-base": "^1.0.3", @@ -8888,12 +8898,12 @@ "dependencies": { "node-fetch": { "version": "2.1.2", - "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.1.2.tgz", + "resolved": "http://registry.npmjs.org/node-fetch/-/node-fetch-2.1.2.tgz", "integrity": "sha1-q4hOjn5X44qUR1POxwb3iNF2i7U=" }, "whatwg-fetch": { "version": "2.0.4", - "resolved": "https://registry.npmjs.org/whatwg-fetch/-/whatwg-fetch-2.0.4.tgz", + "resolved": "http://registry.npmjs.org/whatwg-fetch/-/whatwg-fetch-2.0.4.tgz", "integrity": "sha512-dcQ1GWpOD/eEQ97k66aiEVpNnapVj90/+R+SXTPYGHpYBBypfKJEQjLrvMZ7YXbKm21gXd4NcuxUTjiv1YtLng==" } } @@ -8967,7 +8977,7 @@ }, "css-color-names": { "version": "0.0.4", - "resolved": "https://registry.npmjs.org/css-color-names/-/css-color-names-0.0.4.tgz", + "resolved": "http://registry.npmjs.org/css-color-names/-/css-color-names-0.0.4.tgz", "integrity": "sha1-gIrcLnnPhHOAabZGyyDsJ762KeA=" }, "css-declaration-sorter": { @@ -9076,7 +9086,7 @@ "dependencies": { "jsesc": { "version": "0.5.0", - "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-0.5.0.tgz", + "resolved": "http://registry.npmjs.org/jsesc/-/jsesc-0.5.0.tgz", "integrity": "sha1-597mbjXW/Bb3EP6R1c9p9w8IkR0=" }, "regenerate-unicode-properties": { @@ -9443,6 +9453,15 @@ "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", "integrity": "sha1-9lNNFRSCabIDUue+4m9QH5oZEpA=" }, + "decamelize-keys": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/decamelize-keys/-/decamelize-keys-1.1.0.tgz", + "integrity": "sha1-0XGoeTMlKAfrPLYdwcFEXQeN8tk=", + "requires": { + "decamelize": "^1.1.0", + "map-obj": "^1.0.0" + } + }, "decode-uri-component": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.0.tgz", @@ -9876,7 +9895,7 @@ }, "diffie-hellman": { "version": "5.0.3", - "resolved": "https://registry.npmjs.org/diffie-hellman/-/diffie-hellman-5.0.3.tgz", + "resolved": "http://registry.npmjs.org/diffie-hellman/-/diffie-hellman-5.0.3.tgz", "integrity": "sha512-kqag/Nl+f3GwyK25fhUMYj81BUOrZ9IuJsjIcDE5icNM9FJHAVm3VcUDxdLPoQtTuUylWm6ZIknYJwwaPxsUzg==", "requires": { "bn.js": "^4.1.0", @@ -10029,7 +10048,7 @@ }, "dotenv": { "version": "4.0.0", - "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-4.0.0.tgz", + "resolved": "http://registry.npmjs.org/dotenv/-/dotenv-4.0.0.tgz", "integrity": "sha1-hk7xN5rO1Vzm+V3r7NzhefegzR0=" }, "download": { @@ -10080,7 +10099,7 @@ }, "duplexer": { "version": "0.1.1", - "resolved": "https://registry.npmjs.org/duplexer/-/duplexer-0.1.1.tgz", + "resolved": "http://registry.npmjs.org/duplexer/-/duplexer-0.1.1.tgz", "integrity": "sha1-rOb/gIwc5mtX0ev5eXessCM0z8E=" }, "duplexer3": { @@ -10644,7 +10663,7 @@ }, "doctrine": { "version": "1.5.0", - "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-1.5.0.tgz", + "resolved": "http://registry.npmjs.org/doctrine/-/doctrine-1.5.0.tgz", "integrity": "sha1-N53Ocw9hZvds76TmcHoVmwLFpvo=", "requires": { "esutils": "^2.0.2", @@ -11416,7 +11435,7 @@ }, "file-loader": { "version": "1.1.11", - "resolved": "https://registry.npmjs.org/file-loader/-/file-loader-1.1.11.tgz", + "resolved": 
"http://registry.npmjs.org/file-loader/-/file-loader-1.1.11.tgz", "integrity": "sha512-TGR4HU7HUsGg6GCOPJnFk06RhWgEWFLAGWiT6rcD+GRC2keU3s9RGJ+b3Z6/U73jwwNb2gKLJ7YCrp+jvU4ALg==", "requires": { "loader-utils": "^1.0.2", @@ -12912,6 +12931,30 @@ "svg-react-loader": "^0.4.4" } }, + "gatsby-plugin-robots-txt": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/gatsby-plugin-robots-txt/-/gatsby-plugin-robots-txt-1.5.1.tgz", + "integrity": "sha512-hkCmoIgoCmW49GFeql8RLNIh9RsG7RosHm+wNA//oFdJSzaYX1VVXk3OV0O1+35xspovP14Kbo+U7x39nu2emA==", + "requires": { + "@babel/runtime": "^7.10.0", + "generate-robotstxt": "^8.0.3" + }, + "dependencies": { + "@babel/runtime": { + "version": "7.10.4", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.10.4.tgz", + "integrity": "sha512-UpTN5yUJr9b4EX2CnGNWIvER7Ab83ibv0pcvvHc4UOdrBI5jb8bj+32cCwPX6xu0mt2daFNjYhoi+X7beH0RSw==", + "requires": { + "regenerator-runtime": "^0.13.4" + } + }, + "regenerator-runtime": { + "version": "0.13.5", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.5.tgz", + "integrity": "sha512-ZS5w8CpKFinUzOwW3c83oPeVXoNsrLsaCoLtJvAClH135j/R77RuymhiSErhm2lKcwSCIpmvIWSbDkIfAqKQlA==" + } + } + }, "gatsby-plugin-sass": { "version": "2.0.10", "resolved": "https://registry.npmjs.org/gatsby-plugin-sass/-/gatsby-plugin-sass-2.0.10.tgz", @@ -14603,6 +14646,302 @@ "globule": "^1.0.0" } }, + "generate-robotstxt": { + "version": "8.0.3", + "resolved": "https://registry.npmjs.org/generate-robotstxt/-/generate-robotstxt-8.0.3.tgz", + "integrity": "sha512-iD//oAVKcHOCz9M0IiT3pyUiF2uN1qvL3qaTA8RGLz7NU7l0XVwyzd3rN+tzhB657DNUgrygXt9w8+0zkTMFrg==", + "requires": { + "cosmiconfig": "^6.0.0", + "fs-extra": "^9.0.0", + "ip-regex": "^4.1.0", + "is-absolute-url": "^3.0.3", + "meow": "^7.0.1", + "resolve-from": "^5.0.0" + }, + "dependencies": { + "arrify": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/arrify/-/arrify-2.0.1.tgz", + "integrity": "sha512-3duEwti880xqi4eAMN8AyR4a0ByT90zoYdLlevfrvU43vb0YZwZVfxOgxWrLXXXpyugL0hNZc9G6BiB5B3nUug==" + }, + "camelcase": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.0.0.tgz", + "integrity": "sha512-8KMDF1Vz2gzOq54ONPJS65IvTUaB1cHJ2DMM7MbPmLZljDH1qpzzLsWdiN9pHh6qvkRVDTi/07+eNGch/oLU4w==" + }, + "camelcase-keys": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/camelcase-keys/-/camelcase-keys-6.2.2.tgz", + "integrity": "sha512-YrwaA0vEKazPBkn0ipTiMpSajYDSe+KjQfrjhcBMxJt/znbvlHd8Pw/Vamaz5EB4Wfhs3SUR3Z9mwRu/P3s3Yg==", + "requires": { + "camelcase": "^5.3.1", + "map-obj": "^4.0.0", + "quick-lru": "^4.0.1" + }, + "dependencies": { + "camelcase": { + "version": "5.3.1", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-5.3.1.tgz", + "integrity": "sha512-L28STB170nwWS63UjtlEOE3dldQApaJXZkOI1uMFfzf3rRuPegHaHesyee+YxQ+W6SvRDQV6UrdOdRiR153wJg==" + } + } + }, + "cosmiconfig": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-6.0.0.tgz", + "integrity": "sha512-xb3ZL6+L8b9JLLCx3ZdoZy4+2ECphCMo2PwqgP1tlfVq6M6YReyzBJtvWWtbDSpNr9hn96pkCiZqUcFEc+54Qg==", + "requires": { + "@types/parse-json": "^4.0.0", + "import-fresh": "^3.1.0", + "parse-json": "^5.0.0", + "path-type": "^4.0.0", + "yaml": "^1.7.2" + } + }, + "find-up": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-4.1.0.tgz", + "integrity": "sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==", + "requires": { + 
"locate-path": "^5.0.0", + "path-exists": "^4.0.0" + } + }, + "fs-extra": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-9.0.1.tgz", + "integrity": "sha512-h2iAoN838FqAFJY2/qVpzFXy+EBxfVE220PalAqQLDVsFOHLJrZvut5puAbCdNv6WJk+B8ihI+k0c7JK5erwqQ==", + "requires": { + "at-least-node": "^1.0.0", + "graceful-fs": "^4.2.0", + "jsonfile": "^6.0.1", + "universalify": "^1.0.0" + } + }, + "graceful-fs": { + "version": "4.2.4", + "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.4.tgz", + "integrity": "sha512-WjKPNJF79dtJAVniUlGGWHYGz2jWxT6VhN/4m1NdkbZ2nOsEF+cI1Edgql5zCRhs/VsQYRvrXctxktVXZUkixw==" + }, + "import-fresh": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.2.1.tgz", + "integrity": "sha512-6e1q1cnWP2RXD9/keSkxHScg508CdXqXWgWBaETNhyuBFz+kUZlKboh+ISK+bU++DmbHimVBrOz/zzPe0sZ3sQ==", + "requires": { + "parent-module": "^1.0.0", + "resolve-from": "^4.0.0" + }, + "dependencies": { + "resolve-from": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", + "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==" + } + } + }, + "indent-string": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/indent-string/-/indent-string-4.0.0.tgz", + "integrity": "sha512-EdDDZu4A2OyIK7Lr/2zG+w5jmbuk1DVBnEwREQvBzspBJkCEbRa8GxU1lghYcaGJCnRWibjDXlq779X1/y5xwg==" + }, + "ip-regex": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/ip-regex/-/ip-regex-4.1.0.tgz", + "integrity": "sha512-pKnZpbgCTfH/1NLIlOduP/V+WRXzC2MOz3Qo8xmxk8C5GudJLgK5QyLVXOSWy3ParAH7Eemurl3xjv/WXYFvMA==" + }, + "is-absolute-url": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/is-absolute-url/-/is-absolute-url-3.0.3.tgz", + "integrity": "sha512-opmNIX7uFnS96NtPmhWQgQx6/NYFgsUXYMllcfzwWKUMwfo8kku1TvE6hkNcH+Q1ts5cMVrsY7j0bxXQDciu9Q==" + }, + "jsonfile": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.0.1.tgz", + "integrity": "sha512-jR2b5v7d2vIOust+w3wtFKZIfpC2pnRmFAhAC/BuweZFQR8qZzxH1OyrQ10HmdVYiXWkYUqPVsz91cG7EL2FBg==", + "requires": { + "graceful-fs": "^4.1.6", + "universalify": "^1.0.0" + } + }, + "locate-path": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz", + "integrity": "sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==", + "requires": { + "p-locate": "^4.1.0" + } + }, + "map-obj": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/map-obj/-/map-obj-4.1.0.tgz", + "integrity": "sha512-glc9y00wgtwcDmp7GaE/0b0OnxpNJsVf3ael/An6Fe2Q51LLwN1er6sdomLRzz5h0+yMpiYLhWYF5R7HeqVd4g==" + }, + "meow": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/meow/-/meow-7.0.1.tgz", + "integrity": "sha512-tBKIQqVrAHqwit0vfuFPY3LlzJYkEOFyKa3bPgxzNl6q/RtN8KQ+ALYEASYuFayzSAsjlhXj/JZ10rH85Q6TUw==", + "requires": { + "@types/minimist": "^1.2.0", + "arrify": "^2.0.1", + "camelcase": "^6.0.0", + "camelcase-keys": "^6.2.2", + "decamelize-keys": "^1.1.0", + "hard-rejection": "^2.1.0", + "minimist-options": "^4.0.2", + "normalize-package-data": "^2.5.0", + "read-pkg-up": "^7.0.1", + "redent": "^3.0.0", + "trim-newlines": "^3.0.0", + "type-fest": "^0.13.1", + "yargs-parser": "^18.1.3" + } + }, + "normalize-package-data": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/normalize-package-data/-/normalize-package-data-2.5.0.tgz", + 
"integrity": "sha512-/5CMN3T0R4XTj4DcGaexo+roZSdSFW/0AOOTROrjxzCG1wrWXEsGbRKevjlIL+ZDE4sZlJr5ED4YW0yqmkK+eA==", + "requires": { + "hosted-git-info": "^2.1.4", + "resolve": "^1.10.0", + "semver": "2 || 3 || 4 || 5", + "validate-npm-package-license": "^3.0.1" + } + }, + "p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "requires": { + "p-try": "^2.0.0" + } + }, + "p-locate": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-4.1.0.tgz", + "integrity": "sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==", + "requires": { + "p-limit": "^2.2.0" + } + }, + "parse-json": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.0.0.tgz", + "integrity": "sha512-OOY5b7PAEFV0E2Fir1KOkxchnZNCdowAJgQ5NuxjpBKTRP3pQhwkrkxqQjeoKJ+fO7bCpmIZaogI4eZGDMEGOw==", + "requires": { + "@babel/code-frame": "^7.0.0", + "error-ex": "^1.3.1", + "json-parse-better-errors": "^1.0.1", + "lines-and-columns": "^1.1.6" + } + }, + "path-exists": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", + "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==" + }, + "path-type": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz", + "integrity": "sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==" + }, + "read-pkg": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-5.2.0.tgz", + "integrity": "sha512-Ug69mNOpfvKDAc2Q8DRpMjjzdtrnv9HcSMX+4VsZxD1aZ6ZzrIE7rlzXBtWTyhULSMKg076AW6WR5iZpD0JiOg==", + "requires": { + "@types/normalize-package-data": "^2.4.0", + "normalize-package-data": "^2.5.0", + "parse-json": "^5.0.0", + "type-fest": "^0.6.0" + }, + "dependencies": { + "type-fest": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.6.0.tgz", + "integrity": "sha512-q+MB8nYR1KDLrgr4G5yemftpMC7/QLqVndBmEEdqzmNj5dcFOO4Oo8qlwZE3ULT3+Zim1F8Kq4cBnikNhlCMlg==" + } + } + }, + "read-pkg-up": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/read-pkg-up/-/read-pkg-up-7.0.1.tgz", + "integrity": "sha512-zK0TB7Xd6JpCLmlLmufqykGE+/TlOePD6qKClNW7hHDKFh/J7/7gCWGR7joEQEW1bKq3a3yUZSObOoWLFQ4ohg==", + "requires": { + "find-up": "^4.1.0", + "read-pkg": "^5.2.0", + "type-fest": "^0.8.1" + }, + "dependencies": { + "type-fest": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.8.1.tgz", + "integrity": "sha512-4dbzIzqvjtgiM5rw1k5rEHtBANKmdudhGyBEajN01fEyhaAIhsoKNy6y7+IN93IfpFtwY9iqi7kD+xwKhQsNJA==" + } + } + }, + "redent": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/redent/-/redent-3.0.0.tgz", + "integrity": "sha512-6tDA8g98We0zd0GvVeMT9arEOnTw9qM03L9cJXaCjrip1OO764RDBLBfrB4cwzNGDj5OA5ioymC9GkizgWJDUg==", + "requires": { + "indent-string": "^4.0.0", + "strip-indent": "^3.0.0" + } + }, + "resolve": { + "version": "1.17.0", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.17.0.tgz", + "integrity": "sha512-ic+7JYiV8Vi2yzQGFWOkiZD5Z9z7O2Zhm9XMaTxdJExKasieFCr+yXZ/WmXsckHiKl12ar0y6XiXDx3m4RHn1w==", + "requires": { + "path-parse": "^1.0.6" + } + }, + "resolve-from": { + "version": "5.0.0", + "resolved": 
"https://registry.npmjs.org/resolve-from/-/resolve-from-5.0.0.tgz", + "integrity": "sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw==" + }, + "strip-indent": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/strip-indent/-/strip-indent-3.0.0.tgz", + "integrity": "sha512-laJTa3Jb+VQpaC6DseHhF7dXVqHTfJPCRDaEbid/drOhgitgYku/letMUqOXFoWV0zIIUbjpdH2t+tYj4bQMRQ==", + "requires": { + "min-indent": "^1.0.0" + } + }, + "trim-newlines": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/trim-newlines/-/trim-newlines-3.0.0.tgz", + "integrity": "sha512-C4+gOpvmxaSMKuEf9Qc134F1ZuOHVXKRbtEflf4NTtuuJDEIJ9p5PXsalL8SkeRw+qit1Mo+yuvMPAKwWg/1hA==" + }, + "type-fest": { + "version": "0.13.1", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.13.1.tgz", + "integrity": "sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==" + }, + "universalify": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-1.0.0.tgz", + "integrity": "sha512-rb6X1W158d7pRQBg5gkR8uPaSfiids68LTJQYOtEUhoJUWBdaQHsuT/EUduxXYxcrt4r5PJ4fuHW1MHT6p0qug==" + }, + "yargs-parser": { + "version": "18.1.3", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-18.1.3.tgz", + "integrity": "sha512-o50j0JeToy/4K6OZcaQmW6lyXXKhq7csREXcDwk2omFPJEwUNOVtJKvmDr9EI1fAJZUyZcRF7kxGBWmRXudrCQ==", + "requires": { + "camelcase": "^5.0.0", + "decamelize": "^1.2.0" + }, + "dependencies": { + "camelcase": { + "version": "5.3.1", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-5.3.1.tgz", + "integrity": "sha512-L28STB170nwWS63UjtlEOE3dldQApaJXZkOI1uMFfzf3rRuPegHaHesyee+YxQ+W6SvRDQV6UrdOdRiR153wJg==" + } + } + } + } + }, "gensync": { "version": "1.0.0-beta.1", "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.1.tgz", @@ -14766,7 +15105,7 @@ }, "globby": { "version": "6.1.0", - "resolved": "https://registry.npmjs.org/globby/-/globby-6.1.0.tgz", + "resolved": "http://registry.npmjs.org/globby/-/globby-6.1.0.tgz", "integrity": "sha1-9abXDoOV4hyFj7BInWTfAkJNUGw=", "requires": { "array-union": "^1.0.1", @@ -14778,7 +15117,7 @@ "dependencies": { "pify": { "version": "2.3.0", - "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=" } } @@ -15035,6 +15374,11 @@ } } }, + "hard-rejection": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/hard-rejection/-/hard-rejection-2.1.0.tgz", + "integrity": "sha512-VIZB+ibDhx7ObhAe7OVtoEbuP4h/MuOTHJ+J8h/eBXotJYl0fBgR72xDFCKgIh22OJZIOVNxBMWuhAr10r8HdA==" + }, "has": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/has/-/has-1.0.3.tgz", @@ -15792,7 +16136,7 @@ }, "immutable": { "version": "3.7.6", - "resolved": "https://registry.npmjs.org/immutable/-/immutable-3.7.6.tgz", + "resolved": "http://registry.npmjs.org/immutable/-/immutable-3.7.6.tgz", "integrity": "sha1-E7TTyxK++hVIKib+Gy665kAHHks=" }, "import-cwd": { @@ -17323,7 +17667,7 @@ }, "load-json-file": { "version": "2.0.0", - "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-2.0.0.tgz", + "resolved": "http://registry.npmjs.org/load-json-file/-/load-json-file-2.0.0.tgz", "integrity": "sha1-eUfkIUmvgNaWy/eXvKq8/h/inKg=", "requires": { "graceful-fs": "^4.1.2", @@ -17342,7 +17686,7 @@ }, "pify": { "version": "2.3.0", - "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": 
"http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=" } } @@ -17736,7 +18080,7 @@ }, "lru-cache": { "version": "4.0.0", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.0.0.tgz", + "resolved": "http://registry.npmjs.org/lru-cache/-/lru-cache-4.0.0.tgz", "integrity": "sha1-tcvwFVbBaWb+vlTO7A+03JDfbCg=", "requires": { "pseudomap": "^1.0.1", @@ -17916,7 +18260,7 @@ }, "media-typer": { "version": "0.3.0", - "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", + "resolved": "http://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g=" }, "mem": { @@ -18184,6 +18528,11 @@ "dom-walk": "^0.1.0" } }, + "min-indent": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/min-indent/-/min-indent-1.0.1.tgz", + "integrity": "sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg==" + }, "mini-css-extract-plugin": { "version": "0.4.5", "resolved": "https://registry.npmjs.org/mini-css-extract-plugin/-/mini-css-extract-plugin-0.4.5.tgz", @@ -18234,6 +18583,23 @@ "resolved": "http://registry.npmjs.org/minimist/-/minimist-1.2.0.tgz", "integrity": "sha1-o1AIsg9BOD7sH7kU9M1d95omQoQ=" }, + "minimist-options": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/minimist-options/-/minimist-options-4.1.0.tgz", + "integrity": "sha512-Q4r8ghd80yhO/0j1O3B2BjweX3fiHg9cdOwjJd2J76Q135c+NDxGCqdYKQ1SKBuFfgWbAUzBfvYjPUEeNgqN1A==", + "requires": { + "arrify": "^1.0.1", + "is-plain-obj": "^1.1.0", + "kind-of": "^6.0.3" + }, + "dependencies": { + "kind-of": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.3.tgz", + "integrity": "sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw==" + } + } + }, "minipass": { "version": "2.3.5", "resolved": "https://registry.npmjs.org/minipass/-/minipass-2.3.5.tgz", @@ -18668,7 +19034,7 @@ }, "readable-stream": { "version": "1.0.34", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz", + "resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz", "integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=", "requires": { "core-util-is": "~1.0.0", @@ -18679,7 +19045,7 @@ }, "string_decoder": { "version": "0.10.31", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", + "resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=" } } @@ -19668,7 +20034,7 @@ "dependencies": { "pify": { "version": "2.3.0", - "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=" } } @@ -21961,6 +22327,11 @@ "resolved": "https://registry.npmjs.org/querystringify/-/querystringify-2.1.0.tgz", "integrity": "sha512-sluvZZ1YiTLD5jsqZcDmFyV2EwToyXZBfpoVOmktMmW+VEnhgakFHnasVph65fOjGPTWN0Nw3+XQaSeMayr0kg==" }, + "quick-lru": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-4.0.1.tgz", + "integrity": "sha512-ARhCpm70fzdcvNQfPoy49IaanKkTlRWF2JMzqhcJbhSFRZv7nPTvZJdcY7301IPmvW+/p0RgIWnQDLJxifsQ7g==" + }, "ramda": { "version": "0.21.0", "resolved": "http://registry.npmjs.org/ramda/-/ramda-0.21.0.tgz", @@ -22099,7 +22470,7 @@ }, "chalk": { "version": "1.1.3", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", + 
"resolved": "http://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", "requires": { "ansi-styles": "^2.2.1", @@ -22213,7 +22584,7 @@ }, "supports-color": { "version": "2.0.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", + "resolved": "http://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=" }, "tmp": { @@ -23418,7 +23789,7 @@ }, "rgba-regex": { "version": "1.0.0", - "resolved": "https://registry.npmjs.org/rgba-regex/-/rgba-regex-1.0.0.tgz", + "resolved": "http://registry.npmjs.org/rgba-regex/-/rgba-regex-1.0.0.tgz", "integrity": "sha1-QzdOLiyglosO8VI0YLfXMP8i7rM=" }, "rimraf": { @@ -24084,7 +24455,7 @@ }, "sha.js": { "version": "2.4.11", - "resolved": "https://registry.npmjs.org/sha.js/-/sha.js-2.4.11.tgz", + "resolved": "http://registry.npmjs.org/sha.js/-/sha.js-2.4.11.tgz", "integrity": "sha512-QMEp5B7cftE7APOjk5Y6xgrbWu+WkLVQwk8JNjZ8nKRciZaByEW6MubieAiToS7+dwvrjGhH8jRXz3MVd0AYqQ==", "requires": { "inherits": "^2.0.1", @@ -24224,7 +24595,7 @@ }, "sift": { "version": "5.1.0", - "resolved": "https://registry.npmjs.org/sift/-/sift-5.1.0.tgz", + "resolved": "http://registry.npmjs.org/sift/-/sift-5.1.0.tgz", "integrity": "sha1-G78t+w63HlbEzH+1Z/vRNRtlAV4=" }, "signal-exit": { @@ -26094,7 +26465,7 @@ }, "tty-browserify": { "version": "0.0.0", - "resolved": "https://registry.npmjs.org/tty-browserify/-/tty-browserify-0.0.0.tgz", + "resolved": "http://registry.npmjs.org/tty-browserify/-/tty-browserify-0.0.0.tgz", "integrity": "sha1-oVe6QC2iTpv5V/mqadUk7tQpAaY=" }, "tunnel-agent": { diff --git a/website/package.json b/website/package.json index 353cffe00..9ac9bd5c5 100644 --- a/website/package.json +++ b/website/package.json @@ -25,6 +25,7 @@ "gatsby-plugin-offline": "^2.0.24", "gatsby-plugin-react-helmet": "^3.0.6", "gatsby-plugin-react-svg": "^2.0.0", + "gatsby-plugin-robots-txt": "^1.5.1", "gatsby-plugin-sass": "^2.0.10", "gatsby-plugin-sharp": "^2.0.20", "gatsby-plugin-sitemap": "^2.0.5", From 58a289b309fa39e9b2887ad2e0132b9fd3d21f1a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 1 Jul 2020 21:28:51 +0200 Subject: [PATCH 02/10] Update branch name --- website/meta/site.json | 2 +- website/package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/website/meta/site.json b/website/meta/site.json index ed78b1802..724665060 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -4,7 +4,7 @@ "slogan": "Industrial-strength Natural Language Processing in Python", "siteUrl": "https://spacy.io", "siteUrlNightly": "https://nightly.spacy.io", - "nightlyBranches": ["spacy.io-develop"], + "nightlyBranches": ["nightly.spacy.io"], "email": "contact@explosion.ai", "company": "Explosion AI", "companyUrl": "https://explosion.ai", diff --git a/website/package.json b/website/package.json index 9ac9bd5c5..d5c770ddf 100644 --- a/website/package.json +++ b/website/package.json @@ -53,7 +53,7 @@ "scripts": { "build": "gatsby build", "dev": "gatsby develop", - "dev:nightly": "BRANCH=spacy.io-develop npm run dev", + "dev:nightly": "BRANCH=nightly.spacy.io npm run dev", "lint": "eslint **", "clear": "rm -rf .cache", "test": "echo \"Write tests! 
-> https://gatsby.app/unit-testing\"" From 06f1ecb3085f68ba937891915d75b4df9a13b71f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 3 Jul 2020 16:48:21 +0200 Subject: [PATCH 03/10] Update v3 docs --- website/docs/api/architectures.md | 7 + website/docs/api/index.md | 6 +- website/docs/api/top-level.md | 27 - website/docs/usage/training.md | 839 ++++++------------------------ website/meta/sidebars.json | 5 +- website/src/styles/layout.sass | 10 +- 6 files changed, 169 insertions(+), 725 deletions(-) create mode 100644 website/docs/api/architectures.md diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md new file mode 100644 index 000000000..f463d6df2 --- /dev/null +++ b/website/docs/api/architectures.md @@ -0,0 +1,7 @@ +--- +title: Model Architectures +teaser: Pre-defined model architectures included with the core library +source: spacy/ml/models +--- + +TODO: write diff --git a/website/docs/api/index.md b/website/docs/api/index.md index 97a7f57c4..a9dc408f6 100644 --- a/website/docs/api/index.md +++ b/website/docs/api/index.md @@ -1,10 +1,8 @@ --- -title: Architecture -next: /api/annotation +title: Library Architecture +next: /api/architectures --- -## Library architecture {#architecture} - import Architecture101 from 'usage/101/\_architecture.md' diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 6ee324af9..01bc712a8 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -4,7 +4,6 @@ menu: - ['spacy', 'spacy'] - ['displacy', 'displacy'] - ['Utility Functions', 'util'] - - ['Compatibility', 'compat'] --- ## spaCy {#spacy hidden="true"} @@ -269,32 +268,6 @@ page should be safe to use and we'll try to ensure backwards compatibility. However, we recommend having additional tests in place if your application depends on any of spaCy's utilities. -### util.get_data_path {#util.get_data_path tag="function"} - -Get path to the data directory where spaCy looks for models. Defaults to -`spacy/data`. - -| Name | Type | Description | -| ---------------- | --------------- | ------------------------------------------------------- | -| `require_exists` | bool | Only return path if it exists, otherwise return `None`. | -| **RETURNS** | `Path` / `None` | Data path or `None`. | - -### util.set_data_path {#util.set_data_path tag="function"} - -Set custom path to the data directory where spaCy looks for models. - -> #### Example -> -> ```python -> util.set_data_path("/custom/path") -> util.get_data_path() -> # PosixPath('/custom/path') -> ``` - -| Name | Type | Description | -| ------ | ------------ | --------------------------- | -| `path` | str / `Path` | Path to new data directory. | - ### util.get_lang_class {#util.get_lang_class tag="function"} Import and load a `Language` class. 
Allows lazy-loading diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 8d80655e9..7ca309ea0 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -2,174 +2,21 @@ title: Training Models next: /usage/projects menu: - - ['Basics', 'basics'] - - ['NER', 'ner'] - - ['Tagger & Parser', 'tagger-parser'] - - ['Text Classification', 'textcat'] - - ['Entity Linking', 'entity-linker'] - - ['Tips and Advice', 'tips'] + - ['Introduction', 'basics'] + - ['CLI & Config', 'cli-config'] + - ['Custom Models', 'custom-models'] + - ['Transfer Learning', 'transfer-learning'] + - ['Internal API', 'api'] --- -This guide describes how to train new statistical models for spaCy's -part-of-speech tagger, named entity recognizer, dependency parser, text -classifier and entity linker. Once the model is trained, you can then -[save and load](/usage/saving-loading#models) it. + -## Training basics {#basics} +## Introduction to training models {#basics} import Training101 from 'usage/101/\_training.md' -### Training via the command-line interface {#spacy-train-cli} - -For most purposes, the best way to train spaCy is via the command-line -interface. The [`spacy train`](/api/cli#train) command takes care of many -details for you, including making sure that the data is minibatched and shuffled -correctly, progress is printed, and models are saved after each epoch. You can -prepare your data for use in [`spacy train`](/api/cli#train) using the -[`spacy convert`](/api/cli#convert) command, which accepts many common NLP data -formats, including `.iob` for named entities, and the CoNLL format for -dependencies: - -```bash -git clone https://github.com/UniversalDependencies/UD_Spanish-AnCora -mkdir ancora-json -python -m spacy convert UD_Spanish-AnCora/es_ancora-ud-train.conllu ancora-json -python -m spacy convert UD_Spanish-AnCora/es_ancora-ud-dev.conllu ancora-json -mkdir models -python -m spacy train es models ancora-json/es_ancora-ud-train.json ancora-json/es_ancora-ud-dev.json -``` - - - -If you're running spaCy v2.2 or above, you can use the -[`debug-data` command](/api/cli#debug-data) to analyze and validate your -training and development data, get useful stats, and find problems like invalid -entity annotations, cyclic dependencies, low data labels and more. - -```bash -$ python -m spacy debug-data en train.json dev.json --verbose -``` - - - -You can also use the [`gold.docs_to_json`](/api/goldparse#docs_to_json) helper -to convert a list of `Doc` objects to spaCy's JSON training format. - -#### Understanding the training output - -When you train a model using the [`spacy train`](/api/cli#train) command, you'll -see a table showing metrics after each pass over the data. Here's what those -metrics means: - -> #### Tokenization metrics -> -> Note that if the development data has raw text, some of the gold-standard -> entities might not align to the predicted tokenization. These tokenization -> errors are **excluded from the NER evaluation**. If your tokenization makes it -> impossible for the model to predict 50% of your entities, your NER F-score -> might still look good. - -| Name | Description | -| ---------- | ------------------------------------------------------------------------------------------------- | -| `Dep Loss` | Training loss for dependency parser. Should decrease, but usually not to 0. | -| `NER Loss` | Training loss for named entity recognizer. Should decrease, but usually not to 0. | -| `UAS` | Unlabeled attachment score for parser. 
The percentage of unlabeled correct arcs. Should increase. | -| `NER P.` | NER precision on development data. Should increase. | -| `NER R.` | NER recall on development data. Should increase. | -| `NER F.` | NER F-score on development data. Should increase. | -| `Tag %` | Fine-grained part-of-speech tag accuracy on development data. Should increase. | -| `Token %` | Tokenization accuracy on development data. | -| `CPU WPS` | Prediction speed on CPU in words per second, if available. Should stay stable. | -| `GPU WPS` | Prediction speed on GPU in words per second, if available. Should stay stable. | - -### Improving accuracy with transfer learning {#transfer-learning new="2.1"} - -In most projects, you'll usually have a small amount of labelled data, and -access to a much bigger sample of raw text. The raw text contains a lot of -information about the language in general. Learning this general information -from the raw text can help your model use the smaller labelled data more -efficiently. - -The two main ways to use raw text in your spaCy models are **word vectors** and -**language model pretraining**. Word vectors provide information about the -definitions of words. The vectors are a look-up table, so each word only has one -representation, regardless of its context. Language model pretraining lets you -learn contextualized word representations. Instead of initializing spaCy's -convolutional neural network layers with random weights, the `spacy pretrain` -command trains a language model to predict each word's word vector based on the -surrounding words. The information used to predict this task is a good starting -point for other tasks such as named entity recognition, text classification or -dependency parsing. - - - -For more details, see the documentation on -[vectors and similarity](/usage/vectors-similarity) and the -[`spacy pretrain`](/api/cli#pretrain) command. - - - -### How do I get training data? {#training-data} - -Collecting training data may sound incredibly painful – and it can be, if you're -planning a large-scale annotation project. However, if your main goal is to -update an existing model's predictions – for example, spaCy's named entity -recognition – the hard part is usually not creating the actual annotations. It's -finding representative examples and **extracting potential candidates**. The -good news is, if you've been noticing bad performance on your data, you likely -already have some relevant text, and you can use spaCy to **bootstrap a first -set of training examples**. For example, after processing a few sentences, you -may end up with the following entities, some correct, some incorrect. - -> #### How many examples do I need? -> -> As a rule of thumb, you should allocate at least 10% of your project resources -> to creating training and evaluation data. If you're looking to improve an -> existing model, you might be able to start off with only a handful of -> examples. Keep in mind that you'll always want a lot more than that for -> **evaluation** – especially previous errors the model has made. Otherwise, you -> won't be able to sufficiently verify that the model has actually made the -> **correct generalizations** required for your use case. 
- -| Text |  Entity | Start | End | Label | | -| ---------------------------------- | ------- | ----- | ---- | -------- | --- | -| Uber blew through 1 million a week | Uber | `0` | `4` | `ORG` | ✅ | -| Android Pay expands to Canada | Android | `0` | `7` | `PERSON` | ❌ | -| Android Pay expands to Canada | Canada | `23` | `30` | `GPE` | ✅ | -| Spotify steps up Asia expansion | Spotify | `0` | `8` | `ORG` | ✅ | -| Spotify steps up Asia expansion | Asia | `17` | `21` | `NORP` | ❌ | - -Alternatively, the [rule-based matcher](/usage/rule-based-matching) can be a -useful tool to extract tokens or combinations of tokens, as well as their start -and end index in a document. In this case, we'll extract mentions of Google and -assume they're an `ORG`. - -| Text |  Entity | Start | End | Label | | -| ------------------------------------- | ------- | ----- | ---- | ----- | --- | -| let me google this for you | google | `7` | `13` | `ORG` | ❌ | -| Google Maps launches location sharing | Google | `0` | `6` | `ORG` | ❌ | -| Google rebrands its business apps | Google | `0` | `6` | `ORG` | ✅ | -| look what i found on google! 😂 | google | `21` | `27` | `ORG` | ✅ | - -Based on the few examples above, you can already create six training sentences -with eight entities in total. Of course, what you consider a "correct -annotation" will always depend on **what you want the model to learn**. While -there are some entity annotations that are more or less universally correct – -like Canada being a geopolitical entity – your application may have its very own -definition of the [NER annotation scheme](/api/annotation#named-entities). - -```python -train_data = [ - ("Uber blew through $1 million a week", [(0, 4, 'ORG')]), - ("Android Pay expands to Canada", [(0, 11, 'PRODUCT'), (23, 30, 'GPE')]), - ("Spotify steps up Asia expansion", [(0, 8, "ORG"), (17, 21, "LOC")]), - ("Google Maps launches location sharing", [(0, 11, "PRODUCT")]), - ("Google rebrands its business apps", [(0, 6, "ORG")]), - ("look what i found on google! 😂", [(21, 27, "PRODUCT")])] -``` - [![Prodigy: Radically efficient machine teaching](../images/prodigy.jpg)](https://prodi.gy) @@ -183,7 +30,146 @@ ready-to-use spaCy models. -### Training with annotations {#annotations} +## Training CLI & config {#cli-config} + +The recommended way to train your spaCy models is via the +[`spacy train`](/api/cli#train) command on the command line. + +1. The **training data** in spaCy's binary format created using + [`spacy convert`](/api/cli#convert). +2. A `config.cfg` **configuration file** with all settings and hyperparameters. +3. An optional **Python file** to register + [custom models and architectures](#custom-models). + + + +> #### Tip: Debug your data +> +> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate +> your training and development data, get useful stats, and find problems like +> invalid entity annotations, cyclic dependencies, low data labels and more. +> +> ```bash +> $ python -m spacy debug-data en train.json dev.json --verbose +> ``` + + + +When you train a model using the [`spacy train`](/api/cli#train) command, you'll +see a table showing metrics after each pass over the data. Here's what those +metrics means: + + + +| Name | Description | +| ---------- | ------------------------------------------------------------------------------------------------- | +| `Dep Loss` | Training loss for dependency parser. Should decrease, but usually not to 0. | +| `NER Loss` | Training loss for named entity recognizer. 
Should decrease, but usually not to 0. | +| `UAS` | Unlabeled attachment score for parser. The percentage of unlabeled correct arcs. Should increase. | +| `NER P.` | NER precision on development data. Should increase. | +| `NER R.` | NER recall on development data. Should increase. | +| `NER F.` | NER F-score on development data. Should increase. | +| `Tag %` | Fine-grained part-of-speech tag accuracy on development data. Should increase. | +| `Token %` | Tokenization accuracy on development data. | +| `CPU WPS` | Prediction speed on CPU in words per second, if available. Should stay stable. | +| `GPU WPS` | Prediction speed on GPU in words per second, if available. Should stay stable. | + +Note that if the development data has raw text, some of the gold-standard +entities might not align to the predicted tokenization. These tokenization +errors are **excluded from the NER evaluation**. If your tokenization makes it +impossible for the model to predict 50% of your entities, your NER F-score might +still look good. + + + +--- + +### Training config files {#cli} + + + + + +```ini +[training] +use_gpu = -1 +limit = 0 +dropout = 0.2 +patience = 1000 +eval_frequency = 20 +scores = ["ents_p", "ents_r", "ents_f"] +score_weights = {"ents_f": 1} +orth_variant_level = 0.0 +gold_preproc = false +max_length = 0 +seed = 0 +accumulate_gradient = 1 +discard_oversize = false + +[training.batch_size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 + +[training.optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 +use_averages = false + +[nlp] +lang = "en" +vectors = null + +[nlp.pipeline.ner] +factory = "ner" + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 3 +hidden_width = 128 +maxout_pieces = 3 +use_upper = true + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +width = 128 +depth = 4 +embed_size = 7000 +maxout_pieces = 3 +window_size = 1 +subword_features = true +pretrained_vectors = null +dropout = null +``` + +### Model architectures {#model-architectures} + + + +## Custom model implementations and architectures {#custom-models} + + + +### Training with custom code + + + +## Transfer learning {#transfer-learning} + +### Using transformer models like BERT {#transformers} + + + +### Pretraining with spaCy {#pretraining} + + + +## Internal training API {#api} + + The [`GoldParse`](/api/goldparse) object collects the annotated training examples, also called the **gold standard**. It's initialized with the @@ -234,12 +220,12 @@ it harder for the model to memorize the training data. For example, a `0.25` dropout means that each feature or internal representation has a 1/4 likelihood of being dropped. -> - [`begin_training()`](/api/language#begin_training): Start the training and +> - [`begin_training`](/api/language#begin_training): Start the training and > return an optimizer function to update the model's weights. Can take an > optional function converting the training data to spaCy's training format. -> - [`update()`](/api/language#update): Update the model with the training -> example and gold data. -> - [`to_disk()`](/api/language#to_disk): Save the updated model to a directory. +> - [`update`](/api/language#update): Update the model with the training example +> and gold data. +> - [`to_disk`](/api/language#to_disk): Save the updated model to a directory. 
```python ### Example training loop @@ -265,525 +251,4 @@ The [`nlp.update`](/api/language#update) method takes the following arguments: Instead of writing your own training loop, you can also use the built-in [`train`](/api/cli#train) command, which expects data in spaCy's [JSON format](/api/annotation#json-input). On each epoch, a model will be saved -out to the directory. After training, you can use the -[`package`](/api/cli#package) command to generate an installable Python package -from your model. - -```bash -python -m spacy convert /tmp/train.conllu /tmp/data -python -m spacy train en /tmp/model /tmp/data/train.json -n 5 -``` - -### Simple training style {#training-simple-style new="2"} - -Instead of sequences of `Doc` and `GoldParse` objects, you can also use the -"simple training style" and pass **raw texts** and **dictionaries of -annotations** to [`nlp.update`](/api/language#update). The dictionaries can have -the keys `entities`, `heads`, `deps`, `tags` and `cats`. This is generally -recommended, as it removes one layer of abstraction, and avoids unnecessary -imports. It also makes it easier to structure and load your training data. - -> #### Example Annotations -> -> ```python -> { -> "entities": [(0, 4, "ORG")], -> "heads": [1, 1, 1, 5, 5, 2, 7, 5], -> "deps": ["nsubj", "ROOT", "prt", "quantmod", "compound", "pobj", "det", "npadvmod"], -> "tags": ["PROPN", "VERB", "ADP", "SYM", "NUM", "NUM", "DET", "NOUN"], -> "cats": {"BUSINESS": 1.0}, -> } -> ``` - -```python -### Simple training loop -TRAIN_DATA = [ - ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}), - ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]})] - -nlp = spacy.blank("en") -optimizer = nlp.begin_training() -for i in range(20): - random.shuffle(TRAIN_DATA) - for text, annotations in TRAIN_DATA: - nlp.update([text], [annotations], sgd=optimizer) -nlp.to_disk("/model") -``` - -The above training loop leaves out a few details that can really improve -accuracy – but the principle really is _that_ simple. Once you've got your -pipeline together and you want to tune the accuracy, you usually want to process -your training examples in batches, and experiment with -[`minibatch`](/api/top-level#util.minibatch) sizes and dropout rates, set via -the `drop` keyword argument. See the [`Language`](/api/language) and -[`Pipe`](/api/pipe) API docs for available options. - -## Training the named entity recognizer {#ner} - -All [spaCy models](/models) support online learning, so you can update a -pretrained model with new examples. You'll usually need to provide many -**examples** to meaningfully improve the system — a few hundred is a good start, -although more is better. - -You should avoid iterating over the same few examples multiple times, or the -model is likely to "forget" how to annotate other examples. If you iterate over -the same few examples, you're effectively changing the loss function. The -optimizer will find a way to minimize the loss on your examples, without regard -for the consequences on the examples it's no longer paying attention to. One way -to avoid this -["catastrophic forgetting" problem](https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting) -is to "remind" the model of other examples by augmenting your annotations with -sentences annotated with entities automatically recognized by the original -model. Ultimately, this is an empirical process: you'll need to **experiment on -your data** to find a solution that works best for you. 
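One way to generate such "reminder" examples is to run the original, unmodified model over a sample of raw text and convert its predictions back into the simple training format shown above. The snippet below is only a rough sketch of that idea and is not part of the official example scripts; the model name and the raw texts are placeholders.

```python
import spacy

# Load the original, unmodified model and let it annotate raw text
original_nlp = spacy.load("en_core_web_sm")  # placeholder model name
raw_texts = [
    "Apple is looking at buying a U.K. startup for $1 billion.",
    "San Francisco considers banning sidewalk delivery robots.",
]

revision_data = []
for doc in original_nlp.pipe(raw_texts):
    # Convert the model's own predictions into (text, annotations) pairs
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    revision_data.append((doc.text, {"entities": entities}))

# Mix the revision examples in with your new annotations before shuffling
# and batching them in the training loop
TRAIN_DATA = revision_data + [
    ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
]
```

Keeping a proportion of these "revision" examples in every batch keeps the original predictions in the loss, which is the intuition behind the pseudo-rehearsal approach linked above.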
- -> #### Tip: Converting entity annotations -> -> You can train the entity recognizer with entity offsets or annotations in the -> [BILUO scheme](/api/annotation#biluo). The `spacy.gold` module also exposes -> [two helper functions](/api/goldparse#util) to convert offsets to BILUO tags, -> and BILUO tags to entity offsets. - -### Updating the Named Entity Recognizer {#example-train-ner} - -This example shows how to update spaCy's entity recognizer with your own -examples, starting off with an existing, pretrained model, or from scratch using -a blank `Language` class. To do this, you'll need **example texts** and the -**character offsets** and **labels** of each entity contained in the texts. - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py -``` - -#### Step by step guide {#step-by-step-ner} - -1. **Load the model** you want to start with, or create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. If - you're using a blank model, don't forget to add the entity recognizer to the - pipeline. If you're using an existing model, make sure to disable all other - pipeline components during training using - [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be - training the entity recognizer. -2. **Shuffle and loop over** the examples. For each example, **update the - model** by calling [`nlp.update`](/api/language#update), which steps through - the words of the input. At each word, it makes a **prediction**. It then - consults the annotations to see whether it was right. If it was wrong, it - adjusts its weights so that the correct action will score higher next time. -3. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). -4. **Test** the model to make sure the entities in the training data are - recognized correctly. - -### Training an additional entity type {#example-new-entity-type} - -This script shows how to add a new entity type `ANIMAL` to an existing -pretrained NER model, or an empty `Language` class. To keep the example short -and simple, only a few sentences are provided as examples. In practice, you'll -need many more — a few hundred would be a good start. You will also likely need -to mix in examples of other entity types, which might be obtained by running the -entity recognizer over unlabelled sentences, and adding their annotations to the -training set. - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_new_entity_type.py -``` - - - -If you're using an existing model, make sure to mix in examples of **other -entity types** that spaCy correctly recognized before. Otherwise, your model -might learn the new type, but "forget" what it previously knew. This is also -referred to as the "catastrophic forgetting" problem. - - - -#### Step by step guide {#step-by-step-ner-new} - -1. **Load the model** you want to start with, or create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. If - you're using a blank model, don't forget to add the entity recognizer to the - pipeline. If you're using an existing model, make sure to disable all other - pipeline components during training using - [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be - training the entity recognizer. -2. **Add the new entity label** to the entity recognizer using the - [`add_label`](/api/entityrecognizer#add_label) method. 
You can access the - entity recognizer in the pipeline via `nlp.get_pipe('ner')`. -3. **Loop over** the examples and call [`nlp.update`](/api/language#update), - which steps through the words of the input. At each word, it makes a - **prediction**. It then consults the annotations, to see whether it was - right. If it was wrong, it adjusts its weights so that the correct action - will score higher next time. -4. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). -5. **Test** the model to make sure the new entity is recognized correctly. - -## Training the tagger and parser {#tagger-parser} - -### Updating the Dependency Parser {#example-train-parser} - -This example shows how to train spaCy's dependency parser, starting off with an -existing model or a blank model. You'll need a set of **training examples** and -the respective **heads** and **dependency label** for each token of the example -texts. - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py -``` - -#### Step by step guide {#step-by-step-parser} - -1. **Load the model** you want to start with, or create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. If - you're using a blank model, don't forget to add the parser to the pipeline. - If you're using an existing model, make sure to disable all other pipeline - components during training using - [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be - training the parser. -2. **Add the dependency labels** to the parser using the - [`add_label`](/api/dependencyparser#add_label) method. If you're starting off - with a pretrained spaCy model, this is usually not necessary – but it doesn't - hurt either, just to be safe. -3. **Shuffle and loop over** the examples. For each example, **update the - model** by calling [`nlp.update`](/api/language#update), which steps through - the words of the input. At each word, it makes a **prediction**. It then - consults the annotations to see whether it was right. If it was wrong, it - adjusts its weights so that the correct action will score higher next time. -4. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). -5. **Test** the model to make sure the parser works as expected. - -### Updating the Part-of-speech Tagger {#example-train-tagger} - -In this example, we're training spaCy's part-of-speech tagger with a custom tag -map. We start off with a blank `Language` class, update its defaults with our -custom tags and then train the tagger. You'll need a set of **training -examples** and the respective **custom tags**, as well as a dictionary mapping -those tags to the -[Universal Dependencies scheme](http://universaldependencies.github.io/docs/u/pos/index.html). - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_tagger.py -``` - -#### Step by step guide {#step-by-step-tagger} - -1. **Load the model** you want to start with, or create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. If - you're using a blank model, don't forget to add the tagger to the pipeline. - If you're using an existing model, make sure to disable all other pipeline - components during training using - [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be - training the tagger. -2. **Add the tag map** to the tagger using the - [`add_label`](/api/tagger#add_label) method. 
The first argument is the new - tag name, the second the mapping to spaCy's coarse-grained tags, e.g. - `{'pos': 'NOUN'}`. -3. **Shuffle and loop over** the examples. For each example, **update the - model** by calling [`nlp.update`](/api/language#update), which steps through - the words of the input. At each word, it makes a **prediction**. It then - consults the annotations to see whether it was right. If it was wrong, it - adjusts its weights so that the correct action will score higher next time. -4. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). -5. **Test** the model to make sure the parser works as expected. - -### Training a parser for custom semantics {#intent-parser} - -spaCy's parser component can be used to be trained to predict any type of tree -structure over your input text – including **semantic relations** that are not -syntactic dependencies. This can be useful to for **conversational -applications**, which need to predict trees over whole documents or chat logs, -with connections between the sentence roots used to annotate discourse -structure. For example, you can train spaCy's parser to label intents and their -targets, like attributes, quality, time and locations. The result could look -like this: - -![Custom dependencies](../images/displacy-custom-parser.svg) - -```python -doc = nlp("find a hotel with good wifi") -print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-']) -# [('find', 'ROOT', 'find'), ('hotel', 'PLACE', 'find'), -# ('good', 'QUALITY', 'wifi'), ('wifi', 'ATTRIBUTE', 'hotel')] -``` - -The above tree attaches "wifi" to "hotel" and assigns the dependency label -`ATTRIBUTE`. This may not be a correct syntactic dependency – but in this case, -it expresses exactly what we need: the user is looking for a hotel with the -attribute "wifi" of the quality "good". This query can then be processed by your -application and used to trigger the respective action – e.g. search the database -for hotels with high ratings for their wifi offerings. - -> #### Tip: merge phrases and entities -> -> To achieve even better accuracy, try merging multi-word tokens and entities -> specific to your domain into one token before parsing your text. You can do -> this by running the entity recognizer or -> [rule-based matcher](/usage/rule-based-matching) to find relevant spans, and -> merging them using [`Doc.retokenize`](/api/doc#retokenize). You could even add -> your own custom -> [pipeline component](/usage/processing-pipelines#custom-components) to do this -> automatically – just make sure to add it `before='parser'`. - -The following example shows a full implementation of a training loop for a -custom message parser for a common "chat intent": finding local businesses. Our -message semantics will have the following types of relations: `ROOT`, `PLACE`, -`QUALITY`, `ATTRIBUTE`, `TIME` and `LOCATION`. - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_intent_parser.py -``` - -#### Step by step guide {#step-by-step-parser-custom} - -1. **Create the training data** consisting of words, their heads and their - dependency labels in order. A token's head is the index of the token it is - attached to. The heads don't need to be syntactically correct – they should - express the **semantic relations** you want the parser to learn. For words - that shouldn't receive a label, you can choose an arbitrary placeholder, for - example `-`. -2. 
**Load the model** you want to start with, or create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. If - you're using a blank model, don't forget to add the custom parser to the - pipeline. If you're using an existing model, make sure to **remove the old - parser** from the pipeline, and disable all other pipeline components during - training using [`nlp.select_pipes`](/api/language#select_pipes). This way, - you'll only be training the parser. -3. **Add the dependency labels** to the parser using the - [`add_label`](/api/dependencyparser#add_label) method. -4. **Shuffle and loop over** the examples. For each example, **update the - model** by calling [`nlp.update`](/api/language#update), which steps through - the words of the input. At each word, it makes a **prediction**. It then - consults the annotations to see whether it was right. If it was wrong, it - adjusts its weights so that the correct action will score higher next time. -5. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). -6. **Test** the model to make sure the parser works as expected. - -## Training a text classification model {#textcat} - -### Adding a text classifier to a spaCy model {#example-textcat new="2"} - -This example shows how to train a convolutional neural network text classifier -on IMDB movie reviews, using spaCy's new -[`TextCategorizer`](/api/textcategorizer) component. The dataset will be loaded -automatically via Thinc's built-in dataset loader. Predictions are available via -[`Doc.cats`](/api/doc#attributes). - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_textcat.py -``` - -#### Step by step guide {#step-by-step-textcat} - -1. **Load the model** you want to start with, or create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. If - you're using an existing model, make sure to disable all other pipeline - components during training using - [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be - training the text classifier. -2. **Add the text classifier** to the pipeline, and add the labels you want to - train – for example, `POSITIVE`. -3. **Load and pre-process the dataset**, shuffle the data and split off a part - of it to hold back for evaluation. This way, you'll be able to see results on - each training iteration. -4. **Loop over** the training examples and partition them into batches using - spaCy's [`minibatch`](/api/top-level#util.minibatch) and - [`compounding`](/api/top-level#util.compounding) helpers. -5. **Update the model** by calling [`nlp.update`](/api/language#update), which - steps through the examples and makes a **prediction**. It then consults the - annotations to see whether it was right. If it was wrong, it adjusts its - weights so that the correct prediction will score higher next time. -6. Optionally, you can also **evaluate the text classifier** on each iteration, - by checking how it performs on the development data held back from the - dataset. This lets you print the **precision**, **recall** and **F-score**. -7. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). -8. **Test** the model to make sure the text classifier works as expected. - -## Entity linking {#entity-linker} - -To train an entity linking model, you first need to define a knowledge base -(KB). - -### Creating a knowledge base {#kb} - -A KB consists of a list of entities with unique identifiers. 
Each such entity -has an entity vector that will be used to measure similarity with the context in -which an entity is used. These vectors have a fixed length and are stored in the -KB. - -The following example shows how to build a knowledge base from scratch, given a -list of entities and potential aliases. The script requires an `nlp` model with -pretrained word vectors to obtain an encoding of an entity's description as its -vector. - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/create_kb.py -``` - -#### Step by step guide {#step-by-step-kb} - -1. **Load the model** you want to start with. It should contain pretrained word - vectors. -2. **Obtain the entity embeddings** by running the descriptions of the entities - through the `nlp` model and taking the average of all words with - `nlp(desc).vector`. At this point, a custom encoding step can also be used. -3. **Construct the KB** by defining all entities with their embeddings, and all - aliases with their prior probabilities. -4. **Save** the KB using [`kb.dump`](/api/kb#dump). -5. **Print** the contents of the KB to make sure the entities were added - correctly. - -### Training an entity linking model {#entity-linker-model} - -This example shows how to create an entity linker pipe using a previously -created knowledge base. The entity linker is then trained with a set of custom -examples. To do so, you need to provide **example texts**, and the **character -offsets** and **knowledge base identifiers** of each entity contained in the -texts. - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py -``` - -#### Step by step guide {#step-by-step-entity-linker} - -1. **Load the KB** you want to start with, and specify the path to the `Vocab` - object that was used to create this KB. Then, create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. Add - a component for recognizing sentences en one for identifying relevant - entities. In practical applications, you will want a more advanced pipeline - including also a component for - [named entity recognition](/usage/training#ner). Then, create a new entity - linker component, add the KB to it, and then add the entity linker to the - pipeline. If you're using a model with additional components, make sure to - disable all other pipeline components during training using - [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be - training the entity linker. -2. **Shuffle and loop over** the examples. For each example, **update the - model** by calling [`nlp.update`](/api/language#update), which steps through - the annotated examples of the input. For each combination of a mention in - text and a potential KB identifier, the model makes a **prediction** whether - or not this is the correct match. It then consults the annotations to see - whether it was right. If it was wrong, it adjusts its weights so that the - correct combination will score higher next time. -3. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). -4. **Test** the model to make sure the entities in the training data are - recognized correctly. - -## Optimization tips and advice {#tips} - -There are lots of conflicting "recipes" for training deep neural networks at the -moment. The cutting-edge models take a very long time to train, so most -researchers can't run enough experiments to figure out what's _really_ going on. 
-For what it's worth, here's a recipe that seems to work well on a lot of NLP -problems: - -1. Initialize with batch size 1, and compound to a maximum determined by your - data size and problem type. -2. Use Adam solver with fixed learning rate. -3. Use averaged parameters -4. Use L2 regularization. -5. Clip gradients by L2 norm to 1. -6. On small data sizes, start at a high dropout rate, with linear decay. - -This recipe has been cobbled together experimentally. Here's why the various -elements of the recipe made enough sense to try initially, and what you might -try changing, depending on your problem. - -### Compounding batch size {#tips-batch-size} - -The trick of increasing the batch size is starting to become quite popular (see -[Smith et al., 2017](https://arxiv.org/abs/1711.00489)). Their recipe is quite -different from how spaCy's models are being trained, but there are some -similarities. In training the various spaCy models, we haven't found much -advantage from decaying the learning rate – but starting with a low batch size -has definitely helped. You should try it out on your data, and see how you go. -Here's our current strategy: - -```python -### Batch heuristic -def get_batches(train_data, model_type): - max_batch_sizes = {"tagger": 32, "parser": 16, "ner": 16, "textcat": 64} - max_batch_size = max_batch_sizes[model_type] - if len(train_data) < 1000: - max_batch_size /= 2 - if len(train_data) < 500: - max_batch_size /= 2 - batch_size = compounding(1, max_batch_size, 1.001) - batches = minibatch(train_data, size=batch_size) - return batches -``` - -This will set the batch size to start at `1`, and increase each batch until it -reaches a maximum size. The tagger, parser and entity recognizer all take whole -sentences as input, so they're learning a lot of labels in a single example. You -therefore need smaller batches for them. The batch size for the text categorizer -should be somewhat larger, especially if your documents are long. - -### Learning rate, regularization and gradient clipping {#tips-hyperparams} - -By default spaCy uses the Adam solver, with default settings -(`learn_rate=0.001`, `beta1=0.9`, `beta2=0.999`). Some researchers have said -they found these settings terrible on their problems – but they've always -performed very well in training spaCy's models, in combination with the rest of -our recipe. You can change these settings directly, by modifying the -corresponding attributes on the `optimizer` object. You can also set environment -variables, to adjust the defaults. - -There are two other key hyper-parameters of the solver: `L2` **regularization**, -and **gradient clipping** (`max_grad_norm`). Gradient clipping is a hack that's -not discussed often, but everybody seems to be using. It's quite important in -helping to ensure the network doesn't diverge, which is a fancy way of saying -"fall over during training". The effect is sort of similar to setting the -learning rate low. It can also compensate for a large batch size (this is a good -example of how the choices of all these hyper-parameters intersect). - -### Dropout rate {#tips-dropout} - -For small datasets, it's useful to set a **high dropout rate at first**, and -**decay** it down towards a more reasonable value. This helps avoid the network -immediately overfitting, while still encouraging it to learn some of the more -interesting things in your data. spaCy comes with a -[`decaying`](/api/top-level#util.decaying) utility function to facilitate this. 
-You might try setting: - -```python -from spacy.util import decaying -dropout = decaying(0.6, 0.2, 1e-4) -``` - -You can then draw values from the iterator with `next(dropout)`, which you would -pass to the `drop` keyword argument of [`nlp.update`](/api/language#update). -It's pretty much always a good idea to use at least **some dropout**. All of the -models currently use Bernoulli dropout, for no particularly principled reason – -we just haven't experimented with another scheme like Gaussian dropout yet. - -### Parameter averaging {#tips-param-avg} - -The last part of our optimization recipe is **parameter averaging**, an old -trick introduced by -[Freund and Schapire (1999)](https://cseweb.ucsd.edu/~yfreund/papers/LargeMarginsUsingPerceptron.pdf), -popularized in the NLP community by -[Collins (2002)](http://www.aclweb.org/anthology/P04-1015), and explained in -more detail by [Leon Bottou](http://leon.bottou.org/projects/sgd). Just about -the only other people who seem to be using this for neural network training are -the SyntaxNet team (one of whom is Michael Collins) – but it really seems to -work great on every problem. - -The trick is to store the moving average of the weights during training. We -don't optimize this average – we just track it. Then when we want to actually -use the model, we use the averages, not the most recent value. In spaCy (and -[Thinc](https://github.com/explosion/thinc)) this is done by using a context -manager, [`use_params`](/api/language#use_params), to temporarily replace the -weights: - -```python -with nlp.use_params(optimizer.averages): - nlp.to_disk("/model") -``` - -The context manager is handy because you naturally want to evaluate and save the -model at various points during training (e.g. after each epoch). After -evaluating and saving, the context manager will exit and the weights will be -restored, so you resume training from the most recent value, rather than the -average. By evaluating the model after each epoch, you can remove one -hyper-parameter from consideration (the number of epochs). Having one less magic -number to guess is extremely nice – so having the averaging under a context -manager is very convenient. +out to the directory. 
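Taken together, the compounding batch size, decaying dropout and parameter averaging tips above amount to one short training loop. The sketch below shows one way they might be wired up, assuming spaCy v2's `nlp.update` training API; the `TRAIN_DATA` examples, the `PLACE` label and the `/model` output path are hypothetical placeholders for illustration only, not part of this patch.

```python
### Optimization recipe sketch
import random

import spacy
from spacy.util import minibatch, compounding, decaying

# Hypothetical toy examples – replace with your own (text, annotations) pairs
TRAIN_DATA = [
    ("find a hotel with good wifi", {"entities": [(7, 12, "PLACE")]}),
    ("find me the cheapest gym near work", {"entities": [(21, 24, "PLACE")]}),
]

nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label("PLACE")  # assumed custom label, purely for illustration

optimizer = nlp.begin_training()
# start with a high dropout rate and decay it linearly, as described above
dropout = decaying(0.6, 0.2, 1e-4)

for epoch in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    # compound the batch size from 1 up to a maximum of 16
    for batch in minibatch(TRAIN_DATA, size=compounding(1.0, 16.0, 1.001)):
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer,
                   drop=next(dropout), losses=losses)
    print("Losses", losses)

# save the averaged parameters, not the most recent weight values
with nlp.use_params(optimizer.averages):
    nlp.to_disk("/model")
```

Drawing a fresh value with `next(dropout)` on every batch is what produces the linear dropout decay described above, and exporting under `nlp.use_params(optimizer.averages)` ensures the saved model uses the averaged weights rather than the last update.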
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index e509dade4..015051f95 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -19,7 +19,7 @@ { "text": "Rule-based Matching", "url": "/usage/rule-based-matching" }, { "text": "Processing Pipelines", "url": "/usage/processing-pipelines" }, { "text": "Vectors & Similarity", "url": "/usage/vectors-similarity" }, - { "text": "Training Models", "url": "/usage/training" }, + { "text": "Training Models", "url": "/usage/training", "tag": "new" }, { "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" }, { "text": "Saving & Loading", "url": "/usage/saving-loading" }, { "text": "Visualizers", "url": "/usage/visualizers" } @@ -54,7 +54,8 @@ { "label": "Overview", "items": [ - { "text": "Architecture", "url": "/api" }, + { "text": "Library Architecture", "url": "/api" }, + { "text": "Model Architectures", "url": "/api/architectures" }, { "text": "Annotation Specs", "url": "/api/annotation" }, { "text": "Command Line", "url": "/api/cli" }, { "text": "Functions", "url": "/api/top-level" } diff --git a/website/src/styles/layout.sass b/website/src/styles/layout.sass index c97013ab2..56f1a5aa6 100644 --- a/website/src/styles/layout.sass +++ b/website/src/styles/layout.sass @@ -321,20 +321,20 @@ body [id]:target &.comment, &.prolog, &.doctype, &.cdata, &.punctuation color: var(--syntax-comment) - &.property, &.tag, &.constant, &.symbol + &.property, &.tag, &.symbol color: var(--syntax-tag) &.boolean, &.number color: var(--syntax-number) - &.selector, &.attr-name, &.string, &.char, &.builtin + &.attr-name, &.string, &.char, &.builtin, &.attr-value color: var(--syntax-selector) @at-root .language-css .token.string, &.operator, &.entity, &.url, &.variable color: var(--syntax-operator) - &.atrule, &.attr-value, &.function + &.atrule, &.function, &.selector color: var(--syntax-function) &.regex, &.important @@ -395,13 +395,13 @@ body [id]:target .cm-comment color: var(--syntax-comment) - .cm-keyword, .cm-builtin + .cm-keyword color: var(--syntax-keyword) .cm-operator color: var(--syntax-operator) - .cm-string + .cm-string, .cm-builtin color: var(--syntax-selector) .cm-number From 1e0d54edd1fb3bbe9c16c741a63dbc00a41d84e6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 4 Jul 2020 14:23:10 +0200 Subject: [PATCH 04/10] Update docs --- website/docs/api/cli.md | 126 ++++++++-------- website/docs/api/corpus.md | 37 +++++ website/docs/api/doc.md | 32 ++--- website/docs/api/goldcorpus.md | 24 ---- website/docs/api/sentencerecognizer.md | 29 ++++ website/docs/api/sentencizer.md | 35 ++--- website/docs/api/span.md | 10 +- website/docs/api/token.md | 8 +- website/docs/api/top-level.md | 167 +++++++++++++++++++++- website/docs/usage/101/_architecture.md | 2 + website/docs/usage/models.md | 4 +- website/docs/usage/saving-loading.md | 65 +-------- website/docs/usage/training.md | 9 ++ website/meta/sidebars.json | 8 +- website/src/components/code.js | 3 +- website/src/widgets/quickstart-install.js | 2 +- 16 files changed, 354 insertions(+), 207 deletions(-) create mode 100644 website/docs/api/corpus.md delete mode 100644 website/docs/api/goldcorpus.md create mode 100644 website/docs/api/sentencerecognizer.md diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index c90d7c69c..8dccad165 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -13,6 +13,7 @@ menu: - ['Init Model', 'init-model'] - ['Evaluate', 'evaluate'] - ['Package', 'package'] + - ['Project', 'project'] --- For a list of 
available commands, type `spacy --help`. @@ -95,26 +96,29 @@ $ python -m spacy validate ## Convert {#convert} -Convert files into spaCy's [JSON format](/api/annotation#json-input) for use -with the `train` command and other experiment management functions. The -converter can be specified on the command line, or chosen based on the file -extension of the input file. +Convert files into spaCy's +[binary training data format](/usage/training#data-format), a serialized +[`DocBin`](/api/docbin), for use with the `train` command and other experiment +management functions. The converter can be specified on the command line, or +chosen based on the file extension of the input file. ```bash -$ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter] -[--n-sents] [--morphology] [--lang] +$ python -m spacy convert [input_file] [output_dir] [--converter] +[--file-type] [--n-sents] [--seg-sents] [--model] [--morphology] +[--merge-subtokens] [--ner-map] [--lang] ``` | Argument | Type | Description | | ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------ | | `input_file` | positional | Input file. | | `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. | -| `--file-type`, `-t` 2.1 | option | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. | | `--converter`, `-c` 2 | option | Name of converter to use (see below). | +| `--file-type`, `-t` 2.1 | option | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. | | `--n-sents`, `-n` | option | Number of sentences per document. | | `--seg-sents`, `-s` 2.2 | flag | Segment sentences (for `-c ner`) | | `--model`, `-b` 2.2 | option | Model for parser-based sentence segmentation (for `-s`) | | `--morphology`, `-m` | option | Enable appending morphology to tags. | +| `--ner-map`, `-nm` | option | NER tag mapping (as JSON-encoded dict of entity types). | | `--lang`, `-l` 2.1 | option | Language code (if tokenizer required). | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | binary | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). | @@ -136,20 +140,21 @@ stats, and find problems like invalid entity annotations, cyclic dependencies, low data labels and more. ```bash -$ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format] +$ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] +[--pipeline] [--tag-map-path] [--ignore-warnings] [--verbose] [--no-format] ``` -| Argument | Type | Description | -| ------------------------------------------------------ | ---------- | -------------------------------------------------------------------------------------------------- | -| `lang` | positional | Model language. | -| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | -| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | -| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | -| `--base-model`, `-b` | option | Optional name of base model to update. 
Can be any loadable spaCy model. | -| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | -| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | -| `--verbose`, `-V` | flag | Print additional information and explanations. | -| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | +| Argument | Type | Description | +| ------------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------- | +| `lang` | positional | Model language. | +| `train_path` | positional | Location of [binary training data](/usage/training#data-format). Can be a file or a directory of files. | +| `dev_path` | positional | Location of [binary development data](/usage/training#data-format) for evaluation. Can be a file or a directory of files. | +| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | +| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | +| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | +| `--verbose`, `-V` | flag | Print additional information and explanations. | +| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | @@ -292,6 +297,8 @@ will not be available. ## Train {#train} + + Train a model. Expects data in spaCy's [JSON format](/api/annotation#json-input). On each epoch, a model will be saved out to the directory. Accuracy scores and model details will be added to a @@ -345,47 +352,10 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | model, pickle | A spaCy model on each epoch. | -### Environment variables for hyperparameters {#train-hyperparams new="2"} - -spaCy lets you set hyperparameters for training via environment variables. For -example: - -```bash -$ token_vector_width=256 learn_rate=0.0001 spacy train [...] -``` - -> #### Usage with alias -> -> Environment variables keep the command simple and allow you to to -> [create an alias](https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537) -> for your custom `train` command while still being able to easily tweak the -> hyperparameters. -> -> ```bash -> alias train-parser="python -m spacy train en /output /data /train /dev -n 1000" -> token_vector_width=256 train-parser -> ``` - -| Name | Description | Default | -| -------------------- | --------------------------------------------------- | ------- | -| `dropout_from` | Initial dropout rate. | `0.2` | -| `dropout_to` | Final dropout rate. | `0.2` | -| `dropout_decay` | Rate of dropout change. | `0.0` | -| `batch_from` | Initial batch size. | `1` | -| `batch_to` | Final batch size. | `64` | -| `batch_compound` | Rate of batch size acceleration. | `1.001` | -| `token_vector_width` | Width of embedding tables and convolutional layers. | `128` | -| `embed_size` | Number of rows in embedding tables. | `7500` | -| `hidden_width` | Size of the parser's and NER's hidden layers. | `128` | -| `learn_rate` | Learning rate. | `0.001` | -| `optimizer_B1` | Momentum for the Adam solver. 
| `0.9` | -| `optimizer_B2` | Adagrad-momentum for the Adam solver. | `0.999` | -| `optimizer_eps` | Epsilon value for the Adam solver. | `1e-08` | -| `L2_penalty` | L2 regularization penalty. | `1e-06` | -| `grad_norm_clip` | Gradient L2 norm constraint. | `1.0` | - ## Pretrain {#pretrain new="2.1" tag="experimental"} + + Pre-train the "token to vector" (`tok2vec`) layer of pipeline components, using an approximate language-modeling objective. Specifically, we load pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which @@ -491,6 +461,8 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] ## Evaluate {#evaluate new="2"} + + Evaluate a model's accuracy and speed on JSON-formatted annotated data. Will print the results and optionally export [displaCy visualizations](/usage/visualizers) of a sample set of parses to @@ -516,12 +488,20 @@ $ python -m spacy evaluate [model] [data_path] [--displacy-path] [--displacy-lim ## Package {#package} -Generate a [model Python package](/usage/training#models-generating) from an -existing model data directory. All data files are copied over. If the path to a -`meta.json` is supplied, or a `meta.json` is found in the input directory, this -file is used. Otherwise, the data can be entered directly from the command line. -After packaging, you can run `python setup.py sdist` from the newly created -directory to turn your model into an installable archive file. +Generate an installable +[model Python package](/usage/training#models-generating) from an existing model +data directory. All data files are copied over. If the path to a `meta.json` is +supplied, or a `meta.json` is found in the input directory, this file is used. +Otherwise, the data can be entered directly from the command line. spaCy will +then create a `.tar.gz` archive file that you can distribute and install with +`pip install`. + + + +The `spacy package` command now also builds the `.tar.gz` archive automatically, +so you don't have to run `python setup.py sdist` separately anymore. + + ```bash $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] @@ -531,7 +511,6 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] ### Example python -m spacy package /input /output cd /output/en_model-0.0.0 -python setup.py sdist pip install dist/en_model-0.0.0.tar.gz ``` @@ -541,6 +520,23 @@ pip install dist/en_model-0.0.0.tar.gz | `output_dir` | positional | Directory to create package folder in. | | `--meta-path`, `-m` 2 | option | Path to `meta.json` file (optional). | | `--create-meta`, `-c` 2 | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. | +| `--version`, `-v` 3 | option | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. | | `--force`, `-f` | flag | Force overwriting of existing folder in output directory. | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | directory | A Python package containing the spaCy model. 
| + +## Project {#project} + + + +### project clone {#project-clone} + +### project assets {#project-assets} + +### project run-all {#project-run-all} + +### project run {#project-run} + +### project init {#project-init} + +### project update-dvc {#project-update-dvc} diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md new file mode 100644 index 000000000..3256849c3 --- /dev/null +++ b/website/docs/api/corpus.md @@ -0,0 +1,37 @@ +--- +title: Corpus +teaser: An annotated corpus +tag: class +source: spacy/gold/corpus.py +new: 3 +--- + +This class manages annotated corpora and can read training and development +datasets in the [DocBin](/api/docbin) (`.spacy`) format. + +## Corpus.\_\_init\_\_ {#init tag="method"} + +Create a `Corpus`. The input data can be a file or a directory of files. + +| Name | Type | Description | +| ----------- | ------------ | ---------------------------------------------------------------- | +| `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files). | +| `dev` | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). | +| `limit` | int | Maximum number of examples returned. | +| **RETURNS** | `Corpus` | The newly constructed object. | + + + +## Corpus.walk_corpus {#walk_corpus tag="staticmethod"} + +## Corpus.make_examples {#make_examples tag="method"} + +## Corpus.make_examples_gold_preproc {#make_examples_gold_preproc tag="method"} + +## Corpus.read_docbin {#read_docbin tag="method"} + +## Corpus.count_train {#count_train tag="method"} + +## Corpus.train_dataset {#train_dataset tag="method"} + +## Corpus.dev_dataset {#dev_dataset tag="method"} diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 3b31b2c80..b5871f2ab 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -123,7 +123,7 @@ details, see the documentation on | Name | Type | Description | | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `doc._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -140,8 +140,8 @@ Look up a previously registered extension by name. Returns a 4-tuple > > ```python > from spacy.tokens import Doc -> Doc.set_extension('has_city', default=False) -> extension = Doc.get_extension('has_city') +> Doc.set_extension("has_city", default=False) +> extension = Doc.get_extension("has_city") > assert extension == (False, None, None, None) > ``` @@ -158,8 +158,8 @@ Check whether an extension has been registered on the `Doc` class. > > ```python > from spacy.tokens import Doc -> Doc.set_extension('has_city', default=False) -> assert Doc.has_extension('has_city') +> Doc.set_extension("has_city", default=False) +> assert Doc.has_extension("has_city") > ``` | Name | Type | Description | @@ -175,9 +175,9 @@ Remove a previously registered extension. 
> > ```python > from spacy.tokens import Doc -> Doc.set_extension('has_city', default=False) -> removed = Doc.remove_extension('has_city') -> assert not Doc.has_extension('has_city') +> Doc.set_extension("has_city", default=False) +> removed = Doc.remove_extension("has_city") +> assert not Doc.has_extension("has_city") > ``` | Name | Type | Description | @@ -204,7 +204,7 @@ the character indices don't map to a valid span. | `end` | int | The index of the last character after the span. | | `label` | uint64 / str | A label to attach to the span, e.g. for named entities. | | `kb_id` 2.2 | uint64 / str | An ID from a knowledge base to capture the meaning of a named entity. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | | **RETURNS** | `Span` | The newly constructed object or `None`. | ## Doc.similarity {#similarity tag="method" model="vectors"} @@ -264,7 +264,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | Name | Type | Description | | ----------- | -------------------------------------- | ----------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=2, dtype='int32']` | The lowest common ancestor matrix of the `Doc`. | +| **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Doc`. | ## Doc.to_json {#to_json tag="method" new="2.1"} @@ -303,7 +303,7 @@ Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence of `M` attributes, the output array will be of shape `(N, M)`, where `N` is the length of the `Doc` (in tokens). If `attr_ids` is a single attribute, the output shape will be `(N,)`. You can specify attributes by integer ID (e.g. -`spacy.attrs.LEMMA`) or string name (e.g. 'LEMMA' or 'lemma'). The values will +`spacy.attrs.LEMMA`) or string name (e.g. "LEMMA" or "lemma"). The values will be 64-bit integers. Returns a 2D array with one row per token and one column per attribute (when @@ -323,7 +323,7 @@ Returns a 2D array with one row per token and one column per attribute (when | Name | Type | Description | | ----------- | ---------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | | `attr_ids` | list or int or string | A list of attributes (int IDs or string names) or a single attribute (int ID or string name) | -| **RETURNS** | `numpy.ndarray[ndim=2, dtype='uint64']` or `numpy.ndarray[ndim=1, dtype='uint64']` | The exported attributes as a numpy array. | +| **RETURNS** | `numpy.ndarray[ndim=2, dtype="uint64"]` or `numpy.ndarray[ndim=1, dtype="uint64"]` | The exported attributes as a numpy array. | ## Doc.from_array {#from_array tag="method"} @@ -345,14 +345,14 @@ array of attributes. | Name | Type | Description | | ----------- | -------------------------------------- | ------------------------------------------------------------------------- | | `attrs` | list | A list of attribute ID ints. | -| `array` | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. | +| `array` | `numpy.ndarray[ndim=2, dtype="int32"]` | The attribute values to load. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `Doc` | Itself. | - ## Doc.from_docs {#from_docs tag="staticmethod"} -Concatenate multiple `Doc` objects to form a new one. 
Raises an error if the `Doc` objects do not all share the same `Vocab`. +Concatenate multiple `Doc` objects to form a new one. Raises an error if the +`Doc` objects do not all share the same `Vocab`. > #### Example > @@ -634,7 +634,7 @@ vectors. | Name | Type | Description | | ----------- | ---------------------------------------- | ------------------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A 1D numpy array representing the document's semantics. | +| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the document's semantics. | ## Doc.vector_norm {#vector_norm tag="property" model="vectors"} diff --git a/website/docs/api/goldcorpus.md b/website/docs/api/goldcorpus.md deleted file mode 100644 index 7767b28bd..000000000 --- a/website/docs/api/goldcorpus.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: GoldCorpus -teaser: An annotated corpus, using the JSON file format -tag: class -source: spacy/gold.pyx -new: 2 ---- - -This class manages annotations for tagging, dependency parsing and NER. - -## GoldCorpus.\_\_init\_\_ {#init tag="method"} - -Create a `GoldCorpus`. IF the input data is an iterable, each item should be a -`(text, paragraphs)` tuple, where each paragraph is a tuple -`(sentences, brackets)`, and each sentence is a tuple -`(ids, words, tags, heads, ner)`. See the implementation of -[`gold.read_json_file`](https://github.com/explosion/spaCy/tree/master/spacy/gold.pyx) -for further details. - -| Name | Type | Description | -| ----------- | ----------------------- | ------------------------------------------------------------ | -| `train` | str / `Path` / iterable | Training data, as a path (file or directory) or iterable. | -| `dev` | str / `Path` / iterable | Development data, as a path (file or directory) or iterable. | -| **RETURNS** | `GoldCorpus` | The newly constructed object. | diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md new file mode 100644 index 000000000..367b79e5d --- /dev/null +++ b/website/docs/api/sentencerecognizer.md @@ -0,0 +1,29 @@ +--- +title: SentenceRecognizer +tag: class +source: spacy/pipeline/pipes.pyx +new: 3 +--- + +A trainable pipeline component for sentence segmentation. For a simpler, +ruse-based strategy, see the [`Sentencizer`](/api/sentencizer). This class is a +subclass of `Pipe` and follows the same API. The component is also available via +the string name `"senter"`. After initialization, it is typically added to the +processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). + +## SentenceRecognizer.\_\_init\_\_ {#init tag="method"} + +Initialize the sentence recognizer. + +> #### Example +> +> ```python +> # Construction via create_pipe +> senter = nlp.create_pipe("senter") +> +> # Construction from class +> from spacy.pipeline import SentenceRecognizer +> senter = SentenceRecognizer() +> ``` + + diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index 14482c353..9c6e2d58c 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -12,19 +12,6 @@ require a statistical model to be loaded. The component is also available via the string name `"sentencizer"`. After initialization, it is typically added to the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). - - -Compared to the previous `SentenceSegmenter` class, the `Sentencizer` component -doesn't add a hook to `doc.user_hooks["sents"]`. 
Instead, it iterates over the -tokens in the `Doc` and sets the `Token.is_sent_start` property. The -`SentenceSegmenter` is still available if you import it directly: - -```python -from spacy.pipeline import SentenceSegmenter -``` - - - ## Sentencizer.\_\_init\_\_ {#init tag="method"} Initialize the sentencizer. @@ -40,10 +27,24 @@ Initialize the sentencizer. > sentencizer = Sentencizer() > ``` -| Name | Type | Description | -| ------------- | ------------- | ------------------------------------------------------------------------------------------------------ | -| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. Defaults to `['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。']`. | -| **RETURNS** | `Sentencizer` | The newly constructed object. | +| Name | Type | Description | +| ------------- | ------------- | ----------------------------------------------------------------------------------------------- | +| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. | +| **RETURNS** | `Sentencizer` | The newly constructed object. | + +```python +### punct_chars defaults +['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', + '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', + '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', + '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', + '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', + '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', + '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', + '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', + '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', + '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。'] +``` ## Sentencizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 4d10c08d9..668013e76 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -25,7 +25,7 @@ Create a Span object from the slice `doc[start : end]`. | `end` | int | The index of the first token after the span. | | `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. | | `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | | **RETURNS** | `Span` | The newly constructed object. 
| ## Span.\_\_getitem\_\_ {#getitem tag="method"} @@ -110,7 +110,7 @@ For details, see the documentation on | Name | Type | Description | | --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `span._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -191,7 +191,7 @@ the character indices don't map to a valid span. | `end` | int | The index of the last character after the span. | | `label` | uint64 / str | A label to attach to the span, e.g. for named entities. | | `kb_id` | uint64 / str | An ID from a knowledge base to capture the meaning of a named entity. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | | **RETURNS** | `Span` | The newly constructed object or `None`. | ## Span.similarity {#similarity tag="method" model="vectors"} @@ -232,7 +232,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | Name | Type | Description | | ----------- | -------------------------------------- | ------------------------------------------------ | -| **RETURNS** | `numpy.ndarray[ndim=2, dtype='int32']` | The lowest common ancestor matrix of the `Span`. | +| **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Span`. | ## Span.to_array {#to_array tag="method" new="2"} @@ -440,7 +440,7 @@ vectors. | Name | Type | Description | | ----------- | ---------------------------------------- | --------------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A 1D numpy array representing the span's semantics. | +| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the span's semantics. | ## Span.vector_norm {#vector_norm tag="property" model="vectors"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 2d25d9db2..549189cad 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -58,7 +58,7 @@ For details, see the documentation on | Name | Type | Description | | --------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `token._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `token._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `token._.compare(other_token)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. 
| @@ -370,7 +370,7 @@ A real-valued meaning representation. | Name | Type | Description | | ----------- | ---------------------------------------- | ---------------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A 1D numpy array representing the token's semantics. | +| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the token's semantics. | ## Token.vector_norm {#vector_norm tag="property" model="vectors"} @@ -435,8 +435,8 @@ The L2 norm of the token's vector representation. | `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | | `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | | `is_punct` | bool | Is the token punctuation? | -| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? | -| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? | +| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? | +| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? | | `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | | `is_bracket` | bool | Is the token a bracket? | | `is_quote` | bool | Is the token a quotation mark? | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 01bc712a8..fe0952c9f 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -3,6 +3,7 @@ title: Top-level Functions menu: - ['spacy', 'spacy'] - ['displacy', 'displacy'] + - ['Data & Alignment', 'gold'] - ['Utility Functions', 'util'] --- @@ -76,8 +77,8 @@ meta data as a dictionary instead, you can use the `meta` attribute on your > > ```python > spacy.info() -> spacy.info("en") -> spacy.info("de", markdown=True) +> spacy.info("en_core_web_sm") +> spacy.info(markdown=True) > ``` | Name | Type | Description | @@ -258,6 +259,156 @@ colors for them. Your application or model package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically. +## Training data and alignment {#gold source="spacy/gold"} + +### gold.docs_to_json {#docs_to_json tag="function"} + +Convert a list of Doc objects into the +[JSON-serializable format](/api/annotation#json-input) used by the +[`spacy train`](/api/cli#train) command. Each input doc will be treated as a +'paragraph' in the output doc. + +> #### Example +> +> ```python +> from spacy.gold import docs_to_json +> +> doc = nlp("I like London") +> json_data = docs_to_json([doc]) +> ``` + +| Name | Type | Description | +| ----------- | ---------------- | ------------------------------------------ | +| `docs` | iterable / `Doc` | The `Doc` object(s) to convert. | +| `id` | int | ID to assign to the JSON. Defaults to `0`. | +| **RETURNS** | dict | The data in spaCy's JSON format. | + +### gold.align {#align tag="function"} + +Calculate alignment tables between two tokenizations, using the Levenshtein +algorithm. The alignment is case-insensitive. + + + +The current implementation of the alignment algorithm assumes that both +tokenizations add up to the same string. For example, you'll be able to align +`["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not +`["I", "'m"]` and `["I", "am"]`. 
+ + + +> #### Example +> +> ```python +> from spacy.gold import align +> +> bert_tokens = ["obama", "'", "s", "podcast"] +> spacy_tokens = ["obama", "'s", "podcast"] +> alignment = align(bert_tokens, spacy_tokens) +> cost, a2b, b2a, a2b_multi, b2a_multi = alignment +> ``` + +| Name | Type | Description | +| ----------- | ----- | -------------------------------------------------------------------------- | +| `tokens_a` | list | String values of candidate tokens to align. | +| `tokens_b` | list | String values of reference tokens to align. | +| **RETURNS** | tuple | A `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the alignment. | + +The returned tuple contains the following alignment information: + +> #### Example +> +> ```python +> a2b = array([0, -1, -1, 2]) +> b2a = array([0, 2, 3]) +> a2b_multi = {1: 1, 2: 1} +> b2a_multi = {} +> ``` +> +> If `a2b[3] == 2`, that means that `tokens_a[3]` aligns to `tokens_b[2]`. If +> there's no one-to-one alignment for a token, it has the value `-1`. + +| Name | Type | Description | +| ----------- | -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `cost` | int | The number of misaligned tokens. | +| `a2b` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_a` to indices in `tokens_b`. | +| `b2a` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_b` to indices in `tokens_a`. | +| `a2b_multi` | dict | A dictionary mapping indices in `tokens_a` to indices in `tokens_b`, where multiple tokens of `tokens_a` align to the same token of `tokens_b`. | +| `b2a_multi` | dict | A dictionary mapping indices in `tokens_b` to indices in `tokens_a`, where multiple tokens of `tokens_b` align to the same token of `tokens_a`. | + +### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} + +Encode labelled spans into per-token tags, using the +[BILUO scheme](/api/annotation#biluo) (Begin, In, Last, Unit, Out). Returns a +list of strings, describing the tags. Each tag string will be of the form of +either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, +`"L"`, `"U"`. The string `"-"` is used where the entity offsets don't align with +the tokenization in the `Doc` object. The training algorithm will view these as +missing values. `O` denotes a non-entity token. `B` denotes the beginning of a +multi-token entity, `I` the inside of an entity of three or more tokens, and `L` +the end of an entity of two or more tokens. `U` denotes a single-token entity. + +> #### Example +> +> ```python +> from spacy.gold import biluo_tags_from_offsets +> +> doc = nlp("I like London.") +> entities = [(7, 13, "LOC")] +> tags = biluo_tags_from_offsets(doc, entities) +> assert tags == ["O", "O", "U-LOC", "O"] +> ``` + +| Name | Type | Description | +| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. | +| `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. | +| **RETURNS** | list | str strings, describing the [BILUO](/api/annotation#biluo) tags. 
| + +### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} + +Encode per-token tags following the [BILUO scheme](/api/annotation#biluo) into +entity offsets. + +> #### Example +> +> ```python +> from spacy.gold import offsets_from_biluo_tags +> +> doc = nlp("I like London.") +> tags = ["O", "O", "U-LOC", "O"] +> entities = offsets_from_biluo_tags(doc, tags) +> assert entities == [(7, 13, "LOC")] +> ``` + +| Name | Type | Description | +| ----------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The document that the BILUO tags refer to. | +| `entities` | iterable | A sequence of [BILUO](/api/annotation#biluo) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | +| **RETURNS** | list | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. | + +### gold.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} + +Encode per-token tags following the [BILUO scheme](/api/annotation#biluo) into +[`Span`](/api/span) objects. This can be used to create entity spans from +token-based tags, e.g. to overwrite the `doc.ents`. + +> #### Example +> +> ```python +> from spacy.gold import spans_from_biluo_tags +> +> doc = nlp("I like London.") +> tags = ["O", "O", "U-LOC", "O"] +> doc.ents = spans_from_biluo_tags(doc, tags) +> ``` + +| Name | Type | Description | +| ----------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The document that the BILUO tags refer to. | +| `entities` | iterable | A sequence of [BILUO](/api/annotation#biluo) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | +| **RETURNS** | list | A sequence of `Span` objects with added entity labels. | + ## Utility functions {#util source="spacy/util.py"} spaCy comes with a small collection of utility functions located in @@ -341,7 +492,7 @@ class. The model data will then be loaded in via > #### Example > > ```python -> nlp = util.load_model("en") +> nlp = util.load_model("en_core_web_sm") > nlp = util.load_model("en_core_web_sm", disable=["ner"]) > nlp = util.load_model("/path/to/data") > ``` @@ -634,3 +785,13 @@ of one entity) or when merging spans with | ----------- | -------- | -------------------- | | `spans` | iterable | The spans to filter. | | **RETURNS** | list | The filtered spans. | + +## util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"} + + + +| Name | Type | Description | +| ----------- | ----- | ----------- | +| `words` | list | | +| `text` | str | | +| **RETURNS** | tuple | | diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md index 7cd749521..4363b9b4f 100644 --- a/website/docs/usage/101/_architecture.md +++ b/website/docs/usage/101/_architecture.md @@ -12,6 +12,8 @@ place** by the components of the pipeline. The `Language` object coordinates these components. 
It takes raw text and sends it through the pipeline, returning an **annotated document**. It also orchestrates training and serialization. + + ![Library architecture](../../images/architecture.svg) ### Container objects {#architecture-containers} diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index db8d0ee28..8157e2c07 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -392,9 +392,7 @@ loading models, the underlying functionality is entirely based on native Python packages. This allows your application to handle a model like any other package dependency. -For an example of an automated model training and build process, see -[this overview](/usage/training#example-training-spacy) of how we're training -and packaging our models for spaCy. + ### Downloading and requiring model dependencies {#models-download} diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index e9ba0de6a..ac6b275d8 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -711,67 +711,4 @@ class and call [`from_disk`](/api/language#from_disk) instead. nlp = spacy.blank("en").from_disk("/path/to/data") ``` - - -In spaCy 1.x, the distinction between `spacy.load()` and the `Language` class -constructor was quite unclear. You could call `spacy.load()` when no model was -present, and it would silently return an empty object. Likewise, you could pass -a path to `English`, even if the mode required a different language. spaCy v2.0 -solves this with a clear distinction between setting up the instance and loading -the data. - -```diff -- nlp = spacy.load("en_core_web_sm", path="/path/to/data") -+ nlp = spacy.blank("en_core_web_sm").from_disk("/path/to/data") -``` - - - -### How we're training and packaging models for spaCy {#example-training-spacy} - -Publishing a new version of spaCy often means re-training all available models, -which is [quite a lot](/usage/models#languages). To make this run smoothly, -we're using an automated build process and a [`spacy train`](/api/cli#train) -template that looks like this: - -```bash -$ python -m spacy train {lang} {models_dir}/{name} {train_data} {dev_data} -m meta/{name}.json -V {version} -g {gpu_id} -n {n_epoch} -ns {n_sents} -``` - -> #### meta.json template -> -> ```json -> { -> "lang": "en", -> "name": "core_web_sm", -> "license": "CC BY-SA 3.0", -> "author": "Explosion AI", -> "url": "https://explosion.ai", -> "email": "contact@explosion.ai", -> "sources": ["OntoNotes 5", "Common Crawl"], -> "description": "English multi-task CNN trained on OntoNotes, with GloVe vectors trained on common crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities." -> } -> ``` - -In a directory `meta`, we keep `meta.json` templates for the individual models, -containing all relevant information that doesn't change across versions, like -the name, description, author info and training data sources. When we train the -model, we pass in the file to the meta template as the `--meta` argument, and -specify the current model version as the `--version` argument. - -On each epoch, the model is saved out with a `meta.json` using our template and -added properties, like the `pipeline`, `accuracy` scores and the `spacy_version` -used to train the model. After training completion, the best model is selected -automatically and packaged using the [`package`](/api/cli#package) command. 
-Since a full meta file is already present on the trained model, no further setup -is required to build a valid model package. - -```bash -python -m spacy package -f {best_model} dist/ -cd dist/{model_name} -python setup.py sdist -``` - -This process allows us to quickly trigger the model training and build process -for all available models and languages, and generate the correct meta data -automatically. + diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 7ca309ea0..6fa0b3d8e 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -6,6 +6,7 @@ menu: - ['CLI & Config', 'cli-config'] - ['Custom Models', 'custom-models'] - ['Transfer Learning', 'transfer-learning'] + - ['Parallel Training', 'parallel-training'] - ['Internal API', 'api'] --- @@ -43,6 +44,10 @@ The recommended way to train your spaCy models is via the +### Training data format {#data-format} + + + > #### Tip: Debug your data > > The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate @@ -167,6 +172,10 @@ dropout = null +## Parallel Training with Ray {#parallel-training} + + + ## Internal training API {#api} diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 165c02a29..9a0d0fb05 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -68,7 +68,8 @@ { "text": "Token", "url": "/api/token" }, { "text": "Span", "url": "/api/span" }, { "text": "Lexeme", "url": "/api/lexeme" }, - { "text": "Example", "url": "/api/example" } + { "text": "Example", "url": "/api/example" }, + { "text": "DocBin", "url": "/api/docbin" } ] }, { @@ -86,6 +87,7 @@ { "text": "PhraseMatcher", "url": "/api/phrasematcher" }, { "text": "EntityRuler", "url": "/api/entityruler" }, { "text": "Sentencizer", "url": "/api/sentencizer" }, + { "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" }, { "text": "Other Functions", "url": "/api/pipeline-functions" } ] }, @@ -97,10 +99,8 @@ { "text": "Vectors", "url": "/api/vectors" }, { "text": "Lookups", "url": "/api/lookups" }, { "text": "KnowledgeBase", "url": "/api/kb" }, - { "text": "GoldParse", "url": "/api/goldparse" }, - { "text": "GoldCorpus", "url": "/api/goldcorpus" }, { "text": "Scorer", "url": "/api/scorer" }, - { "text": "DocBin", "url": "/api/docbin" } + { "text": "Corpus", "url": "/api/corpus" } ] }, { diff --git a/website/src/components/code.js b/website/src/components/code.js index 5184da833..2c1ad32d8 100644 --- a/website/src/components/code.js +++ b/website/src/components/code.js @@ -83,12 +83,13 @@ export class Code extends React.Component { executable, github, prompt, + wrap, highlight, className, children, } = this.props const codeClassNames = classNames(classes.code, className, `language-${lang}`, { - [classes.wrap]: !!highlight, + [classes.wrap]: !!highlight || !!wrap, }) const ghClassNames = classNames(codeClassNames, classes.maxHeight) const { Juniper } = this.state diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index dd4e10f01..237567eb8 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -83,7 +83,7 @@ const QuickstartInstall = ({ id, title }) => ( export PYTHONPATH=`pwd` - set PYTHONPATH=/path/to/spaCy + set PYTHONPATH=C:\path\to\spaCy pip install -r requirements.txt From 99aff16d6040d5f9f7475ab85e03b0a3ed504334 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 4 Jul 2020 14:23:32 +0200 Subject: [PATCH 05/10] Make argument shortcut consistent --- 
spacy/cli/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index af6ef147c..4e3975bda 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -48,7 +48,7 @@ def convert_cli( morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"), merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"), converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), - ner_map: Optional[Path] = Opt(None, "--ner-map", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), + ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), # fmt: on ): From abd173937ff0b7754fda954abfff29e5e0fc5242 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 4 Jul 2020 14:23:44 +0200 Subject: [PATCH 06/10] Auto-format and update URL --- spacy/gold/corpus.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 64f38d21c..427c00caa 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -8,7 +8,7 @@ class Corpus: """An annotated corpus, reading train and dev datasets from the DocBin (.spacy) format. - DOCS: https://spacy.io/api/goldcorpus + DOCS: https://spacy.io/api/corpus """ def __init__(self, train_loc, dev_loc, limit=0): @@ -49,16 +49,13 @@ class Corpus: Doc( nlp.vocab, words=[word.text for word in reference], - spaces=[bool(word.whitespace_) for word in reference] + spaces=[bool(word.whitespace_) for word in reference], ), - reference + reference, ) else: - return Example( - nlp.make_doc(reference.text), - reference - ) - + return Example(nlp.make_doc(reference.text), reference) + def make_examples(self, nlp, reference_docs, max_length=0): for reference in reference_docs: if len(reference) == 0: @@ -71,7 +68,6 @@ class Corpus: continue elif max_length == 0 or len(ref_sent) < max_length: yield self._make_example(nlp, ref_sent.as_doc(), False) - def make_examples_gold_preproc(self, nlp, reference_docs): for reference in reference_docs: @@ -111,8 +107,9 @@ class Corpus: i += 1 return n - def train_dataset(self, nlp, *, shuffle=True, gold_preproc=False, - max_length=0, **kwargs): + def train_dataset( + self, nlp, *, shuffle=True, gold_preproc=False, max_length=0, **kwargs + ): ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) if gold_preproc: examples = self.make_examples_gold_preproc(nlp, ref_docs) From 2d9ca0cd8bd465b81b18e9768893a9ac5b3756ce Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 4 Jul 2020 14:39:34 +0200 Subject: [PATCH 07/10] Make Thinc version consistent --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 792eabfa1..43a74d97e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -40,7 +40,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a11,<8.0.0a20 + thinc>=8.0.0a12,<8.0.0a20 blis>=0.4.0,<0.5.0 wasabi>=0.7.0,<1.1.0 srsly>=2.1.0,<3.0.0 From 4498dfe99da54638137dd576de375e0876408880 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 4 Jul 2020 16:25:30 +0200 Subject: [PATCH 08/10] Update docs --- website/docs/api/language.md | 24 +++++------- website/docs/api/top-level.md | 71 
+++-------------------------------- 2 files changed, 15 insertions(+), 80 deletions(-) diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 9413ef486..e835168b7 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -296,15 +296,13 @@ component function. Disable one or more pipeline components. If used as a context manager, the pipeline will be restored to the initial state at the end of the block. Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method -you can use to undo your changes. - -You can specify either `disable` (as a list or string), or `enable`. In the -latter case, all components not in the `enable` list, will be disabled. +you can use to undo your changes. You can specify either `disable` (as a list or +string), or `enable`. In the latter case, all components not in the `enable` +list, will be disabled. > #### Example > > ```python -> # New API as of v3.0 > with nlp.select_pipes(disable=["tagger", "parser"]): > nlp.begin_training() > @@ -316,15 +314,7 @@ latter case, all components not in the `enable` list, will be disabled. > disabled.restore() > ``` -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------------------ | -| `disable` | list | Names of pipeline components to disable. | -| `disable` | str | Name of pipeline component to disable. | -| `enable` | list | Names of pipeline components that will not be disabled. | -| `enable` | str | Name of pipeline component that will not be disabled. | -| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | - - + As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: @@ -335,6 +325,12 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: +| Name | Type | Description | +| ----------- | --------------- | ------------------------------------------------------------------------------------ | +| `disable` | str / list | Name(s) of pipeline components to disable. | +| `enable` | str / list | Names(s) of pipeline components that will not be disabled. | +| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | + ## Language.to_disk {#to_disk tag="method" new="2"} Save the current state to a directory. If a model is loaded, this will **include diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index fe0952c9f..d0dda9d71 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -698,72 +698,11 @@ vary on each step. > nlp.update(texts, annotations) > ``` -| Name | Type | Description | -| ---------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `items` | iterable | The items to batch up. | -| `size` | int / iterable | The batch size(s). Use [`util.compounding`](/api/top-level#util.compounding) or [`util.decaying`](/api/top-level#util.decaying) or for an infinite series of compounding or decaying values. | -| **YIELDS** | list | The batches. | - -### util.compounding {#util.compounding tag="function" new="2"} - -Yield an infinite series of compounding values. Each time the generator is -called, a value is produced by multiplying the previous value by the compound -rate. 
- -> #### Example -> -> ```python -> sizes = compounding(1., 10., 1.5) -> assert next(sizes) == 1. -> assert next(sizes) == 1. * 1.5 -> assert next(sizes) == 1.5 * 1.5 -> ``` - -| Name | Type | Description | -| ---------- | ----------- | ----------------------- | -| `start` | int / float | The first value. | -| `stop` | int / float | The maximum value. | -| `compound` | int / float | The compounding factor. | -| **YIELDS** | int | Compounding values. | - -### util.decaying {#util.decaying tag="function" new="2"} - -Yield an infinite series of linearly decaying values. - -> #### Example -> -> ```python -> sizes = decaying(10., 1., 0.001) -> assert next(sizes) == 10. -> assert next(sizes) == 10. - 0.001 -> assert next(sizes) == 9.999 - 0.001 -> ``` - -| Name | Type | Description | -| ---------- | ----------- | -------------------- | -| `start` | int / float | The first value. | -| `end` | int / float | The maximum value. | -| `decay` | int / float | The decaying factor. | -| **YIELDS** | int | The decaying values. | - -### util.itershuffle {#util.itershuffle tag="function" new="2"} - -Shuffle an iterator. This works by holding `bufsize` items back and yielding -them sometime later. Obviously, this is not unbiased – but should be good enough -for batching. Larger `bufsize` means less bias. - -> #### Example -> -> ```python -> values = range(1000) -> shuffled = itershuffle(values) -> ``` - -| Name | Type | Description | -| ---------- | -------- | ----------------------------------- | -| `iterable` | iterable | Iterator to shuffle. | -| `bufsize` | int | Items to hold back (default: 1000). | -| **YIELDS** | iterable | The shuffled iterator. | +| Name | Type | Description | +| ---------- | -------------- | ---------------------- | +| `items` | iterable | The items to batch up. | +| `size` | int / iterable | The batch size(s). | +| **YIELDS** | list | The batches. 
| ### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"} From 37c3bb35e283a20d8c0b43278f93812cac24f660 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 4 Jul 2020 16:25:34 +0200 Subject: [PATCH 09/10] Auto-format --- spacy/cli/debug_data.py | 4 +- spacy/cli/init_model.py | 4 +- spacy/cli/pretrain.py | 2 +- spacy/cli/train.py | 29 +++-- spacy/gold/converters/iob2docs.py | 2 +- spacy/gold/iob_utils.py | 2 +- spacy/ml/models/parser.py | 6 +- spacy/tests/doc/test_doc_api.py | 49 ++++---- spacy/tests/parser/test_ner.py | 1 + spacy/tests/parser/test_parse.py | 1 + spacy/tests/regression/test_issue4402.py | 144 +++++++++++------------ spacy/tests/test_cli.py | 24 ++-- spacy/tests/test_gold.py | 83 ++++++------- spacy/tokens/_serialize.py | 7 +- 14 files changed, 178 insertions(+), 180 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 6026c4b52..712bc7914 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -118,7 +118,9 @@ def debug_data( # Create all gold data here to avoid iterating over the train_dataset constantly gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True) - gold_train_unpreprocessed_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=False) + gold_train_unpreprocessed_data = _compile_gold( + train_dataset, pipeline, nlp, make_proj=False + ) gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True) train_texts = gold_train_data["texts"] diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index d0d876aed..5cfde43e0 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -229,7 +229,9 @@ def add_vectors( else: if vectors_loc: with msg.loading(f"Reading vectors from {vectors_loc}"): - vectors_data, vector_keys = read_vectors(msg, vectors_loc, truncate_vectors) + vectors_data, vector_keys = read_vectors( + msg, vectors_loc, truncate_vectors + ) msg.good(f"Loaded vectors from {vectors_loc}") else: vectors_data, vector_keys = (None, None) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index e58d2529d..5b021aabc 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -406,5 +406,5 @@ def verify_cli_args( if not config["nlp"]["vectors"]: msg.fail( "Must specify nlp.vectors if pretraining.objective.type is vectors", - exits=True + exits=True, ) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index c053e624c..92fd8c20a 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -202,11 +202,11 @@ def train( nlp.resume_training() else: msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") - train_examples = list(corpus.train_dataset( - nlp, - shuffle=False, - gold_preproc=training["gold_preproc"] - )) + train_examples = list( + corpus.train_dataset( + nlp, shuffle=False, gold_preproc=training["gold_preproc"] + ) + ) nlp.begin_training(lambda: train_examples) # Update tag map with provided mapping @@ -293,12 +293,14 @@ def train( def create_train_batches(nlp, corpus, cfg): max_epochs = cfg.get("max_epochs", 0) - train_examples = list(corpus.train_dataset( - nlp, - shuffle=True, - gold_preproc=cfg["gold_preproc"], - max_length=cfg["max_length"] - )) + train_examples = list( + corpus.train_dataset( + nlp, + shuffle=True, + gold_preproc=cfg["gold_preproc"], + max_length=cfg["max_length"], + ) + ) epoch = 0 while True: @@ -520,7 +522,10 @@ def setup_printer(training, nlp): ) ) data = ( - [info["epoch"], info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] + [info["epoch"], info["step"]] + + losses + + 
scores + + ["{0:.2f}".format(float(info["score"]))] ) msg.row(data, widths=table_widths, aligns=table_aligns) diff --git a/spacy/gold/converters/iob2docs.py b/spacy/gold/converters/iob2docs.py index 51321a470..c7e243397 100644 --- a/spacy/gold/converters/iob2docs.py +++ b/spacy/gold/converters/iob2docs.py @@ -59,6 +59,6 @@ def read_iob(raw_sents, vocab, n_sents): doc[i].is_sent_start = sent_start biluo = iob_to_biluo(iob) entities = tags_to_entities(biluo) - doc.ents = [Span(doc, start=s, end=e+1, label=L) for (L, s, e) in entities] + doc.ents = [Span(doc, start=s, end=e + 1, label=L) for (L, s, e) in entities] docs.append(doc) return docs diff --git a/spacy/gold/iob_utils.py b/spacy/gold/iob_utils.py index cd606fecf..08751cfd4 100644 --- a/spacy/gold/iob_utils.py +++ b/spacy/gold/iob_utils.py @@ -92,7 +92,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): # Handle entity cases for start_char, end_char, label in entities: if not label: - for s in starts: # account for many-to-one + for s in starts: # account for many-to-one if s >= start_char and s < end_char: biluo[starts[s]] = "O" else: diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index d436b1cf6..c1e530d4a 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -17,11 +17,7 @@ def build_tb_parser_model( nO=None, ): t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - tok2vec = chain( - tok2vec, - list2array(), - Linear(hidden_width, t2v_width), - ) + tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),) tok2vec.set_dim("nO", hidden_width) lower = PrecomputableAffine( diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 38e6114de..e2b6adf43 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -179,22 +179,9 @@ def test_doc_api_right_edge(en_tokenizer): doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) assert doc[6].text == "for" subtree = [w.text for w in doc[6].subtree] - assert subtree == [ - "for", - "the", - "sake", - "of", - "such", - "as", - "live", - "under", - "the", - "government", - "of", - "the", - "Romans", - ",", - ] + # fmt: off + assert subtree == ["for", "the", "sake", "of", "such", "as", "live", "under", "the", "government", "of", "the", "Romans", ","] + # fmt: on assert doc[6].right_edge.text == "," @@ -307,9 +294,14 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): en_texts = ["Merging the docs is fun.", "They don't think alike."] de_text = "Wie war die Frage?" en_docs = [en_tokenizer(text) for text in en_texts] - docs_idx = en_texts[0].index('docs') + docs_idx = en_texts[0].index("docs") de_doc = de_tokenizer(de_text) - en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (True, None, None, None) + en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = ( + True, + None, + None, + None, + ) assert Doc.from_docs([]) is None @@ -323,15 +315,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert len(en_docs) == len(list(m_doc.sents)) assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) assert str(m_doc) == " ".join(en_texts) - p_token = m_doc[len(en_docs[0])-1] + p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." 
and bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) - think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think') + think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think") assert m_doc[9].idx == think_idx with pytest.raises(AttributeError): - not_available = m_doc[2]._.is_ambiguous # not callable, because it was not set via set_extension - assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there + # not callable, because it was not set via set_extension + m_doc[2]._.is_ambiguous + assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there m_doc = Doc.from_docs(en_docs, ensure_whitespace=False) assert len(en_docs) == len(list(m_doc.sents)) @@ -341,19 +334,21 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert p_token.text == "." and not bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) - think_idx = len(en_texts[0]) + 0 + en_texts[1].index('think') + think_idx = len(en_texts[0]) + 0 + en_texts[1].index("think") assert m_doc[9].idx == think_idx - m_doc = Doc.from_docs(en_docs, attrs=['lemma', 'length', 'pos']) - with pytest.raises(ValueError): # important attributes from sentenziser or parser are missing + m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"]) + with pytest.raises(ValueError): + # important attributes from sentenziser or parser are missing assert list(m_doc.sents) assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) - assert str(m_doc) == " ".join(en_texts) # space delimiter considered, although spacy attribute was missing + # space delimiter considered, although spacy attribute was missing + assert str(m_doc) == " ".join(en_texts) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) - think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think') + think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think") assert m_doc[9].idx == think_idx diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 6528a4223..81484c083 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -118,6 +118,7 @@ def test_oracle_moves_missing_B(en_vocab): moves.add_action(move_types.index("U"), label) moves.get_oracle_sequence(example) + # We can't easily represent this on a Doc object. Not sure what the best solution # would be, but I don't think it's an important use case? @pytest.mark.xfail(reason="No longer supported") diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index f13b7e847..c54088f56 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -91,6 +91,7 @@ def test_parser_merge_pp(en_tokenizer): assert doc[2].text == "another phrase" assert doc[3].text == "occurs" + # We removed the step_through API a while ago. 
we should bring it back though @pytest.mark.xfail(reason="Unsupported") def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py index fc05444d5..9c596aaf6 100644 --- a/spacy/tests/regression/test_issue4402.py +++ b/spacy/tests/regression/test_issue4402.py @@ -8,10 +8,11 @@ from ...tokens import DocBin def test_issue4402(): nlp = English() + attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] with make_tempdir() as tmpdir: output_file = tmpdir / "test4402.spacy" docs = json2docs([json_data]) - data = DocBin(docs=docs, attrs =["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes() + data = DocBin(docs=docs, attrs=attrs).to_bytes() with output_file.open("wb") as file_: file_.write(data) corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) @@ -25,74 +26,73 @@ def test_issue4402(): assert len(split_train_data) == 4 -json_data =\ - { - "id": 0, - "paragraphs": [ - { - "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "How", "ner": "O"}, - {"id": 1, "orth": "should", "ner": "O"}, - {"id": 2, "orth": "I", "ner": "O"}, - {"id": 3, "orth": "cook", "ner": "O"}, - {"id": 4, "orth": "bacon", "ner": "O"}, - {"id": 5, "orth": "in", "ner": "O"}, - {"id": 6, "orth": "an", "ner": "O"}, - {"id": 7, "orth": "oven", "ner": "O"}, - {"id": 8, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - { - "tokens": [ - {"id": 9, "orth": "\n", "ner": "O"}, - {"id": 10, "orth": "I", "ner": "O"}, - {"id": 11, "orth": "'ve", "ner": "O"}, - {"id": 12, "orth": "heard", "ner": "O"}, - {"id": 13, "orth": "of", "ner": "O"}, - {"id": 14, "orth": "people", "ner": "O"}, - {"id": 15, "orth": "cooking", "ner": "O"}, - {"id": 16, "orth": "bacon", "ner": "O"}, - {"id": 17, "orth": "in", "ner": "O"}, - {"id": 18, "orth": "an", "ner": "O"}, - {"id": 19, "orth": "oven", "ner": "O"}, - {"id": 20, "orth": ".", "ner": "O"}, - ], - "brackets": [], - }, - ], - "cats": [ - {"label": "baking", "value": 1.0}, - {"label": "not_baking", "value": 0.0}, - ], - }, - { - "raw": "What is the difference between white and brown eggs?\n", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "What", "ner": "O"}, - {"id": 1, "orth": "is", "ner": "O"}, - {"id": 2, "orth": "the", "ner": "O"}, - {"id": 3, "orth": "difference", "ner": "O"}, - {"id": 4, "orth": "between", "ner": "O"}, - {"id": 5, "orth": "white", "ner": "O"}, - {"id": 6, "orth": "and", "ner": "O"}, - {"id": 7, "orth": "brown", "ner": "O"}, - {"id": 8, "orth": "eggs", "ner": "O"}, - {"id": 9, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, - ], - "cats": [ - {"label": "baking", "value": 0.0}, - {"label": "not_baking", "value": 1.0}, - ], - }, - ], - } +json_data = { + "id": 0, + "paragraphs": [ + { + "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "How", "ner": "O"}, + {"id": 1, "orth": "should", "ner": "O"}, + {"id": 2, "orth": "I", "ner": "O"}, + {"id": 3, "orth": "cook", "ner": "O"}, + {"id": 4, "orth": "bacon", "ner": "O"}, + {"id": 5, "orth": "in", "ner": "O"}, + {"id": 6, "orth": "an", "ner": "O"}, + {"id": 7, "orth": "oven", "ner": "O"}, + {"id": 8, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + { + "tokens": [ + {"id": 9, "orth": "\n", "ner": "O"}, + {"id": 10, "orth": "I", 
"ner": "O"}, + {"id": 11, "orth": "'ve", "ner": "O"}, + {"id": 12, "orth": "heard", "ner": "O"}, + {"id": 13, "orth": "of", "ner": "O"}, + {"id": 14, "orth": "people", "ner": "O"}, + {"id": 15, "orth": "cooking", "ner": "O"}, + {"id": 16, "orth": "bacon", "ner": "O"}, + {"id": 17, "orth": "in", "ner": "O"}, + {"id": 18, "orth": "an", "ner": "O"}, + {"id": 19, "orth": "oven", "ner": "O"}, + {"id": 20, "orth": ".", "ner": "O"}, + ], + "brackets": [], + }, + ], + "cats": [ + {"label": "baking", "value": 1.0}, + {"label": "not_baking", "value": 0.0}, + ], + }, + { + "raw": "What is the difference between white and brown eggs?\n", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "What", "ner": "O"}, + {"id": 1, "orth": "is", "ner": "O"}, + {"id": 2, "orth": "the", "ner": "O"}, + {"id": 3, "orth": "difference", "ner": "O"}, + {"id": 4, "orth": "between", "ner": "O"}, + {"id": 5, "orth": "white", "ner": "O"}, + {"id": 6, "orth": "and", "ner": "O"}, + {"id": 7, "orth": "brown", "ner": "O"}, + {"id": 8, "orth": "eggs", "ner": "O"}, + {"id": 9, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, + ], + "cats": [ + {"label": "baking", "value": 0.0}, + {"label": "not_baking", "value": 1.0}, + ], + }, + ], +} diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index e8928f33a..35ca47268 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -28,7 +28,9 @@ def test_cli_converters_conllu2json(): assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"] assert [t["head"] for t in tokens] == [1, 2, -1, 0] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"] - ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] + ent_offsets = [ + (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] + ] biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "B-PER", "L-PER", "O"] @@ -54,7 +56,9 @@ def test_cli_converters_conllu2json(): ) def test_cli_converters_conllu2json_name_ner_map(lines): input_data = "\n".join(lines) - converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) + converted_docs = conllu2docs( + input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""} + ) assert len(converted_docs) == 1 converted = [docs_to_json(converted_docs)] assert converted[0]["id"] == 0 @@ -68,7 +72,9 @@ def test_cli_converters_conllu2json_name_ner_map(lines): assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] - ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] + ent_offsets = [ + (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] + ] biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"] @@ -115,7 +121,9 @@ def test_cli_converters_conllu2json_subtokens(): assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."] assert [t["head"] for t in tokens] == [1, 1, 0, -1] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"] - ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] + ent_offsets = [ + (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] + ] biluo_tags = 
biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "U-PER", "O", "O"] @@ -138,11 +146,11 @@ def test_cli_converters_iob2json(en_vocab): sent = converted["paragraphs"][0]["sentences"][i] assert len(sent["tokens"]) == 8 tokens = sent["tokens"] - # fmt: off - assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."] + expected = ["I", "like", "London", "and", "New", "York", "City", "."] + assert [t["orth"] for t in tokens] == expected assert len(converted_docs[0].ents) == 8 for ent in converted_docs[0].ents: - assert(ent.text in ["New York City", "London"]) + assert ent.text in ["New York City", "London"] def test_cli_converters_conll_ner2json(): @@ -210,7 +218,7 @@ def test_cli_converters_conll_ner2json(): # fmt: on assert len(converted_docs[0].ents) == 10 for ent in converted_docs[0].ents: - assert (ent.text in ["New York City", "London"]) + assert ent.text in ["New York City", "London"] def test_pretrain_make_docs(): diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 96acb8982..a5e11ea28 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -161,65 +161,54 @@ def test_example_from_dict_no_ner(en_vocab): ner_tags = example.get_aligned_ner() assert ner_tags == [None, None, None, None] + def test_example_from_dict_some_ner(en_vocab): words = ["a", "b", "c", "d"] spaces = [True, True, False, True] predicted = Doc(en_vocab, words=words, spaces=spaces) example = Example.from_dict( - predicted, - { - "words": words, - "entities": ["U-LOC", None, None, None] - } + predicted, {"words": words, "entities": ["U-LOC", None, None, None]} ) ner_tags = example.get_aligned_ner() assert ner_tags == ["U-LOC", None, None, None] def test_json2docs_no_ner(en_vocab): - data = [{ - "id":1, - "paragraphs":[ - { - "sentences":[ - { - "tokens":[ - { - "dep":"nn", - "head":1, - "tag":"NNP", - "orth":"Ms." - }, - { - "dep":"nsubj", - "head":1, - "tag":"NNP", - "orth":"Haag" - }, - { - "dep":"ROOT", - "head":0, - "tag":"VBZ", - "orth":"plays" - }, - { - "dep":"dobj", - "head":-1, - "tag":"NNP", - "orth":"Elianti" - }, - { - "dep":"punct", - "head":-2, - "tag":".", - "orth":"." 
- } + data = [ + { + "id": 1, + "paragraphs": [ + { + "sentences": [ + { + "tokens": [ + {"dep": "nn", "head": 1, "tag": "NNP", "orth": "Ms."}, + { + "dep": "nsubj", + "head": 1, + "tag": "NNP", + "orth": "Haag", + }, + { + "dep": "ROOT", + "head": 0, + "tag": "VBZ", + "orth": "plays", + }, + { + "dep": "dobj", + "head": -1, + "tag": "NNP", + "orth": "Elianti", + }, + {"dep": "punct", "head": -2, "tag": ".", "orth": "."}, + ] + } ] - } - ] - } - ] - }] + } + ], + } + ] docs = json2docs(data) assert len(docs) == 1 for doc in docs: diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index edc183e0d..f2374bdc6 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -8,8 +8,9 @@ from ..tokens import Doc from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors - +# fmt: off ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS") +# fmt: on class DocBin(object): @@ -86,9 +87,7 @@ class DocBin(object): assert array.shape[0] == spaces.shape[0] # this should never happen spaces = spaces.reshape((spaces.shape[0], 1)) self.spaces.append(numpy.asarray(spaces, dtype=bool)) - self.flags.append({ - "has_unknown_spaces": doc.has_unknown_spaces - }) + self.flags.append({"has_unknown_spaces": doc.has_unknown_spaces}) for token in doc: self.strings.add(token.text) self.strings.add(token.tag_) From dc8c9d912fa7496ac09139127dc59231108874d2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 4 Jul 2020 16:47:24 +0200 Subject: [PATCH 10/10] Update docs [ci skip] --- website/docs/api/top-level.md | 22 ---------------------- website/docs/usage/index.md | 7 +++---- website/docs/usage/linguistic-features.md | 2 ++ 3 files changed, 5 insertions(+), 26 deletions(-) diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index d0dda9d71..5f7130038 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -611,28 +611,6 @@ detecting the IPython kernel. Mainly used for the | ----------- | ---- | ------------------------------------- | | **RETURNS** | bool | `True` if in Jupyter, `False` if not. | -### util.update_exc {#util.update_exc tag="function"} - -Update, validate and overwrite -[tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). Used to -combine global exceptions with custom, language-specific exceptions. Will raise -an error if key doesn't match `ORTH` values. - -> #### Example -> -> ```python -> BASE = {"a.": [{ORTH: "a."}], ":)": [{ORTH: ":)"}]} -> NEW = {"a.": [{ORTH: "a.", NORM: "all"}]} -> exceptions = util.update_exc(BASE, NEW) -> # {"a.": [{ORTH: "a.", NORM: "all"}], ":)": [{ORTH: ":)"}]} -> ``` - -| Name | Type | Description | -| ----------------- | ----- | --------------------------------------------------------------- | -| `base_exceptions` | dict | Base tokenizer exceptions. | -| `*addition_dicts` | dicts | Exception dictionaries to add to the base exceptions, in order. | -| **RETURNS** | dict | Combined tokenizer exceptions. | - ### util.compile_prefix_regex {#util.compile_prefix_regex tag="function"} Compile a sequence of prefix rules into a regex object. diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 1e9473a5d..051d6a060 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -29,8 +29,7 @@ import QuickstartInstall from 'widgets/quickstart-install.js' ### pip {#pip} -Using pip, spaCy releases are available as source packages and binary wheels (as -of v2.0.13). 
+Using pip, spaCy releases are available as source packages and binary wheels. ```bash $ pip install -U spacy @@ -50,8 +49,8 @@ $ pip install -U spacy -To install additional data tables for lemmatization in **spaCy v2.2+** you can -run `pip install spacy[lookups]` or install +To install additional data tables for lemmatization you can run +`pip install spacy[lookups]` or install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) separately. The lookups package is needed to create blank models with lemmatization data, and to lemmatize in languages that don't yet come with diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index e3d83c296..faa6dc850 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1353,6 +1353,8 @@ print("After:", [(token.text, token._.is_musician) for token in doc]) ## Sentence Segmentation {#sbd} + + A [`Doc`](/api/doc) object's sentences are available via the `Doc.sents` property. Unlike other libraries, spaCy uses the dependency parse to determine sentence boundaries. This is usually more accurate than a rule-based approach,