diff --git a/.github/contributors/jmargeta.md b/.github/contributors/jmargeta.md new file mode 100644 index 000000000..ab13424f8 --- /dev/null +++ b/.github/contributors/jmargeta.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. 
The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jan Margeta | +| Company name (if applicable) | KardioMe | +| Title or role (if applicable) | Founder | +| Date | 2020-10-16 | +| GitHub username | jmargeta | +| Website (optional) | kardio.me | diff --git a/netlify.toml b/netlify.toml index 3c17b876c..e860e4bf4 100644 --- a/netlify.toml +++ b/netlify.toml @@ -58,5 +58,7 @@ redirects = [ {from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true}, {from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true}, # Renamed universe projects - {from = "/universe/project/spacy-pytorch-transformers", to = "/universe/project/spacy-transformers", force = true} + {from = "/universe/project/spacy-pytorch-transformers", to = "/universe/project/spacy-transformers", force = true}, + # Old model pages + {from = "/models/en-starters", to = "/models/en", force = true}, ] diff --git a/spacy/cli/templates/quickstart_training_recommendations.yml b/spacy/cli/templates/quickstart_training_recommendations.yml index 54aec2e31..47b3abbf6 100644 --- a/spacy/cli/templates/quickstart_training_recommendations.yml +++ b/spacy/cli/templates/quickstart_training_recommendations.yml @@ -1,17 +1,20 @@ # Recommended settings and available resources for each language, if available. # Not all languages have recommended word vectors or transformers and for some, # the recommended transformer for efficiency and accuracy may be the same. 
-en: - word_vectors: en_vectors_web_lg +ar: + word_vectors: null transformer: efficiency: - name: roberta-base + name: asafaya/bert-base-arabic size_factor: 3 accuracy: - name: roberta-base + name: asafaya/bert-base-arabic size_factor: 3 +da: + word_vectors: da_core_news_lg + transformer: null de: - word_vectors: null + word_vectors: de_core_news_lg transformer: efficiency: name: bert-base-german-cased @@ -19,17 +22,26 @@ de: accuracy: name: bert-base-german-cased size_factor: 3 -fr: - word_vectors: null +el: + word_vectors: el_core_news_lg transformer: efficiency: - name: camembert-base + name: nlpaueb/bert-base-greek-uncased-v1 size_factor: 3 accuracy: - name: camembert-base + name: nlpaueb/bert-base-greek-uncased-v1 + size_factor: 3 +en: + word_vectors: en_core_web_lg + transformer: + efficiency: + name: roberta-base + size_factor: 3 + accuracy: + name: roberta-base size_factor: 3 es: - word_vectors: null + word_vectors: es_core_news_lg transformer: efficiency: name: dccuchile/bert-base-spanish-wwm-cased @@ -37,15 +49,6 @@ es: accuracy: name: dccuchile/bert-base-spanish-wwm-cased size_factor: 3 -sv: - word_vectors: null - transformer: - efficiency: - name: KB/bert-base-swedish-cased - size_factor: 3 - accuracy: - name: KB/bert-base-swedish-cased - size_factor: 3 fi: word_vectors: null transformer: @@ -55,14 +58,65 @@ fi: accuracy: name: TurkuNLP/bert-base-finnish-cased-v1 size_factor: 3 -el: +fr: + word_vectors: fr_core_news_lg + transformer: + efficiency: + name: camembert-base + size_factor: 3 + accuracy: + name: camembert-base + size_factor: 3 +it: + word_vectors: it_core_news_lg + transformer: null +ja: + word_vectors: ja_core_news_lg + transformer: null +lt: + word_vectors: lt_core_news_lg + transformer: null +nb: + word_vectors: nb_core_news_lg + transformer: null +nl: + word_vectors: nl_core_news_lg + transformer: + efficiency: + name: pdelobelle/robbert-v2-dutch-base + size_factor: 3 + accuracy: + name: pdelobelle/robbert-v2-dutch-base + size_factor: 3 +pl: + word_vectors: pl_core_news_lg + transformer: + efficiency: + name: dkleczek/bert-base-polish-cased-v1 + size_factor: 3 + accuracy: + name: dkleczek/bert-base-polish-cased-v1 + size_factor: 3 +pt: + word_vectors: pt_core_news_lg + transformer: + efficiency: + name: neuralmind/bert-base-portuguese-cased + size_factor: 3 + accuracy: + name: neuralmind/bert-base-portuguese-cased + size_factor: 3 +ro: + word_vectors: ro_core_news_lg + transformer: null +sv: word_vectors: null transformer: efficiency: - name: nlpaueb/bert-base-greek-uncased-v1 + name: KB/bert-base-swedish-cased size_factor: 3 accuracy: - name: nlpaueb/bert-base-greek-uncased-v1 + name: KB/bert-base-swedish-cased size_factor: 3 tr: word_vectors: null @@ -74,7 +128,7 @@ tr: name: dbmdz/bert-base-turkish-cased size_factor: 3 zh: - word_vectors: null + word_vectors: zh_core_web_lg transformer: efficiency: name: bert-base-chinese @@ -83,39 +137,3 @@ zh: accuracy: name: bert-base-chinese size_factor: 3 has_letters: false -ar: - word_vectors: null - transformer: - efficiency: - name: asafaya/bert-base-arabic - size_factor: 3 - accuracy: - name: asafaya/bert-base-arabic - size_factor: 3 -pl: - word_vectors: null - transformer: - efficiency: - name: dkleczek/bert-base-polish-cased-v1 - size_factor: 3 - accuracy: - name: dkleczek/bert-base-polish-cased-v1 - size_factor: 3 -nl: - word_vectors: null - transformer: - efficiency: - name: pdelobelle/robbert-v2-dutch-base - size_factor: 3 - accuracy: - name: pdelobelle/robbert-v2-dutch-base - size_factor: 3 -pt: - word_vectors: null 
- transformer: - efficiency: - name: neuralmind/bert-base-portuguese-cased - size_factor: 3 - accuracy: - name: neuralmind/bert-base-portuguese-cased - size_factor: 3 diff --git a/spacy/schemas.py b/spacy/schemas.py index f3664acff..9480596f0 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -257,7 +257,7 @@ class TokenPattern(BaseModel): class TokenPatternSchema(BaseModel): - pattern: List[TokenPattern] = Field(..., minItems=1) + pattern: List[TokenPattern] = Field(..., min_items=1) class Config: extra = "forbid" diff --git a/spacy/tests/regression/test_issue6258.py b/spacy/tests/regression/test_issue6258.py new file mode 100644 index 000000000..03b0b9373 --- /dev/null +++ b/spacy/tests/regression/test_issue6258.py @@ -0,0 +1,14 @@ +import pydantic +import pytest +from pydantic import ValidationError +from spacy.schemas import TokenPattern, TokenPatternSchema + + +def test_issue6258(): + """Test that the non-empty constraint on the pattern field is respected""" + # This one is valid + TokenPatternSchema(pattern=[TokenPattern()]) + # But an empty pattern list should fail to validate + # based on the schema's constraint + with pytest.raises(ValidationError): + TokenPatternSchema(pattern=[]) diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md index cf5b70af2..2a8733f41 100644 --- a/website/docs/usage/101/_vectors-similarity.md +++ b/website/docs/usage/101/_vectors-similarity.md @@ -68,8 +68,8 @@ representation consists of 300 dimensions of `0`, which means it's practically nonexistent. If your application will benefit from a **large vocabulary** with more vectors, you should consider using one of the larger pipeline packages or loading in a full vector package, for example, -[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), which includes -over **1 million unique vectors**. +[`en_core_web_lg`](/models/en#en_core_web_lg), which includes **685k unique +vectors**. spaCy is able to compare two objects, and make a prediction of **how similar they are**. Predicting similarity is useful for building recommendation systems diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 4077cf293..af07a438f 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1859,9 +1859,8 @@ pruning the vectors will be taken care of automatically if you set the `--prune` flag. You can also do it manually in the following steps: 1. Start with a **word vectors package** that covers a huge vocabulary. For - instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) - starter provides 300-dimensional GloVe vectors for over 1 million terms of - English. + instance, the [`en_core_web_lg`](/models/en#en_core_web_lg) package provides + 300-dimensional GloVe vectors for 685k terms of English. 2. If your vocabulary has values set for the `Lexeme.prob` attribute, the lexemes will be sorted by descending probability to determine which vectors to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`. @@ -1869,7 +1868,7 @@ flag. You can also do it manually in the following steps: vectors you want to keep. 
```python -nlp = spacy.load('en_vectors_web_lg') +nlp = spacy.load("en_core_web_lg") n_vectors = 105000 # number of vectors to keep removed_words = nlp.vocab.prune_vectors(n_vectors) diff --git a/website/docs/usage/v2-1.md b/website/docs/usage/v2-1.md index 4a8ef5a37..8d310f1a4 100644 --- a/website/docs/usage/v2-1.md +++ b/website/docs/usage/v2-1.md @@ -22,7 +22,7 @@ For more details and a behind-the-scenes look at the new release, > > ```bash > $ python -m spacy pretrain ./raw_text.jsonl -> en_vectors_web_lg ./pretrained-model +> en_core_web_lg ./pretrained-model > ``` spaCy v2.1 introduces a new CLI command, `spacy pretrain`, that can make your diff --git a/website/gatsby-node.js b/website/gatsby-node.js index 56a65aeae..b5d8c22c3 100644 --- a/website/gatsby-node.js +++ b/website/gatsby-node.js @@ -226,8 +226,6 @@ exports.createPages = ({ graphql, actions }) => { const langs = result.data.site.siteMetadata.languages const modelLangs = langs.filter(({ models }) => models && models.length) - const starterLangs = langs.filter(({ starters }) => starters && starters.length) - modelLangs.forEach(({ code, name, models, example, has_examples }, i) => { const slug = `/models/${code}` const next = i < modelLangs.length - 1 ? modelLangs[i + 1] : null @@ -247,28 +245,6 @@ exports.createPages = ({ graphql, actions }) => { }, }) }) - - starterLangs.forEach(({ code, name, starters }, i) => { - const slug = `/models/${code}-starters` - const next = i < starterLangs.length - 1 ? starterLangs[i + 1] : null - createPage({ - path: slug, - component: DEFAULT_TEMPLATE, - context: { - id: `${code}-starters`, - slug: slug, - isIndex: false, - title: name, - section: 'models', - sectionTitle: sections.models.title, - theme: sections.models.theme, - next: next - ? { title: next.name, slug: `/models/${next.code}-starters` } - : null, - meta: { models: starters, isStarters: true }, - }, - }) - }) }) ) }) diff --git a/website/src/templates/docs.js b/website/src/templates/docs.js index 7bb62fd21..8343a16a8 100644 --- a/website/src/templates/docs.js +++ b/website/src/templates/docs.js @@ -52,19 +52,6 @@ const Docs = ({ pageContext, children }) => ( id: model, })), })) - if (sidebar.items.length > 2) { - sidebar.items[2].items = languages - .filter(({ starters }) => starters && starters.length) - .map(lang => ({ - text: lang.name, - url: `/models/${lang.code}-starters`, - isActive: id === `${lang.code}-starters`, - menu: lang.starters.map(model => ({ - text: model, - id: model, - })), - })) - } } const sourcePath = source ? github(source) : null const currentSource = getCurrentSource(slug, isIndex) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 9c6f595da..b9658dacd 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -374,7 +374,7 @@ const Models = ({ pageContext, repo, children }) => { const [initialized, setInitialized] = useState(false) const [compatibility, setCompatibility] = useState({}) const { id, title, meta } = pageContext - const { models, isStarters } = meta + const { models } = meta const baseUrl = `https://raw.githubusercontent.com/${repo}/master` useEffect(() => { @@ -388,26 +388,9 @@ const Models = ({ pageContext, repo, children }) => { } }, [initialized, baseUrl]) - const modelTitle = title - const modelTeaser = `Available trained pipelines for ${title}` - const starterTitle = `${title} starters` - const starterTeaser = `Available transfer learning starter packs for ${title}` - return ( <> -
- Starter packs are pretrained weights you can initialize your models with to - achieve better accuracy, like word vectors (which will be used as features - during training). -
-
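A note on the reorganized `quickstart_training_recommendations.yml`: each top-level key is a language code mapping to a `word_vectors` entry (a package name or `null`) and either a `transformer` block with `efficiency` and `accuracy` variants or an explicit `transformer: null`. The sketch below shows the resulting data layout, assuming PyYAML; spaCy's quickstart CLI has its own loader, so this is illustrative only.

```python
# Hypothetical consumer of the recommendations file shown in the diff above;
# spaCy's quickstart CLI has its own loader, this only illustrates the layout.
import yaml  # assumes PyYAML is installed

with open("spacy/cli/templates/quickstart_training_recommendations.yml") as f:
    recommendations = yaml.safe_load(f)

# Languages without a recommended transformer carry an explicit null
assert recommendations["da"]["transformer"] is None

# Languages with one expose efficiency/accuracy variants
en = recommendations["en"]
print(en["word_vectors"])                       # en_core_web_lg
print(en["transformer"]["efficiency"]["name"])  # roberta-base
```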
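And on the `spacy/schemas.py` hunk: under pydantic v1 (the major version in use here), `Field()` only enforces its documented snake_case constraint keywords. An unrecognized keyword such as `minItems` is stored as extra JSON-schema metadata and never checked at validation time, which is how the empty-pattern case in issue 6258 slipped through. A minimal sketch of the difference, assuming pydantic v1 is installed; the `BrokenSchema`/`FixedSchema` names are illustrative, not from the codebase.

```python
from typing import List

from pydantic import BaseModel, Field, ValidationError


class BrokenSchema(BaseModel):
    # "minItems" is not a pydantic v1 keyword argument; it is kept only as
    # extra JSON-schema metadata, so the length constraint is never enforced
    pattern: List[int] = Field(..., minItems=1)


class FixedSchema(BaseModel):
    # "min_items" is the real pydantic v1 constraint, checked at runtime
    pattern: List[int] = Field(..., min_items=1)


BrokenSchema(pattern=[])  # validates silently, which was the bug
try:
    FixedSchema(pattern=[])
except ValidationError:
    print("empty pattern list rejected, as the schema intends")
```

The regression test in `spacy/tests/regression/test_issue6258.py` covers the same behavior against the real `TokenPatternSchema`.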