// -----------------------------------------------------------------------------
// Module overview
//
// React module whose default export, `Models`, renders one `Model` for every
// entry of `pageContext.meta.models`. `Models` downloads `compatibility.json`
// and each `Model` downloads its own `meta/<name>-<version>.json`, both from
// `https://raw.githubusercontent.com/${repo}/master`.
//
// NOTE(review): this text looks like it went through a markup-stripping step.
// Several expressions have no visible operand (`const Help = ({ children }) =>
// ( )`, `{meta.description && }`, `return ( {filename} )`), one single-quoted
// string in MODEL_META is split across two physical lines, and many
// definitions share one physical line, so each `// scroll position for
// progress` comment now extends to the end of its physical line. Confirm
// against the version-controlled file before changing any code here.
//
// Constants
//   COMPONENT_LINKS     component name -> '/api/...' path. Not referenced in
//                       the visible text; presumably consumed by markup inside
//                       linkComponents — TODO confirm.
//   MODEL_META          key -> human-readable description. Read by
//                       formatAccuracy (`MODEL_META[label]`) and by the `rows`
//                       table in Model (type, genre and the `help` texts).
//   LABEL_SCHEME_META   pipe name -> help text, read as
//                       `LABEL_SCHEME_META[pipe]` in Model.
//   MARKDOWN_COMPONENTS `{ code: InlineCode }`. Not referenced in the visible
//                       text.
//
// Helpers
//   getModelComponents(name)
//       Splits `name` on '_' and returns the first four parts as
//       `{ lang, type, genre, size }`.
//   isStableVersion(v)
//       True when `v` contains none of the substrings 'a', 'b', 'dev', 'rc'.
//       These are plain substring tests, so any 'a' or 'b' anywhere in the
//       string makes the result false.
//   getLatestVersion(modelId, compatibility, prereleases)
//       Walks `Object.entries(compatibility)` in iteration order. For each key
//       that passes isStableVersion and has an entry for `modelId`, returns
//       the first listed model version that is stable, or simply the first
//       listed one when `prereleases` is truthy. If no listed version
//       qualifies it moves on to the next key; returns undefined when nothing
//       matches.
//       NOTE(review): the result is only the "latest" version if both the
//       keys of `compatibility` and each version list are ordered newest
//       first — verify against the data source.
//   formatVectors(data)
//       Returns 'n/a' for falsy `data` and 'context vectors only' when every
//       value in `data` is 0. Otherwise reads `keys`, `vectors` and `width`:
//       when `keys >= 0` the string lists keys and unique vectors, else it
//       describes floret vectors.
//   formatAccuracy(data, lang)
//       Returns [] for falsy `data`. Otherwise returns one
//       `{ label, value, help }` per key of `data`, with `value` formatted by
//       `toFixed(2)` and `help` taken from MODEL_META. Keys whose value makes
//       `isNaN` true are dropped, as is 'speed' (and 'morph_acc' when `lang`
//       is 'ja').
//       NOTE(review): the global `isNaN` coerces its argument, so a value such
//       as null or a numeric string is not filtered out and would then fail
//       on `.toFixed` — confirm the values are always numbers.
//   formatDownloadLink(lang, name, version)
//       Builds `fullName`, the wheel `filename` and a GitHub release `url`.
//       In the visible text only `filename` appears in the returned
//       expression; `url` is presumably used by markup that is missing here.
//   formatModelMeta(data)
//       Copies the fields of the fetched meta JSON into the object shape that
//       Model reads (`fullName`, `version`, `sizeFull`, `pipeline`,
//       `components`, `notes`, `description`, `sources`, `author`, `url`,
//       `license`). `labels` becomes null when `isEmptyObj(data.labels)` is
//       truthy; `vectors`, `accuracy` and `download_link` come from the
//       formatters above.
//   formatSources(data = [])
//       Turns string entries into `{ name }` objects, then maps every entry to
//       renderable content built from `name`, `url` and `author`.
//   linkComponents(components = [])
//       Passes the mapped component names to `join` (imported from
//       ../components/util).
//
// Components
//   Help({ children })
//       The body is empty in the visible text.
//   Model({ name, langId, langName, baseUrl, repo, compatibility, hasExamples,
//           licenses, prereleases })
//       State: `initialized` (false), `isError` (true), `meta` ({}).
//       `version` is memoized from getLatestVersion. `display_type` is
//       'core_no_vectors' when `type` is 'core' and `size` is 'sm' or 'trf',
//       otherwise `type`.
//       The effect dispatches a 'resize' Event on `window` every time it runs.
//       While `initialized` is false and a `version` is available it clears
//       `isError`, fetches `${baseUrl}/meta/${name}-${version}.json`, stores
//       `formatModelMeta(json)` in `meta`, and sets `isError` again (and logs)
//       if the promise chain rejects. `initialized` is set to true right after
//       the fetch is started and is never reset, so a mounted Model loads its
//       metadata at most once.
//       Because `isError` starts as true, `{isError && error}` is rendered
//       until a version has been resolved.
//       `hasInteractiveCode` is `size === 'sm' && hasExamples && !isError`.
//       Entries of `rows` with neither `tag` nor `content` render as null.
//       NOTE(review): `res.json()` is called without looking at `res.ok`.
//   Models({ pageContext, repo, children })   (default export)
//       Reads `id`, `title` and `meta` from `pageContext`. On first run of the
//       effect it fetches `${baseUrl}/compatibility.json` and stores the
//       `spacy` property of the parsed JSON in `compatibility`; failures are
//       only logged. Renders one Model per name in `meta.models`, followed by
//       `children`. `licenses` is rebuilt with `arrayToObj(languages.licenses,
//       'id')` for each Model, and `prereleases` comes from
//       `siteMetadata.nightly`.
// -----------------------------------------------------------------------------
import React, { useEffect, useState, useMemo, Fragment } from 'react' import { window } from 'browser-monads' import Title from '../components/title' import Section from '../components/section' import Button from '../components/button' import Aside from '../components/aside' import { InlineCode } from '../components/inlineCode' import CodeBlock from '../components/codeBlock' import { Table, Tr, Td, Th } from '../components/table' import Tag from '../components/tag' import { H2, Label } from '../components/typography' import Icon from '../components/icon' import Link, { OptionalLink } from '../components/link' import Infobox from '../components/infobox' import Accordion from '../components/accordion' import { isString, isEmptyObj, join, arrayToObj, abbrNum } from '../components/util' import MarkdownToReact from '../components/markdownToReactDynamic' import siteMetadata from '../../meta/site.json' import languages from '../../meta/languages.json' const COMPONENT_LINKS = { tok2vec: '/api/tok2vec', transformer: '/api/transformer', tagger: '/api/tagger', parser: '/api/dependencyparser', ner: '/api/entityrecognizer', lemmatizer: '/api/lemmatizer', attribute_ruler: '/api/attributeruler', senter: '/api/sentencerecognizer', morphologizer: '/api/morphologizer', } const MODEL_META = { core: 'Vocabulary, syntax, entities, vectors', core_no_vectors: 'Vocabulary, syntax, entities', dep: 'Vocabulary, syntax', ent: 'Named entities', sent: 'Sentence boundaries', pytt: 'PyTorch Transformers', trf: 'Transformers', vectors: 'Word vectors', web: 'written text (blogs, news, comments)', news: 'written text (news, media)', wiki: 'Wikipedia', uas: 'Unlabeled dependencies', las: 'Labeled dependencies', dep_uas: 'Unlabeled dependencies', dep_las: 'Labeled dependencies', token_acc: 'Tokenization', tok: 'Tokenization', lemma: 'Lemmatization', morph: 'Morphological analysis', lemma_acc: 'Lemmatization', morph_acc: 'Morphological analysis', tags_acc: 'Part-of-speech tags (fine grained tags, 
Token.tag)', tag_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', tag: 'Part-of-speech tags (fine grained tags, Token.tag)', pos: 'Part-of-speech tags (coarse grained tags, Token.pos)', pos_acc: 'Part-of-speech tags (coarse grained tags, Token.pos)', ents_f: 'Named entities (F-score)', ents_p: 'Named entities (precision)', ents_r: 'Named entities (recall)', ner_f: 'Named entities (F-score)', ner_p: 'Named entities (precision)', ner_r: 'Named entities (recall)', sents_f: 'Sentence segmentation (F-score)', sents_p: 'Sentence segmentation (precision)', sents_r: 'Sentence segmentation (recall)', cpu: 'words per second on CPU', gpu: 'words per second on GPU', pipeline: 'Active processing pipeline components in order', components: 'All processing pipeline components (including disabled components)', sources: 'Sources of training data', vecs: 'Word vectors included in the package. Packages that only support context vectors compute similarity via the tensors shared with the pipeline.', benchmark_parser: 'Syntax accuracy', benchmark_ner: 'NER accuracy', benchmark_speed: 'Speed', compat: 'Latest compatible package version for your spaCy installation', download_link: 'Download link for the pipeline', } const LABEL_SCHEME_META = { tagger: 'Part-of-speech tags via Token.tag_', parser: 'Dependency labels via Token.dep_', ner: 'Named entity labels', } const MARKDOWN_COMPONENTS = { code: InlineCode, } function getModelComponents(name) { const [lang, type, genre, size] = name.split('_') return { lang, type, genre, size } } function isStableVersion(v) { return !v.includes('a') && !v.includes('b') && !v.includes('dev') && !v.includes('rc') } function getLatestVersion(modelId, compatibility, prereleases) { for (let [version, models] of Object.entries(compatibility)) { if (isStableVersion(version) && models[modelId]) { const modelVersions = models[modelId] for (let modelVersion of modelVersions) { if (isStableVersion(modelVersion) || prereleases) { return modelVersion } } } } 
} function formatVectors(data) { if (!data) return 'n/a' if (Object.values(data).every((n) => n === 0)) return 'context vectors only' const { keys, vectors, width } = data if (keys >= 0) { return `${abbrNum(keys)} keys, ${abbrNum(vectors)} unique vectors (${width} dimensions)` } else { return `${abbrNum(vectors)} floret vectors (${width} dimensions)` } } function formatAccuracy(data, lang) { const exclude = lang !== 'ja' ? ['speed'] : ['speed', 'morph_acc'] if (!data) return [] return Object.keys(data) .map((label) => { const value = data[label] return isNaN(value) || exclude.includes(label) ? null : { label, value: value.toFixed(2), help: MODEL_META[label], } }) .filter((item) => item) } function formatDownloadLink(lang, name, version) { const fullName = `${lang}_${name}-${version}` const filename = `${fullName}-py3-none-any.whl` const url = `https://github.com/explosion/spacy-models/releases/download/${fullName}/${filename}` return ( {filename} ) } function formatModelMeta(data) { return { fullName: `${data.lang}_${data.name}-${data.version}`, version: data.version, sizeFull: data.size, pipeline: data.pipeline, components: data.components, notes: data.notes, description: data.description, sources: data.sources, author: data.author, url: data.url, license: data.license, labels: isEmptyObj(data.labels) ? null : data.labels, vectors: formatVectors(data.vectors), accuracy: formatAccuracy(data.performance, data.lang), download_link: formatDownloadLink(data.lang, data.name, data.version), } } function formatSources(data = []) { const sources = data.map((s) => (isString(s) ? { name: s } : s)) return sources.map(({ name, url, author }, i) => ( {i > 0 &&
} {name && url ? {name} : name} {author && ` (${author})`}
)) } function linkComponents(components = []) { return join( components.map((c) => ( {c} )) ) } const Help = ({ children }) => ( ) const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExamples, licenses, prereleases, }) => { const [initialized, setInitialized] = useState(false) const [isError, setIsError] = useState(true) const [meta, setMeta] = useState({}) const { type, genre, size } = getModelComponents(name) const display_type = type === 'core' && (size === 'sm' || size === 'trf') ? 'core_no_vectors' : type const version = useMemo( () => getLatestVersion(name, compatibility, prereleases), [name, compatibility, prereleases] ) useEffect(() => { window.dispatchEvent(new Event('resize')) // scroll position for progress if (!initialized && version) { setIsError(false) fetch(`${baseUrl}/meta/${name}-${version}.json`) .then((res) => res.json()) .then((json) => { setMeta(formatModelMeta(json)) }) .catch((err) => { setIsError(true) console.error(err) }) setInitialized(true) } }, [initialized, version, baseUrl, name]) const releaseTag = meta.fullName ? `tag/${meta.fullName}` : '' const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}` const pipeline = linkComponents(meta.pipeline) const components = linkComponents(meta.components) const sources = formatSources(meta.sources) const author = !meta.url ? meta.author : {meta.author} const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null const license = licenseUrl ? 
{meta.license} : meta.license const hasInteractiveCode = size === 'sm' && hasExamples && !isError const labels = meta.labels const rows = [ { label: 'Language', tag: langId, content: langName }, { label: 'Type', tag: type, content: MODEL_META[display_type] }, { label: 'Genre', tag: genre, content: MODEL_META[genre] }, { label: 'Size', tag: size, content: meta.sizeFull }, { label: 'Components', content: components, help: MODEL_META.components }, { label: 'Pipeline', content: pipeline, help: MODEL_META.pipeline }, { label: 'Vectors', content: meta.vectors, help: MODEL_META.vecs }, { label: 'Download Link', content: meta.download_link, help: MODEL_META.download_link }, { label: 'Sources', content: sources, help: MODEL_META.sources }, { label: 'Author', content: author }, { label: 'License', content: license }, ] const error = (

To find out more about this model, see the overview of the{' '} latest model releases.

) return (

{version && (
Latest: {version}
)} } > {name}

{meta.description && } {isError && error} {rows.map(({ label, tag, help, content }, i) => !tag && !content ? null : ( ) )}
{tag && {tag}} {content}
{meta.notes && } {hasInteractiveCode && ( {[ `import spacy`, `from spacy.lang.${langId}.examples import sentences `, ``, `nlp = spacy.load("${name}")`, `doc = nlp(sentences[0])`, `print(doc.text)`, `for token in doc:`, ` print(token.text, token.pos_, token.dep_)`, ].join('\n')} )} {meta.accuracy && ( {meta.accuracy.map(({ label, value, help }) => ( ))}
{label.toUpperCase()} {help} {value}
)} {labels && (

The statistical components included in this model package assign the following labels. The labels are specific to the corpus that the model was trained on. To see the description of a label, you can use{' '} spacy.explain .

{Object.keys(labels).map((pipe) => { const labelNames = labels[pipe] || [] const help = LABEL_SCHEME_META[pipe] return ( ) })}
{labelNames.map((label, i) => ( {i > 0 && ', '} {label} ))}
)}
) } const Models = ({ pageContext, repo, children }) => { const [initialized, setInitialized] = useState(false) const [compatibility, setCompatibility] = useState({}) const { id, title, meta } = pageContext const { models } = meta const baseUrl = `https://raw.githubusercontent.com/${repo}/master` useEffect(() => { window.dispatchEvent(new Event('resize')) // scroll position for progress if (!initialized) { fetch(`${baseUrl}/compatibility.json`) .then((res) => res.json()) .then(({ spacy }) => setCompatibility(spacy)) .catch((err) => console.error(err)) setInitialized(true) } }, [initialized, baseUrl]) return ( <> {models.map((modelName) => ( <Model key={modelName} name={modelName} langId={id} langName={title} compatibility={compatibility} baseUrl={baseUrl} repo={repo} licenses={arrayToObj(languages.licenses, 'id')} hasExamples={meta.hasExamples} prereleases={siteMetadata.nightly} /> ))} {children} </> ) } export default Models