import React from 'react' import PropTypes from 'prop-types' import { StaticQuery, graphql } from 'gatsby' import { LandingHeader, LandingTitle, LandingSubtitle, LandingGrid } from '../components/landing' import { LandingCard, LandingButton, LandingDemo } from '../components/landing' import { LandingBannerGrid, LandingBanner, LandingLogos } from '../components/landing' import { H2 } from '../components/typography' import { Ul, Li } from '../components/list' import Button from '../components/button' import Link from '../components/link' import BenchmarksChoi from 'usage/_benchmarks-choi.md' const CODE_EXAMPLE = `# pip install spacy # python -m spacy download en_core_web_sm import spacy # Load English tokenizer, tagger, parser, NER and word vectors nlp = spacy.load("en_core_web_sm") # Process whole documents text = (u"When Sebastian Thrun started working on self-driving cars at " u"Google in 2007, few people outside of the company took him " u"seriously. “I can tell you very senior CEOs of major American " u"car companies would shake my hand and turn away because I wasn’t " u"worth talking to,” said Thrun, now the co-founder and CEO of " u"online higher education startup Udacity, in an interview with " u"Recode earlier this week.") doc = nlp(text) # Find named entities, phrases and concepts for entity in doc.ents: print(entity.text, entity.label_) # Determine semantic similarities doc1 = nlp(u"my fries were super gross") doc2 = nlp(u"such disgusting fries") similarity = doc1.similarity(doc2) print(doc1.text, doc2.text, similarity) ` /** * Compute the overall total counts of models and languages */ function getCounts(langs = []) { return { langs: langs.length, modelLangs: langs.filter(({ models }) => models && !!models.length).length, models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0), } } const Landing = ({ data }) => { const counts = getCounts(data.languages) return ( <> Industrial-Strength
Natural Language
Processing in Python

spaCy excels at large-scale information extraction tasks. It's written from the ground up in carefully memory-managed Cython. Independent research has confirmed that spaCy is the fastest in the world. If your application needs to process entire web dumps, spaCy is the library you want to be using.

Facts & Figures

spaCy is designed to help you do real work — to build real products, or gather real insights. The library respects your time, and tries to avoid wasting it. It's easy to install, and its API is simple and productive. We like to think of spaCy as the Ruby on Rails of Natural Language Processing.

Get started

spaCy is the best way to prepare text for deep learning. It interoperates seamlessly with TensorFlow, PyTorch, scikit-learn, Gensim and the rest of Python's awesome AI ecosystem. With spaCy, you can easily construct linguistically sophisticated statistical models for a variety of NLP problems.

Read more {CODE_EXAMPLE}

Features

Non-destructive tokenization
Named entity recognition
Support for {counts.langs}+ languages
{counts.models} statistical models for{' '} {counts.modelLangs} languages
Pre-trained word vectors
Easy deep learning integration
Part-of-speech tagging
Labelled dependency parsing
Syntax-driven sentence segmentation
Built-in visualizers for syntax and NER
Convenient string-to-hash mapping
Export to numpy data arrays
Efficient binary serialization
Easy model packaging and deployment
State-of-the-art speed
Robust, rigorously evaluated accuracy

Learn more from small training corpora by initializing your models with{' '} knowledge from raw text. The new pretrain command teaches spaCy's CNN model to predict words based on their context, producing representations of words in contexts. If you've seen Google's BERT system or fast.ai's ULMFiT, spaCy's pretraining is similar – but much more efficient. It's still experimental, but users are already reporting good results, so give it a try! Prodigy is an annotation tool so efficient that data scientists can do the annotation themselves, enabling a new level of rapid iteration. Whether you're working on entity recognition, intent detection or image classification, Prodigy can help you train and evaluate your models faster. Stream in your own examples or real-world data from live APIs, update your model in real time and chain models together to build more complex systems. spaCy v2.0 features new neural models for tagging,{' '} parsing and entity recognition. The models have been designed and implemented from scratch specifically for spaCy, to give you an unmatched balance of speed, size and accuracy. A novel bloom embedding strategy with subword features is used to support huge vocabularies in tiny tables. Convolutional layers with residual connections, layer normalization and maxout non-linearity are used, giving much better efficiency than the standard BiLSTM solution. Finally, the parser and NER use an imitation learning objective to deliver accuracy in line with the latest research systems, even when evaluated from raw text. With these innovations, spaCy v2.0's models are 10× smaller,{' '} 20% more accurate, and even cheaper to run than the previous generation.

Benchmarks

In 2015, independent researchers from Emory University and Yahoo! Labs showed that spaCy offered the fastest syntactic parser in the world and that its accuracy was within 1% of the best available ( Choi et al., 2015 ). spaCy v2.0, released in 2017, is more accurate than any of the systems Choi et al. evaluated.

) }

// Runtime prop validation for Landing. The shape of `data` mirrors the fields
// that landingQuery (below) selects under site.siteMetadata.
Landing.propTypes = {
    data: PropTypes.shape({
        repo: PropTypes.string,
        // Each language entry may list its model names; getCounts reads the
        // length of `models` to build the totals.
        languages: PropTypes.arrayOf(
            PropTypes.shape({
                models: PropTypes.arrayOf(PropTypes.string),
            })
        ),
        // Logo entries: both `id` and `url` are required strings.
        logosUsers: PropTypes.arrayOf(
            PropTypes.shape({
                id: PropTypes.string.isRequired,
                url: PropTypes.string.isRequired,
            })
        ),
        logosPublications: PropTypes.arrayOf(
            PropTypes.shape({
                id: PropTypes.string.isRequired,
                url: PropTypes.string.isRequired,
            })
        ),
    }),
}

// NOTE(review): the JSX element inside this default export looks truncated.
// Presumably it was a StaticQuery that runs landingQuery and renders Landing
// with the result — confirm against the original file.
export default () => ( } /> )

// GraphQL query for the site metadata fields validated by Landing.propTypes:
// repo, languages.models, and the id/url pairs of both logo lists.
const landingQuery = graphql` query LandingQuery { site { siteMetadata { repo languages { models } logosUsers { id url } logosPublications { id url } } } } `