mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
163 lines
9.5 KiB
Plaintext
163 lines
9.5 KiB
Plaintext
//- ----------------------------------
|
|
//- 💫 LANDING PAGE
|
|
//- ----------------------------------
|
|
|
|
include _includes/_mixins
|
|
|
|
header.o-header.u-pattern
|
|
h1.o-block-small.u-heading-0.u-text-shadow
|
|
| Industrial-strength#[br]
|
|
| Natural Language#[br]
|
|
| Processing
|
|
|
|
.o-block-small.u-text-medium.u-text-strong.u-text-shadow
|
|
| Thousands of researchers are trying to make#[br]
|
|
| computers understand text. They're succeeding.#[br]
|
|
| spaCy is a Python NLP library that helps you get#[br]
|
|
| their work out of papers and into production.
|
|
|
|
.o-inline-list
|
|
+button("/docs/#getting-started", true, "secondary")(target="_self") Install spaCy
|
|
|
|
main.o-main
|
|
.o-inline-list.u-padding-small.u-text-center.u-text-label.u-color-subtle.u-border-bottom
|
|
+a("https://github.com/" + SOCIAL.github + "/spaCy/releases") #[strong Latest Release:] v#{SPACY_VERSION}
|
|
|
|
+a("https://github.com/" + SOCIAL.github + "/spaCy") #[+icon("github")] #[strong #{SPACY_STARS}+ stars] on GitHub
|
|
|
|
+a("https://www.reddit.com/r/" + SOCIAL.reddit) #[+icon("reddit")] #[strong User Group] on Reddit
|
|
|
|
+grid.u-border-bottom
|
|
+grid-col("half").u-padding
|
|
+label Release update
|
|
+h(2)
|
|
+a("https://github.com/" + SOCIAL.github + "/spaCy/releases") spaCy v1.0 out now!
|
|
|
|
p.u-text-medium I'm excited — and more than a little nervous! — to finally make the #[+a("https://github.com/" + SOCIAL.github + "/spaCy/releases") 1.0 release of spaCy]. By far my favourite part of the release is the new support for custom pipelines. Default support for GloVe vectors is also nice. The trickiest change was a significant rewrite of the Matcher class, to support entity IDs and attributes. I've added #[a(href="/docs/#tutorials") tutorials] for the new features, and some training examples.#[br]#[br]
|
|
|
|
+button("https://explosion.ai/blog/spacy-deep-learning-keras", true, "primary") Read the blog post
|
|
|
|
+grid-col("half").u-padding
|
|
+label Are you using spaCy?
|
|
+h(2)
|
|
+a("https://survey.spacy.io", true) Take the spaCy user survey
|
|
|
|
p.u-text-medium.
|
|
Two years after I started working on spaCy full time, I'm finally pushing forward with a #[strong 1.0 release]. It's also past time to take a bit of a census. I hope you'll take a few minutes to fill out this survey, to help me understand how you're using the library, and how it can be better.#[br]#[br]
|
|
|
|
#[strong Thanks for your support!]#[br]
|
|
#[strong.u-heading-3.u-color-theme Matt]
|
|
#[br]#[br]
|
|
|
|
#[+button("https://survey.spacy.io", true, "primary") Take the survey]
|
|
|
|
+grid
|
|
+grid-col("half").u-padding
|
|
+h(2) Built for Production
|
|
p.u-text-medium Most AI software is built for research. Over the last ten years, we've used a lot of that software, and built some of it ourselves, especially for natural language processing (NLP). But the faster the research has moved, the more impatient we've become. We want to see advanced NLP technologies get out into great products, as the basis of great businesses. We built spaCy to make that happen.
|
|
|
|
+h(2) Easy and Powerful
|
|
p.u-text-medium For any NLP task, there are always lots of competing algorithms. We don't believe in implementing them all and letting you choose. Instead, we just implement one – the best one. When better algorithms are developed, we can update the library without breaking your code or bloating the API. This approach makes spaCy both #[strong easier] and #[strong more powerful] than a pluggable architecture. spaCy also features a #[strong unique whole-document design]. Where other NLP libraries rely on sentence detection as a pre-process, spaCy reads the whole document at once, making it much more robust to informal and poorly formatted text.
|
|
|
|
+h(2) Permissive open-source license (MIT)
|
|
p.u-text-medium We think spaCy is valuable software, so we made it free, to raise its value even higher. Making spaCy open-source puts us on the same side – we can tell you everything about how it works, and let you run it however you like. We think the software would be much less valuable as a service, which could disappear at any point.
|
|
|
|
+grid-col("half").u-padding-medium
|
|
.x-terminal
|
|
.x-terminal__icons: span
|
|
.u-padding-small.u-text-label.u-text-strong.u-text-center lightning_tour.py
|
|
|
|
+code.x-terminal__code.
|
|
# pip install spacy && python -m spacy.en.download
|
|
import spacy
|
|
|
|
# Load English tokenizer, tagger, parser, NER and word vectors
|
|
nlp = spacy.load('en')
|
|
# Process a document, of any size
|
|
text = open('war_and_peace.txt').read()
|
|
doc = nlp(text)
|
|
|
|
from spacy.attrs import *
|
|
# All strings mapped to integers, for easy export to numpy
|
|
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
|
|
|
from reddit_corpus import RedditComments
|
|
reddit = RedditComments('/path/to/reddit/corpus')
|
|
# Parse a stream of documents, with multi-threading (no GIL!)
|
|
# Processes over 100,000 tokens per second.
|
|
for doc in nlp.pipe(reddit.texts, batch_size=10000, n_threads=4):
|
|
# Multi-word expressions, such as names, dates, etc.
|
|
# can be merged into single tokens
|
|
for ent in doc.ents:
|
|
ent.merge(ent.root.tag_, ent.text, ent.ent_type_)
|
|
# Efficient, lossless serialization --- all annotations
|
|
# saved, same size as uncompressed text
|
|
byte_string = doc.to_bytes()
|
|
|
|
.o-block.u-text-center
|
|
+label spaCy is trusted by
|
|
|
|
.o-block
|
|
each row in logos
|
|
.o-inline-list.u-padding-medium
|
|
each logo in row
|
|
+a(logo[1])
|
|
img(src="/assets/img/logos/" + logo[0] + ".png" width="150").u-padding-medium
|
|
|
|
.o-block.u-text-center.u-padding
|
|
+label What our users say...
|
|
|
|
+grid
|
|
+grid-col("third")
|
|
<blockquote class="twitter-tweet" data-cards="hidden" data-lang="en"><p lang="en" dir="ltr">"Dead Code Should be Buried" <a href="http://t.co/AxfZRRz8nB">http://t.co/AxfZRRz8nB</a> by <a href="https://twitter.com/honnibal">@honnibal</a> on NLP tools &amp; new Python library spaCy <a href="http://t.co/C9f798R3aO">http://t.co/C9f798R3aO</a> looks nice!</p>— Andrej Karpathy (@karpathy) <a href="https://twitter.com/karpathy/status/640098689894232064">September 5, 2015</a></blockquote>
|
|
|
|
+grid-col("third")
|
|
<blockquote class="twitter-tweet" data-cards="hidden" data-lang="en"><p lang="en" dir="ltr">spaCy seems pretty exciting to me - and it is clear that NLTK has not kept up with <a href="https://twitter.com/hashtag/NLP?src=hash">#NLP</a>. <a href="http://t.co/mUPFUMLrbo">http://t.co/mUPFUMLrbo</a> <a href="https://twitter.com/hashtag/python?src=hash">#python</a> <a href="https://twitter.com/hashtag/datascience?src=hash">#datascience</a></p>— Alex Engler (@AlexCEngler) <a href="https://twitter.com/AlexCEngler/status/648537133544833025">September 28, 2015</a></blockquote>
|
|
|
|
+grid-col("third")
|
|
<blockquote class="twitter-tweet" data-lang="en"><p lang="en" dir="ltr">I wish I'd known about nlp.pipe about two weeks ago. Nice feature in <a href="https://twitter.com/Spacy">@spacy</a> to parallelize your NLP pipeline.</p>— Matti Lyra (@mattilyra) <a href="https://twitter.com/mattilyra/status/704753660329369600">March 1, 2016</a></blockquote>
|
|
|
|
+grid
|
|
+grid-col("half").u-padding
|
|
+label Benchmarks
|
|
+h(2) State-of-the-art speed and accuracy
|
|
|
|
p.u-text-medium spaCy is committed to rigorous evaluation under standard methodology. Two peer-reviewed papers in 2015 confirm that it offers the #[strong fastest syntactic parser in the world] and that #[strong its accuracy is within 2% of the best] available.
|
|
|
|
sup #[+a("http://aclweb.org/anthology/P/P15/P15-1038.pdf") [1]], #[+a("https://aclweb.org/anthology/D/D15/D15-1162.pdf") [2]], #[+a("http://homes.cs.washington.edu/~lsz/papers/llz-naacl16.pdf") [3]]
|
|
|
|
+grid-col("half").u-padding
|
|
+table(["System", "Language", "Accuracy", "Speed (WPS)"])
|
|
+row
|
|
+cell
|
|
!=partial("_includes/_logo", { logo_size: "tiny" })
|
|
+cell #[strong Cython]
|
|
+cell #[strong 91.8]
|
|
+cell #[strong 13,963]
|
|
|
|
+row
|
|
+cell ClearNLP
|
|
+cell Java
|
|
+cell 91.7
|
|
+cell 10,271
|
|
|
|
+row
|
|
+cell CoreNLP
|
|
+cell Java
|
|
+cell 89.6
|
|
+cell 8,602
|
|
|
|
+row
|
|
+cell MATE
|
|
+cell Java
|
|
+cell 92.5
|
|
+cell 550
|
|
|
|
+row
|
|
+cell Turbo
|
|
+cell C++
|
|
+cell 92.4
|
|
+cell 349
|
|
|
|
include _includes/_newsletter
|