//-
  Library comparison section: one collapsible +comparison(...) block per
  competing toolkit, followed by a summary of peer-reviewed evaluations.
  The mixins used here (comparison, row, columns) are not defined in this
  fragment and are expected to be in scope from elsewhere.

//- Link targets for the papers cited in "Peer-reviewed Evaluations".
- var urls = {}
- urls.choi_paper = "http://aclweb.org/anthology/P/P15/P15-1038.pdf"
- urls.emnlp_paper = "honnibal_johnson_emnlp2015.pdf"


//- List item classes used below: .pro / .con / .neutral.
+comparison("NLTK")
  p spaCy is:
  ul
    li.pro 100x faster;
    li.pro 50% more accurate;
    li.pro Serializes TODO% smaller;

  p spaCy features:
  ul
    li.pro Integrated word vectors;
    li.pro Efficient binary serialization;

  p NLTK features:
  ul
    li.con Multiple languages;
    li.neutral Educational resources


//- Disabled placeholder. Unbuffered (//-) so it is not emitted into the HTML.
//- +comparison("Pattern")

+comparison("CoreNLP")
  p spaCy is:
  ul
    li.pro TODO% faster;
    li.pro TODO% more accurate;
    li.pro Not Java;
    li.pro Well documented;
    li.pro Cheaper to license commercially;
    li.neutral
      | Opinionated/Minimalist. spaCy avoids providing redundant or overlapping
      | options.

  p CoreNLP features:
  ul
    li.con Multiple Languages;
    li.con Sentiment analysis
    li.con Coreference resolution


+comparison("ClearNLP")
  p spaCy is:
  ul
    li.pro Not Java;
    li.pro TODO% faster;
    li.pro Well documented;
    li.neutral Slightly more accurate;

  p ClearNLP features:
  ul
    li.con Semantic Role Labelling
    li.con Multiple Languages
    li.con Model for biology/life-science;


//- Disabled placeholder. Unbuffered (//-) so it is not emitted into the HTML.
//- +comparison("Accuracy Summary")

//-
  Disabled draft of the "Speed Summary" section, kept for reference.
  This is an unbuffered block comment: nothing below, up to the next
  unindented line, is compiled or written to the rendered page.
  NOTE(review): the "spaCy" row appears twice in this draft.

  +comparison("Speed Summary")
    table
      thead
        tr
          th.
          th(colspan=3) Absolute (ms per doc)
          th(colspan=3) Relative (to spaCy)

      tbody
        tr
          td: strong System
          td: strong Split
          td: strong Tag
          td: strong Parse
          td: strong Split
          td: strong Tag
          td: strong Parse

        +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
        +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
        +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x")
        +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x")
        +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a")

    p
      | Set up: 100,000 plain-text documents were streamed
      | from an SQLite3 database, and processed with an NLP library, to one
      | of three levels of detail – tokenization, tagging, or parsing.
      | The tasks are additive: to parse the text you have to tokenize and
      | tag it. The pre-processing was not subtracted from the times –
      | I report the time required for the pipeline to complete. I report
      | mean times per document, in milliseconds.

    p
      | Hardware: Intel i7-3770 (2012)


+comparison("Peer-reviewed Evaluations")
  p.
    spaCy is committed to rigorous evaluation under standard methodology. Two
    papers in 2015 confirm that:
  ol
    li spaCy is the fastest syntactic parser in the world;
    li Its accuracy is within 1% of the best available;
    li The few systems that are more accurate are 20× slower or more.

  p
    | spaCy v0.84 was evaluated by researchers at Yahoo! Labs and Emory University,
    | as part of a survey paper benchmarking the current state-of-the-art dependency
    | parsers
    a(href=urls.choi_paper) (Choi et al., 2015)
    | .

  //- Column order: system name, implementation language, accuracy, speed.
  table
    thead
      +columns("System", "Language", "Accuracy", "Speed")

    tbody
      +row("spaCy v0.84", "Cython", "90.6", "13,963")
      +row("spaCy v0.89", "Cython", "91.8", "13,000 (est.)")
      +row("ClearNLP", "Java", "91.7", "10,271")
      +row("CoreNLP", "Java", "89.6", "8,602")
      +row("MATE", "Java", "92.5", "550")
      +row("Turbo", "C++", "92.4", "349")
      +row("Yara", "Java", "92.3", "340")

  p
    | Discussion with the authors led to accuracy improvements in spaCy, which
    | have been accepted for publication in EMNLP, in joint work with Macquarie
    | University
    a(href=urls.emnlp_paper) (Honnibal and Johnson, 2015)
    | .