mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
* Remove old docs stuff
This commit is contained in:
parent
b15619e170
commit
2d4e5ceafd
268
docs/conf.py
268
docs/conf.py
|
@ -1,268 +0,0 @@
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
#
|
|
||||||
# spaCy documentation build configuration file, created by
|
|
||||||
# sphinx-quickstart on Tue Aug 19 16:27:38 2014.
|
|
||||||
#
|
|
||||||
# This file is execfile()d with the current directory set to its
|
|
||||||
# containing dir.
|
|
||||||
#
|
|
||||||
# Note that not all possible configuration values are present in this
|
|
||||||
# autogenerated file.
|
|
||||||
#
|
|
||||||
# All configuration values have a default; values that are commented out
|
|
||||||
# serve to show the default.
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
import os.path
|
|
||||||
import sphinx_rtd_theme
|
|
||||||
|
|
||||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
||||||
# If extensions (or modules to document with autodoc) are in another directory,
|
|
||||||
# add these directories to sys.path here. If the directory is relative to the
|
|
||||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
|
||||||
#sys.path.insert(0, os.path.abspath('.'))
|
|
||||||
|
|
||||||
# -- General configuration ------------------------------------------------
|
|
||||||
|
|
||||||
# If your documentation needs a minimal Sphinx version, state it here.
|
|
||||||
#needs_sphinx = '1.0'
|
|
||||||
|
|
||||||
# Add any Sphinx extension module names here, as strings. They can be
|
|
||||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
|
||||||
# ones.
|
|
||||||
extensions = [
|
|
||||||
'sphinx.ext.coverage',
|
|
||||||
'sphinx.ext.viewcode',
|
|
||||||
'sphinx.ext.autodoc',
|
|
||||||
'sphinxcontrib.napoleon',
|
|
||||||
'sphinx.ext.doctest'
|
|
||||||
]
|
|
||||||
|
|
||||||
# Add any paths that contain templates here, relative to this directory.
|
|
||||||
templates_path = ['_templates']
|
|
||||||
|
|
||||||
# The suffix of source filenames.
|
|
||||||
source_suffix = '.rst'
|
|
||||||
|
|
||||||
# The encoding of source files.
|
|
||||||
#source_encoding = 'utf-8-sig'
|
|
||||||
|
|
||||||
# The master toctree document.
|
|
||||||
master_doc = 'index'
|
|
||||||
|
|
||||||
# General information about the project.
|
|
||||||
project = u'spaCy'
|
|
||||||
copyright = u'2014, Matthew Honnibal'
|
|
||||||
|
|
||||||
# The version info for the project you're documenting, acts as replacement for
|
|
||||||
# |version| and |release|, also used in various other places throughout the
|
|
||||||
# built documents.
|
|
||||||
#
|
|
||||||
# The short X.Y version.
|
|
||||||
version = '0.1'
|
|
||||||
# The full version, including alpha/beta/rc tags.
|
|
||||||
release = '0.1'
|
|
||||||
|
|
||||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
|
||||||
# for a list of supported languages.
|
|
||||||
#language = None
|
|
||||||
|
|
||||||
# There are two options for replacing |today|: either, you set today to some
|
|
||||||
# non-false value, then it is used:
|
|
||||||
#today = ''
|
|
||||||
# Else, today_fmt is used as the format for a strftime call.
|
|
||||||
#today_fmt = '%B %d, %Y'
|
|
||||||
|
|
||||||
# List of patterns, relative to source directory, that match files and
|
|
||||||
# directories to ignore when looking for source files.
|
|
||||||
exclude_patterns = ['_build']
|
|
||||||
|
|
||||||
# The reST default role (used for this markup: `text`) to use for all
|
|
||||||
# documents.
|
|
||||||
#default_role = None
|
|
||||||
|
|
||||||
# If true, '()' will be appended to :func: etc. cross-reference text.
|
|
||||||
#add_function_parentheses = True
|
|
||||||
|
|
||||||
# If true, the current module name will be prepended to all description
|
|
||||||
# unit titles (such as .. function::).
|
|
||||||
#add_module_names = True
|
|
||||||
|
|
||||||
# If true, sectionauthor and moduleauthor directives will be shown in the
|
|
||||||
# output. They are ignored by default.
|
|
||||||
#show_authors = False
|
|
||||||
|
|
||||||
# The name of the Pygments (syntax highlighting) style to use.
|
|
||||||
pygments_style = 'sphinx'
|
|
||||||
|
|
||||||
# A list of ignored prefixes for module index sorting.
|
|
||||||
#modindex_common_prefix = []
|
|
||||||
|
|
||||||
# If true, keep warnings as "system message" paragraphs in the built documents.
|
|
||||||
#keep_warnings = False
|
|
||||||
|
|
||||||
|
|
||||||
# -- Options for HTML output ----------------------------------------------
|
|
||||||
|
|
||||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
|
||||||
# a list of builtin themes.
|
|
||||||
|
|
||||||
html_theme = 'sphinx_rtd_theme'
|
|
||||||
|
|
||||||
# Theme options are theme-specific and customize the look and feel of a theme
|
|
||||||
# further. For a list of options available for each theme, see the
|
|
||||||
# documentation.
|
|
||||||
#html_theme_options = {}
|
|
||||||
|
|
||||||
# Add any paths that contain custom themes here, relative to this directory.
|
|
||||||
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
|
|
||||||
|
|
||||||
# The name for this set of Sphinx documents. If None, it defaults to
|
|
||||||
# "<project> v<release> documentation".
|
|
||||||
#html_title = None
|
|
||||||
|
|
||||||
# A shorter title for the navigation bar. Default is the same as html_title.
|
|
||||||
#html_short_title = None
|
|
||||||
|
|
||||||
# The name of an image file (relative to this directory) to place at the top
|
|
||||||
# of the sidebar.
|
|
||||||
#html_logo = None
|
|
||||||
|
|
||||||
# The name of an image file (within the static path) to use as favicon of the
|
|
||||||
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
|
||||||
# pixels large.
|
|
||||||
#html_favicon = None
|
|
||||||
|
|
||||||
# Add any paths that contain custom static files (such as style sheets) here,
|
|
||||||
# relative to this directory. They are copied after the builtin static files,
|
|
||||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
|
||||||
html_static_path = ['_static']
|
|
||||||
|
|
||||||
# Add any extra paths that contain custom files (such as robots.txt or
|
|
||||||
# .htaccess) here, relative to this directory. These files are copied
|
|
||||||
# directly to the root of the documentation.
|
|
||||||
#html_extra_path = []
|
|
||||||
|
|
||||||
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
|
||||||
# using the given strftime format.
|
|
||||||
#html_last_updated_fmt = '%b %d, %Y'
|
|
||||||
|
|
||||||
# If true, SmartyPants will be used to convert quotes and dashes to
|
|
||||||
# typographically correct entities.
|
|
||||||
#html_use_smartypants = True
|
|
||||||
|
|
||||||
# Custom sidebar templates, maps document names to template names.
|
|
||||||
#html_sidebars = {}
|
|
||||||
|
|
||||||
# Additional templates that should be rendered to pages, maps page names to
|
|
||||||
# template names.
|
|
||||||
#html_additional_pages = {}
|
|
||||||
|
|
||||||
# If false, no module index is generated.
|
|
||||||
#html_domain_indices = True
|
|
||||||
|
|
||||||
# If false, no index is generated.
|
|
||||||
#html_use_index = True
|
|
||||||
|
|
||||||
# If true, the index is split into individual pages for each letter.
|
|
||||||
#html_split_index = False
|
|
||||||
|
|
||||||
# If true, links to the reST sources are added to the pages.
|
|
||||||
html_show_sourcelink = False
|
|
||||||
|
|
||||||
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
|
|
||||||
#html_show_sphinx = True
|
|
||||||
|
|
||||||
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
|
|
||||||
#html_show_copyright = True
|
|
||||||
|
|
||||||
# If true, an OpenSearch description file will be output, and all pages will
|
|
||||||
# contain a <link> tag referring to it. The value of this option must be the
|
|
||||||
# base URL from which the finished HTML is served.
|
|
||||||
#html_use_opensearch = ''
|
|
||||||
|
|
||||||
# This is the file name suffix for HTML files (e.g. ".xhtml").
|
|
||||||
#html_file_suffix = None
|
|
||||||
|
|
||||||
# Output file base name for HTML help builder.
|
|
||||||
htmlhelp_basename = 'spaCydoc'
|
|
||||||
|
|
||||||
|
|
||||||
# -- Options for LaTeX output ---------------------------------------------
|
|
||||||
|
|
||||||
latex_elements = {
|
|
||||||
# The paper size ('letterpaper' or 'a4paper').
|
|
||||||
#'papersize': 'letterpaper',
|
|
||||||
|
|
||||||
# The font size ('10pt', '11pt' or '12pt').
|
|
||||||
#'pointsize': '10pt',
|
|
||||||
|
|
||||||
# Additional stuff for the LaTeX preamble.
|
|
||||||
#'preamble': '',
|
|
||||||
}
|
|
||||||
|
|
||||||
# Grouping the document tree into LaTeX files. List of tuples
|
|
||||||
# (source start file, target name, title,
|
|
||||||
# author, documentclass [howto, manual, or own class]).
|
|
||||||
latex_documents = [
|
|
||||||
('index', 'spaCy.tex', u'spaCy Documentation',
|
|
||||||
u'Matthew Honnibal', 'manual'),
|
|
||||||
]
|
|
||||||
|
|
||||||
# The name of an image file (relative to this directory) to place at the top of
|
|
||||||
# the title page.
|
|
||||||
#latex_logo = None
|
|
||||||
|
|
||||||
# For "manual" documents, if this is true, then toplevel headings are parts,
|
|
||||||
# not chapters.
|
|
||||||
#latex_use_parts = False
|
|
||||||
|
|
||||||
# If true, show page references after internal links.
|
|
||||||
#latex_show_pagerefs = False
|
|
||||||
|
|
||||||
# If true, show URL addresses after external links.
|
|
||||||
#latex_show_urls = False
|
|
||||||
|
|
||||||
# Documents to append as an appendix to all manuals.
|
|
||||||
#latex_appendices = []
|
|
||||||
|
|
||||||
# If false, no module index is generated.
|
|
||||||
#latex_domain_indices = True
|
|
||||||
|
|
||||||
|
|
||||||
# -- Options for manual page output ---------------------------------------
|
|
||||||
|
|
||||||
# One entry per manual page. List of tuples
|
|
||||||
# (source start file, name, description, authors, manual section).
|
|
||||||
man_pages = [
|
|
||||||
('index', 'spacy', u'spaCy Documentation',
|
|
||||||
[u'Matthew Honnibal'], 1)
|
|
||||||
]
|
|
||||||
|
|
||||||
# If true, show URL addresses after external links.
|
|
||||||
#man_show_urls = False
|
|
||||||
|
|
||||||
|
|
||||||
# -- Options for Texinfo output -------------------------------------------
|
|
||||||
|
|
||||||
# Grouping the document tree into Texinfo files. List of tuples
|
|
||||||
# (source start file, target name, title, author,
|
|
||||||
# dir menu entry, description, category)
|
|
||||||
texinfo_documents = [
|
|
||||||
('index', 'spaCy', u'spaCy Documentation',
|
|
||||||
u'Matthew Honnibal', 'spaCy', 'One line description of project.',
|
|
||||||
'Miscellaneous'),
|
|
||||||
]
|
|
||||||
|
|
||||||
# Documents to append as an appendix to all manuals.
|
|
||||||
#texinfo_appendices = []
|
|
||||||
|
|
||||||
# If false, no module index is generated.
|
|
||||||
#texinfo_domain_indices = True
|
|
||||||
|
|
||||||
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
|
||||||
#texinfo_show_urls = 'footnote'
|
|
||||||
|
|
||||||
# If true, do not generate a @detailmenu in the "Top" node's menu.
|
|
||||||
#texinfo_no_detailmenu = False
|
|
|
@ -1,22 +0,0 @@
|
||||||
Installation
|
|
||||||
============
|
|
||||||
|
|
||||||
pip install spacy
|
|
||||||
-----------------
|
|
||||||
|
|
||||||
The easiest way to install is from PyPI via pip::
|
|
||||||
|
|
||||||
pip install spacy
|
|
||||||
|
|
||||||
git clone http://github.com/honnibal/spaCy.git
|
|
||||||
----------------------------------------------
|
|
||||||
|
|
||||||
Installation from source via `GitHub <https://github.com/honnibal/spaCy>`_, using virtualenv::
|
|
||||||
|
|
||||||
$ git clone http://github.com/honnibal/spaCy.git
|
|
||||||
$ cd spaCy
|
|
||||||
$ virtualenv .env
|
|
||||||
$ source .env/bin/activate
|
|
||||||
$ pip install -r requirements.txt
|
|
||||||
$ fab make
|
|
||||||
$ fab test
|
|
|
@ -1,71 +0,0 @@
|
||||||
Overview
|
|
||||||
========
|
|
||||||
|
|
||||||
What and Why
|
|
||||||
------------
|
|
||||||
|
|
||||||
spaCy is a lightning-fast, full-cream NLP tokenizer and lexicon.
|
|
||||||
|
|
||||||
Most tokenizers give you a sequence of strings. That's barbaric.
|
|
||||||
Giving you strings invites you to compute on every *token*, when what
|
|
||||||
you should be doing is computing on every *type*. Remember
|
|
||||||
`Zipf's law <http://en.wikipedia.org/wiki/Zipf's_law>`_: you'll
|
|
||||||
see exponentially fewer types than tokens.
|
|
||||||
|
|
||||||
Instead of strings, spaCy gives you references to Lexeme objects, from which you
|
|
||||||
can access an excellent set of pre-computed orthographic and distributional features:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
>>> from spacy import en
|
|
||||||
>>> apples, are, nt, oranges, dots = en.EN.tokenize(u"Apples aren't oranges...")
|
|
||||||
>>> are.prob >= oranges.prob
|
|
||||||
True
|
|
||||||
>>> apples.check_flag(en.IS_TITLE)
|
|
||||||
True
|
|
||||||
>>> apples.check_flag(en.OFT_TITLE)
|
|
||||||
False
|
|
||||||
>>> are.check_flag(en.CAN_NOUN)
|
|
||||||
False
|
|
||||||
|
|
||||||
spaCy makes it easy to write very efficient NLP applications, because your feature
|
|
||||||
functions have to do almost no work: almost every lexical property you'll want
|
|
||||||
is pre-computed for you. See the tutorial for an example POS tagger.
|
|
||||||
|
|
||||||
Benchmark
|
|
||||||
---------
|
|
||||||
|
|
||||||
The tokenizer itself is also very efficient:
|
|
||||||
|
|
||||||
+--------+-------+--------------+--------------+
|
|
||||||
| System | Time | Words/second | Speed Factor |
|
|
||||||
+--------+-------+--------------+--------------+
|
|
||||||
| NLTK | 6m4s | 89,000 | 1.00 |
|
|
||||||
+--------+-------+--------------+--------------+
|
|
||||||
| spaCy | 9.5s | 3,093,000 | 38.30 |
|
|
||||||
+--------+-------+--------------+--------------+
|
|
||||||
|
|
||||||
The comparison refers to 30 million words from the English Gigaword, on
|
|
||||||
a MacBook Air. For context, calling string.split() on the data completes in
|
|
||||||
about 5s.
|
|
||||||
|
|
||||||
Pros and Cons
|
|
||||||
-------------
|
|
||||||
|
|
||||||
Pros:
|
|
||||||
|
|
||||||
- All tokens come with indices into the original string
|
|
||||||
- Full unicode support
|
|
||||||
- Extensible to other languages
|
|
||||||
- Batch operations computed efficiently in Cython
|
|
||||||
- Cython API
|
|
||||||
- numpy interoperability
|
|
||||||
|
|
||||||
Cons:
|
|
||||||
|
|
||||||
- It's new (released September 2014)
|
|
||||||
- Security concerns, from memory management
|
|
||||||
- Higher memory usage (up to 1gb)
|
|
||||||
- More conceptually complicated
|
|
||||||
- Tokenization rules expressed in code, not as data
|
|
||||||
|
|
|
@ -1,34 +0,0 @@
|
||||||
.. spaCy documentation master file, created by
|
|
||||||
sphinx-quickstart on Tue Aug 19 16:27:38 2014.
|
|
||||||
You can adapt this file completely to your liking, but it should at least
|
|
||||||
contain the root `toctree` directive.
|
|
||||||
|
|
||||||
spaCy NLP Tokenizer and Lexicon
|
|
||||||
================================
|
|
||||||
|
|
||||||
.. toctree::
|
|
||||||
:maxdepth: 3
|
|
||||||
|
|
||||||
guide/overview.rst
|
|
||||||
guide/install.rst
|
|
||||||
|
|
||||||
api/index.rst
|
|
||||||
|
|
||||||
modules/index.rst
|
|
||||||
|
|
||||||
|
|
||||||
Source (GitHub)
|
|
||||||
----------------
|
|
||||||
|
|
||||||
http://github.com/honnibal/spaCy
|
|
||||||
|
|
||||||
License
|
|
||||||
-------
|
|
||||||
|
|
||||||
Copyright Matthew Honnibal
|
|
||||||
|
|
||||||
Non-commercial use: $0
|
|
||||||
Commercial trial use: $0
|
|
||||||
Full commercial license: $500
|
|
||||||
|
|
||||||
honnibal@gmail.com
|
|
|
@ -1,20 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
from spacy._hashing import PointerHash
|
|
||||||
import random
|
|
||||||
|
|
||||||
|
|
||||||
def test_insert():
    """Check that stored values read back and an unset key reads as None."""
    table = PointerHash()
    # A key that has never been assigned is expected to read back as None.
    assert table[1] is None
    table[1] = 5
    assert table[1] == 5
    # Storing a second key must leave the first entry untouched.
    table[2] = 6
    assert table[1] == 5
    assert table[2] == 6
|
|
||||||
|
|
||||||
def test_resize():
    """Insert far more entries than the constructor argument and read them back.

    The table is built with ``PointerHash(4)`` (presumably an initial size --
    confirm against the PointerHash implementation) and then receives 99
    distinct keys. Every stored value is recorded and checked afterwards, so
    the test fails if any entry is lost or corrupted while the table grows.
    """
    h = PointerHash(4)
    expected = {}
    for i in range(1, 100):
        # value is always >= i >= 1, so it is never zero.
        value = int(i * (random.random() + 1))
        h[i] = value
        expected[i] = value
    # Previously the test only inserted and asserted nothing; verify that
    # every key still maps to the value that was stored for it.
    for key, value in expected.items():
        assert h[key] == value
|
|
Loading…
Reference in New Issue
Block a user