mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Re-add docs, sorting out mess from gh-pages
This commit is contained in:
parent
aba4a7c7ea
commit
bfab6403bc
177
docs/Makefile
Normal file
177
docs/Makefile
Normal file
|
@ -0,0 +1,177 @@
|
||||||
|
# Makefile for Sphinx documentation
|
||||||
|
#
|
||||||
|
|
||||||
|
# You can set these variables from the command line.
|
||||||
|
SPHINXOPTS =
|
||||||
|
SPHINXBUILD = sphinx-build
|
||||||
|
PAPER =
|
||||||
|
BUILDDIR = ../../docs-spacy
|
||||||
|
|
||||||
|
# User-friendly check for sphinx-build
|
||||||
|
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
|
||||||
|
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Internal variables.
|
||||||
|
PAPEROPT_a4 = -D latex_paper_size=a4
|
||||||
|
PAPEROPT_letter = -D latex_paper_size=letter
|
||||||
|
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
|
||||||
|
# the i18n builder cannot share the environment and doctrees with the others
|
||||||
|
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
|
||||||
|
|
||||||
|
# The targets listed here are commands, not files: their recipes write into
# $(BUILDDIR) and never create a file named after the target. Declaring them
# phony keeps a stray file or directory called e.g. "info" or "xml" from making
# the target look up to date. The list includes every target advertised by `help`.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info gettext changes linkcheck doctest xml pseudoxml
|
||||||
|
|
||||||
|
help:
|
||||||
|
@echo "Please use \`make <target>' where <target> is one of"
|
||||||
|
@echo " html to make standalone HTML files"
|
||||||
|
@echo " dirhtml to make HTML files named index.html in directories"
|
||||||
|
@echo " singlehtml to make a single large HTML file"
|
||||||
|
@echo " pickle to make pickle files"
|
||||||
|
@echo " json to make JSON files"
|
||||||
|
@echo " htmlhelp to make HTML files and a HTML help project"
|
||||||
|
@echo " qthelp to make HTML files and a qthelp project"
|
||||||
|
@echo " devhelp to make HTML files and a Devhelp project"
|
||||||
|
@echo " epub to make an epub"
|
||||||
|
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
|
||||||
|
@echo " latexpdf to make LaTeX files and run them through pdflatex"
|
||||||
|
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
|
||||||
|
@echo " text to make text files"
|
||||||
|
@echo " man to make manual pages"
|
||||||
|
@echo " texinfo to make Texinfo files"
|
||||||
|
@echo " info to make Texinfo files and run them through makeinfo"
|
||||||
|
@echo " gettext to make PO message catalogs"
|
||||||
|
@echo " changes to make an overview of all changed/added/deprecated items"
|
||||||
|
@echo " xml to make Docutils-native XML files"
|
||||||
|
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
|
||||||
|
@echo " linkcheck to check all external links for integrity"
|
||||||
|
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
|
||||||
|
|
||||||
|
# Remove everything under the build directory.
# BUILDDIR can be overridden from the command line; if it expands to nothing,
# the glob below would become "/*", so abort before running rm in that case.
clean:
	$(if $(strip $(BUILDDIR)),,$(error BUILDDIR is empty; refusing to run rm -rf on /*))
	rm -rf $(BUILDDIR)/*
|
||||||
|
|
||||||
|
html:
|
||||||
|
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
|
||||||
|
|
||||||
|
dirhtml:
|
||||||
|
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
|
||||||
|
|
||||||
|
singlehtml:
|
||||||
|
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
|
||||||
|
|
||||||
|
pickle:
|
||||||
|
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
|
||||||
|
@echo
|
||||||
|
@echo "Build finished; now you can process the pickle files."
|
||||||
|
|
||||||
|
json:
|
||||||
|
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
|
||||||
|
@echo
|
||||||
|
@echo "Build finished; now you can process the JSON files."
|
||||||
|
|
||||||
|
htmlhelp:
|
||||||
|
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
|
||||||
|
@echo
|
||||||
|
@echo "Build finished; now you can run HTML Help Workshop with the" \
|
||||||
|
".hhp project file in $(BUILDDIR)/htmlhelp."
|
||||||
|
|
||||||
|
qthelp:
|
||||||
|
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
|
||||||
|
@echo
|
||||||
|
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
|
||||||
|
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
|
||||||
|
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/spaCy.qhcp"
|
||||||
|
@echo "To view the help file:"
|
||||||
|
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/spaCy.qhc"
|
||||||
|
|
||||||
|
devhelp:
|
||||||
|
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
|
||||||
|
@echo
|
||||||
|
@echo "Build finished."
|
||||||
|
@echo "To view the help file:"
|
||||||
|
@echo "# mkdir -p $$HOME/.local/share/devhelp/spaCy"
|
||||||
|
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/spaCy"
|
||||||
|
@echo "# devhelp"
|
||||||
|
|
||||||
|
epub:
|
||||||
|
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
|
||||||
|
|
||||||
|
latex:
|
||||||
|
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||||
|
@echo
|
||||||
|
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
|
||||||
|
@echo "Run \`make' in that directory to run these through (pdf)latex" \
|
||||||
|
"(use \`make latexpdf' here to do that automatically)."
|
||||||
|
|
||||||
|
latexpdf:
|
||||||
|
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||||
|
@echo "Running LaTeX files through pdflatex..."
|
||||||
|
$(MAKE) -C $(BUILDDIR)/latex all-pdf
|
||||||
|
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
||||||
|
|
||||||
|
latexpdfja:
|
||||||
|
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||||
|
@echo "Running LaTeX files through platex and dvipdfmx..."
|
||||||
|
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
|
||||||
|
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
||||||
|
|
||||||
|
text:
|
||||||
|
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The text files are in $(BUILDDIR)/text."
|
||||||
|
|
||||||
|
man:
|
||||||
|
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
|
||||||
|
|
||||||
|
texinfo:
|
||||||
|
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
|
||||||
|
@echo "Run \`make' in that directory to run these through makeinfo" \
|
||||||
|
"(use \`make info' here to do that automatically)."
|
||||||
|
|
||||||
|
# Build the Texinfo sources, then run `make info` inside $(BUILDDIR)/texinfo.
# The sub-make is invoked through $(MAKE), as latexpdf does, so that flags such
# as -n and -j and the make binary in use are passed down to it.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
|
||||||
|
|
||||||
|
gettext:
|
||||||
|
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
|
||||||
|
|
||||||
|
changes:
|
||||||
|
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
|
||||||
|
@echo
|
||||||
|
@echo "The overview file is in $(BUILDDIR)/changes."
|
||||||
|
|
||||||
|
linkcheck:
|
||||||
|
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
|
||||||
|
@echo
|
||||||
|
@echo "Link check complete; look for any errors in the above output " \
|
||||||
|
"or in $(BUILDDIR)/linkcheck/output.txt."
|
||||||
|
|
||||||
|
doctest:
|
||||||
|
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
|
||||||
|
@echo "Testing of doctests in the sources finished, look at the " \
|
||||||
|
"results in $(BUILDDIR)/doctest/output.txt."
|
||||||
|
|
||||||
|
xml:
|
||||||
|
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
|
||||||
|
|
||||||
|
pseudoxml:
|
||||||
|
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
|
8
docs/source/api/index.rst
Normal file
8
docs/source/api/index.rst
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
API
|
||||||
|
===
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
|
||||||
|
tokenizers/index.rst
|
||||||
|
lexicon.rst
|
6
docs/source/api/lexicon.rst
Normal file
6
docs/source/api/lexicon.rst
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
spacy.word.Lexeme
|
||||||
|
=================
|
||||||
|
|
||||||
|
|
||||||
|
.. autoclass:: spacy.word.Lexeme
|
||||||
|
:members:
|
94
docs/source/api/tokenizers/en.rst
Normal file
94
docs/source/api/tokenizers/en.rst
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
spacy.en.EN
|
||||||
|
============
|
||||||
|
|
||||||
|
.. automodule:: spacy.en
|
||||||
|
|
||||||
|
Tokenizer API
|
||||||
|
-------------
|
||||||
|
|
||||||
|
.. automethod:: spacy.en.EN.tokenize
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
.. automethod:: spacy.en.EN.lookup
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
Lexeme Features Flag IDs
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
A number of boolean features are computed for English Lexemes. To access a feature,
|
||||||
|
pass its ID to the :py:meth:`spacy.word.Lexeme.check_flag` function.
|
||||||
|
|
||||||
|
Orthographic Features
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
These features describe the `orthographic` (lettering) type of the word. The
|
||||||
|
function used to compute the value is listed along with the flag.
|
||||||
|
|
||||||
|
.. data:: IS_ALPHA
|
||||||
|
|
||||||
|
:py:func:`spacy.orth.is_alpha`
|
||||||
|
|
||||||
|
.. data:: IS_DIGIT
|
||||||
|
|
||||||
|
:py:func:`spacy.orth.is_digit`
|
||||||
|
|
||||||
|
.. data:: IS_UPPER
|
||||||
|
|
||||||
|
:py:func:`spacy.orth.is_upper`
|
||||||
|
|
||||||
|
.. data:: IS_PUNCT
|
||||||
|
|
||||||
|
:py:func:`spacy.orth.is_punct`
|
||||||
|
|
||||||
|
.. data:: IS_SPACE
|
||||||
|
|
||||||
|
:py:func:`spacy.orth.is_space`
|
||||||
|
|
||||||
|
.. data:: IS_ASCII
|
||||||
|
|
||||||
|
:py:func:`spacy.orth.is_ascii`
|
||||||
|
|
||||||
|
.. data:: IS_TITLE
|
||||||
|
|
||||||
|
:py:func:`spacy.orth.is_title`
|
||||||
|
|
||||||
|
.. data:: IS_LOWER
|
||||||
|
|
||||||
|
:py:func:`spacy.orth.is_lower`
|
||||||
|
|
||||||
|
.. data:: IS_UPPER
|
||||||
|
|
||||||
|
:py:func:`spacy.orth.is_upper`
|
||||||
|
|
||||||
|
Distributional Orthographic Features
|
||||||
|
------------------------------------
|
||||||
|
|
||||||
|
These features describe how often the lower-cased form of the word appears
|
||||||
|
in various case-styles in a large sample of English text. See :py:func:`spacy.orth.oft_case`.
|
||||||
|
|
||||||
|
.. data:: OFT_UPPER
|
||||||
|
.. data:: OFT_LOWER
|
||||||
|
.. data:: OFT_TITLE
|
||||||
|
|
||||||
|
|
||||||
|
Tag Dictionary Features
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
These features describe whether the word commonly occurs with a given
|
||||||
|
part-of-speech, in a large text corpus, using a part-of-speech tagger designed
|
||||||
|
to reduce the tag-dictionary bias of its training corpus. See
|
||||||
|
:py:func:`spacy.orth.can_tag`.
|
||||||
|
|
||||||
|
.. data:: CAN_PUNCT
|
||||||
|
.. data:: CAN_CONJ
|
||||||
|
.. data:: CAN_NUM
|
||||||
|
.. data:: CAN_DET
|
||||||
|
.. data:: CAN_ADP
|
||||||
|
.. data:: CAN_ADJ
|
||||||
|
.. data:: CAN_ADV
|
||||||
|
.. data:: CAN_VERB
|
||||||
|
.. data:: CAN_NOUN
|
||||||
|
.. data:: CAN_PDT
|
||||||
|
.. data:: CAN_POS
|
||||||
|
.. data:: CAN_PRON
|
||||||
|
.. data:: CAN_PRT
|
8
docs/source/api/tokenizers/index.rst
Normal file
8
docs/source/api/tokenizers/index.rst
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
Tokenizers
|
||||||
|
===================================
|
||||||
|
|
||||||
|
Each module listed here implements a different tokenization scheme, usually
|
||||||
|
intended for a specific language.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
en.rst
|
270
docs/source/conf.py
Normal file
270
docs/source/conf.py
Normal file
|
@ -0,0 +1,270 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#
|
||||||
|
# spaCy documentation build configuration file, created by
|
||||||
|
# sphinx-quickstart on Thu Sep 25 17:47:15 2014.
|
||||||
|
#
|
||||||
|
# This file is execfile()d with the current directory set to its
|
||||||
|
# containing dir.
|
||||||
|
#
|
||||||
|
# Note that not all possible configuration values are present in this
|
||||||
|
# autogenerated file.
|
||||||
|
#
|
||||||
|
# All configuration values have a default; values that are commented out
|
||||||
|
# serve to show the default.
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
import sphinx_rtd_theme
|
||||||
|
|
||||||
|
# If extensions (or modules to document with autodoc) are in another directory,
|
||||||
|
# add these directories to sys.path here. If the directory is relative to the
|
||||||
|
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||||
|
#sys.path.insert(0, os.path.abspath('.'))
|
||||||
|
|
||||||
|
# -- General configuration ------------------------------------------------
|
||||||
|
|
||||||
|
# If your documentation needs a minimal Sphinx version, state it here.
|
||||||
|
#needs_sphinx = '1.0'
|
||||||
|
|
||||||
|
# Add any Sphinx extension module names here, as strings. They can be
|
||||||
|
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||||
|
# ones.
|
||||||
|
extensions = [
|
||||||
|
'sphinx.ext.autodoc',
|
||||||
|
'sphinx.ext.doctest',
|
||||||
|
'sphinx.ext.intersphinx',
|
||||||
|
'sphinx.ext.viewcode',
|
||||||
|
'sphinxcontrib.napoleon'
|
||||||
|
]
|
||||||
|
|
||||||
|
# Add any paths that contain templates here, relative to this directory.
|
||||||
|
templates_path = ['_templates']
|
||||||
|
|
||||||
|
# The suffix of source filenames.
|
||||||
|
source_suffix = '.rst'
|
||||||
|
|
||||||
|
# The encoding of source files.
|
||||||
|
#source_encoding = 'utf-8-sig'
|
||||||
|
|
||||||
|
# The master toctree document.
|
||||||
|
master_doc = 'index'
|
||||||
|
|
||||||
|
# General information about the project.
|
||||||
|
project = u'spaCy'
|
||||||
|
copyright = u'2014, Matthew Honnibal'
|
||||||
|
|
||||||
|
# The version info for the project you're documenting, acts as replacement for
|
||||||
|
# |version| and |release|, also used in various other places throughout the
|
||||||
|
# built documents.
|
||||||
|
#
|
||||||
|
# The short X.Y version.
|
||||||
|
version = '1.0'
|
||||||
|
# The full version, including alpha/beta/rc tags.
|
||||||
|
release = '1.0'
|
||||||
|
|
||||||
|
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||||
|
# for a list of supported languages.
|
||||||
|
#language = None
|
||||||
|
|
||||||
|
# There are two options for replacing |today|: either, you set today to some
|
||||||
|
# non-false value, then it is used:
|
||||||
|
#today = ''
|
||||||
|
# Else, today_fmt is used as the format for a strftime call.
|
||||||
|
#today_fmt = '%B %d, %Y'
|
||||||
|
|
||||||
|
# List of patterns, relative to source directory, that match files and
|
||||||
|
# directories to ignore when looking for source files.
|
||||||
|
exclude_patterns = []
|
||||||
|
|
||||||
|
# The reST default role (used for this markup: `text`) to use for all
|
||||||
|
# documents.
|
||||||
|
#default_role = None
|
||||||
|
|
||||||
|
# If true, '()' will be appended to :func: etc. cross-reference text.
|
||||||
|
#add_function_parentheses = True
|
||||||
|
|
||||||
|
# If true, the current module name will be prepended to all description
|
||||||
|
# unit titles (such as .. function::).
|
||||||
|
#add_module_names = True
|
||||||
|
|
||||||
|
# If true, sectionauthor and moduleauthor directives will be shown in the
|
||||||
|
# output. They are ignored by default.
|
||||||
|
#show_authors = False
|
||||||
|
|
||||||
|
# The name of the Pygments (syntax highlighting) style to use.
|
||||||
|
pygments_style = 'sphinx'
|
||||||
|
|
||||||
|
# A list of ignored prefixes for module index sorting.
|
||||||
|
#modindex_common_prefix = []
|
||||||
|
|
||||||
|
# If true, keep warnings as "system message" paragraphs in the built documents.
|
||||||
|
#keep_warnings = False
|
||||||
|
|
||||||
|
|
||||||
|
# -- Options for HTML output ----------------------------------------------
|
||||||
|
|
||||||
|
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||||
|
# a list of builtin themes.
|
||||||
|
html_theme = 'sphinx_rtd_theme'
|
||||||
|
|
||||||
|
# Theme options are theme-specific and customize the look and feel of a theme
|
||||||
|
# further. For a list of options available for each theme, see the
|
||||||
|
# documentation.
|
||||||
|
#html_theme_options = {}
|
||||||
|
|
||||||
|
# Add any paths that contain custom themes here, relative to this directory.
|
||||||
|
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
|
||||||
|
|
||||||
|
# The name for this set of Sphinx documents. If None, it defaults to
|
||||||
|
# "<project> v<release> documentation".
|
||||||
|
#html_title = None
|
||||||
|
|
||||||
|
# A shorter title for the navigation bar. Default is the same as html_title.
|
||||||
|
#html_short_title = None
|
||||||
|
|
||||||
|
# The name of an image file (relative to this directory) to place at the top
|
||||||
|
# of the sidebar.
|
||||||
|
#html_logo = None
|
||||||
|
|
||||||
|
# The name of an image file (within the static path) to use as favicon of the
|
||||||
|
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
||||||
|
# pixels large.
|
||||||
|
#html_favicon = None
|
||||||
|
|
||||||
|
# Add any paths that contain custom static files (such as style sheets) here,
|
||||||
|
# relative to this directory. They are copied after the builtin static files,
|
||||||
|
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||||
|
html_static_path = ['_static']
|
||||||
|
|
||||||
|
# Add any extra paths that contain custom files (such as robots.txt or
|
||||||
|
# .htaccess) here, relative to this directory. These files are copied
|
||||||
|
# directly to the root of the documentation.
|
||||||
|
#html_extra_path = []
|
||||||
|
|
||||||
|
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
||||||
|
# using the given strftime format.
|
||||||
|
#html_last_updated_fmt = '%b %d, %Y'
|
||||||
|
|
||||||
|
# If true, SmartyPants will be used to convert quotes and dashes to
|
||||||
|
# typographically correct entities.
|
||||||
|
#html_use_smartypants = True
|
||||||
|
|
||||||
|
# Custom sidebar templates, maps document names to template names.
|
||||||
|
#html_sidebars = {}
|
||||||
|
|
||||||
|
# Additional templates that should be rendered to pages, maps page names to
|
||||||
|
# template names.
|
||||||
|
#html_additional_pages = {}
|
||||||
|
|
||||||
|
# If false, no module index is generated.
|
||||||
|
#html_domain_indices = True
|
||||||
|
|
||||||
|
# If false, no index is generated.
|
||||||
|
#html_use_index = True
|
||||||
|
|
||||||
|
# If true, the index is split into individual pages for each letter.
|
||||||
|
#html_split_index = False
|
||||||
|
|
||||||
|
# If true, links to the reST sources are added to the pages.
|
||||||
|
#html_show_sourcelink = True
|
||||||
|
|
||||||
|
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
|
||||||
|
#html_show_sphinx = True
|
||||||
|
|
||||||
|
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
|
||||||
|
#html_show_copyright = True
|
||||||
|
|
||||||
|
# If true, an OpenSearch description file will be output, and all pages will
|
||||||
|
# contain a <link> tag referring to it. The value of this option must be the
|
||||||
|
# base URL from which the finished HTML is served.
|
||||||
|
#html_use_opensearch = ''
|
||||||
|
|
||||||
|
# This is the file name suffix for HTML files (e.g. ".xhtml").
|
||||||
|
#html_file_suffix = None
|
||||||
|
|
||||||
|
# Output file base name for HTML help builder.
|
||||||
|
htmlhelp_basename = 'spaCydoc'
|
||||||
|
|
||||||
|
|
||||||
|
# -- Options for LaTeX output ---------------------------------------------
|
||||||
|
|
||||||
|
latex_elements = {
|
||||||
|
# The paper size ('letterpaper' or 'a4paper').
|
||||||
|
#'papersize': 'letterpaper',
|
||||||
|
|
||||||
|
# The font size ('10pt', '11pt' or '12pt').
|
||||||
|
#'pointsize': '10pt',
|
||||||
|
|
||||||
|
# Additional stuff for the LaTeX preamble.
|
||||||
|
#'preamble': '',
|
||||||
|
}
|
||||||
|
|
||||||
|
# Grouping the document tree into LaTeX files. List of tuples
|
||||||
|
# (source start file, target name, title,
|
||||||
|
# author, documentclass [howto, manual, or own class]).
|
||||||
|
latex_documents = [
|
||||||
|
('index', 'spaCy.tex', u'spaCy Documentation',
|
||||||
|
u'Matthew Honnibal', 'manual'),
|
||||||
|
]
|
||||||
|
|
||||||
|
# The name of an image file (relative to this directory) to place at the top of
|
||||||
|
# the title page.
|
||||||
|
#latex_logo = None
|
||||||
|
|
||||||
|
# For "manual" documents, if this is true, then toplevel headings are parts,
|
||||||
|
# not chapters.
|
||||||
|
#latex_use_parts = False
|
||||||
|
|
||||||
|
# If true, show page references after internal links.
|
||||||
|
#latex_show_pagerefs = False
|
||||||
|
|
||||||
|
# If true, show URL addresses after external links.
|
||||||
|
#latex_show_urls = False
|
||||||
|
|
||||||
|
# Documents to append as an appendix to all manuals.
|
||||||
|
#latex_appendices = []
|
||||||
|
|
||||||
|
# If false, no module index is generated.
|
||||||
|
#latex_domain_indices = True
|
||||||
|
|
||||||
|
|
||||||
|
# -- Options for manual page output ---------------------------------------
|
||||||
|
|
||||||
|
# One entry per manual page. List of tuples
|
||||||
|
# (source start file, name, description, authors, manual section).
|
||||||
|
man_pages = [
|
||||||
|
('index', 'spacy', u'spaCy Documentation',
|
||||||
|
[u'Matthew Honnibal'], 1)
|
||||||
|
]
|
||||||
|
|
||||||
|
# If true, show URL addresses after external links.
|
||||||
|
#man_show_urls = False
|
||||||
|
|
||||||
|
|
||||||
|
# -- Options for Texinfo output -------------------------------------------
|
||||||
|
|
||||||
|
# Grouping the document tree into Texinfo files. List of tuples
|
||||||
|
# (source start file, target name, title, author,
|
||||||
|
# dir menu entry, description, category)
|
||||||
|
texinfo_documents = [
|
||||||
|
('index', 'spaCy', u'spaCy Documentation',
|
||||||
|
u'Matthew Honnibal', 'spaCy', 'One line description of project.',
|
||||||
|
'Miscellaneous'),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Documents to append as an appendix to all manuals.
|
||||||
|
#texinfo_appendices = []
|
||||||
|
|
||||||
|
# If false, no module index is generated.
|
||||||
|
#texinfo_domain_indices = True
|
||||||
|
|
||||||
|
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
||||||
|
#texinfo_show_urls = 'footnote'
|
||||||
|
|
||||||
|
# If true, do not generate a @detailmenu in the "Top" node's menu.
|
||||||
|
#texinfo_no_detailmenu = False
|
||||||
|
|
||||||
|
|
||||||
|
# Example configuration for intersphinx: refer to the Python standard library.
|
||||||
|
# Named-inventory form: {name: (base URL, inventory location)}.
# Points at the same Python documentation site as before, over HTTPS.
intersphinx_mapping = {'python': ('https://docs.python.org/', None)}
|
22
docs/source/guide/install.rst
Normal file
22
docs/source/guide/install.rst
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
Installation
|
||||||
|
============
|
||||||
|
|
||||||
|
pip install spacy
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
The easiest way to install is from PyPI via pip::
|
||||||
|
|
||||||
|
pip install spacy
|
||||||
|
|
||||||
|
git clone http://github.com/honnibal/spaCy.git
|
||||||
|
----------------------------------------------
|
||||||
|
|
||||||
|
Installation from source via `GitHub <https://github.com/honnibal/spaCy>`_, using virtualenv::
|
||||||
|
|
||||||
|
$ git clone http://github.com/honnibal/spaCy.git
|
||||||
|
$ cd spaCy
|
||||||
|
$ virtualenv .env
|
||||||
|
$ source .env/bin/activate
|
||||||
|
$ pip install -r requirements.txt
|
||||||
|
$ fab make
|
||||||
|
$ fab test
|
71
docs/source/guide/overview.rst
Normal file
71
docs/source/guide/overview.rst
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
Overview
|
||||||
|
========
|
||||||
|
|
||||||
|
What and Why
|
||||||
|
------------
|
||||||
|
|
||||||
|
spaCy is a lightning-fast, full-cream NLP tokenizer and lexicon.
|
||||||
|
|
||||||
|
Most tokenizers give you a sequence of strings. That's barbaric.
|
||||||
|
Giving you strings invites you to compute on every *token*, when what
|
||||||
|
you should be doing is computing on every *type*. Remember
|
||||||
|
`Zipf's law <http://en.wikipedia.org/wiki/Zipf's_law>`_: you'll
|
||||||
|
see exponentially fewer types than tokens.
|
||||||
|
|
||||||
|
Instead of strings, spaCy gives you references to Lexeme objects, from which you
|
||||||
|
can access an excellent set of pre-computed orthographic and distributional features:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
>>> from spacy import en
|
||||||
|
>>> apples, are, nt, oranges, dots = en.EN.tokenize(u"Apples aren't oranges...")
|
||||||
|
>>> are.prob >= oranges.prob
|
||||||
|
True
|
||||||
|
>>> apples.check_flag(en.IS_TITLE)
|
||||||
|
True
|
||||||
|
>>> apples.check_flag(en.OFT_TITLE)
|
||||||
|
False
|
||||||
|
>>> are.check_flag(en.CAN_NOUN)
|
||||||
|
False
|
||||||
|
|
||||||
|
spaCy makes it easy to write very efficient NLP applications, because your feature
|
||||||
|
functions have to do almost no work: almost every lexical property you'll want
|
||||||
|
is pre-computed for you. See the tutorial for an example POS tagger.
|
||||||
|
|
||||||
|
Benchmark
|
||||||
|
---------
|
||||||
|
|
||||||
|
The tokenizer itself is also very efficient:
|
||||||
|
|
||||||
|
+--------+-------+--------------+--------------+
|
||||||
|
| System | Time | Words/second | Speed Factor |
|
||||||
|
+--------+-------+--------------+--------------+
|
||||||
|
| NLTK | 6m4s | 89,000 | 1.00 |
|
||||||
|
+--------+-------+--------------+--------------+
|
||||||
|
| spaCy | 9.5s | 3,093,000 | 38.30 |
|
||||||
|
+--------+-------+--------------+--------------+
|
||||||
|
|
||||||
|
The comparison refers to 30 million words from the English Gigaword, on
|
||||||
|
a MacBook Air. For context, calling string.split() on the data completes in
|
||||||
|
about 5s.
|
||||||
|
|
||||||
|
Pros and Cons
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Pros:
|
||||||
|
|
||||||
|
- All tokens come with indices into the original string
|
||||||
|
- Full unicode support
|
||||||
|
- Extensible to other languages
|
||||||
|
- Batch operations computed efficiently in Cython
|
||||||
|
- Cython API
|
||||||
|
- numpy interoperability
|
||||||
|
|
||||||
|
Cons:
|
||||||
|
|
||||||
|
- It's new (released September 2014)
|
||||||
|
- Security concerns, from memory management
|
||||||
|
- Higher memory usage (up to 1 GB)
|
||||||
|
- More conceptually complicated
|
||||||
|
- Tokenization rules expressed in code, not as data
|
||||||
|
|
34
docs/source/index.rst
Normal file
34
docs/source/index.rst
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
.. spaCy documentation master file, created by
|
||||||
|
sphinx-quickstart on Tue Aug 19 16:27:38 2014.
|
||||||
|
You can adapt this file completely to your liking, but it should at least
|
||||||
|
contain the root `toctree` directive.
|
||||||
|
|
||||||
|
spaCy NLP Tokenizer and Lexicon
|
||||||
|
================================
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 3
|
||||||
|
|
||||||
|
guide/overview.rst
|
||||||
|
guide/install.rst
|
||||||
|
|
||||||
|
api/index.rst
|
||||||
|
|
||||||
|
modules/index.rst
|
||||||
|
|
||||||
|
|
||||||
|
Source (GitHub)
|
||||||
|
----------------
|
||||||
|
|
||||||
|
http://github.com/honnibal/spaCy
|
||||||
|
|
||||||
|
License
|
||||||
|
-------
|
||||||
|
|
||||||
|
Copyright Matthew Honnibal
|
||||||
|
|
||||||
|
Non-commercial use: $0
|
||||||
|
Commercial trial use: $0
|
||||||
|
Full commercial license: $500
|
||||||
|
|
||||||
|
honnibal@gmail.com
|
Loading…
Reference in New Issue
Block a user