diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index 4a3d8bd6c..000000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,177 +0,0 @@
-# Makefile for Sphinx documentation
-#
[...remaining 175 lines of the sphinx-quickstart generated Makefile deleted...]

diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index 0e150a0a2..000000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,269 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# spacy documentation build configuration file, created by
-# sphinx-quickstart on Thu Jul 3 21:54:08 2014.
[...remaining 265 lines of the sphinx-quickstart generated conf.py deleted...]

diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index a6601b1a0..000000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-.. spacy documentation master file, created by
-   sphinx-quickstart on Thu Jul 3 21:54:08 2014.
[...remaining 24 lines of the sphinx-quickstart generated index.rst deleted...]
diff --git a/spacy/__init__.py b/spacy/__init__.py
index 7ab2b411d..d2b763c42 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,8 +1,17 @@
 from .lexeme import lex_of
 from .lexeme import sic_of
+from .tokens import Tokens
 
-__all__ = [lex_of, sic_of]
+# Don't know how to get the enum Python visible :(
+
+SIC = 0
+LEX = 1
+NORM = 2
+SHAPE = 3
+LAST3 = 4
+
+__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]
 
 """
diff --git a/spacy/en.pxd b/spacy/en.pxd
index fa0410db8..ee58118a9 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -1,8 +1,8 @@
 from libcpp.vector cimport vector
 
 from spacy.spacy cimport StringHash
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme
+from spacy.spacy cimport Lexeme_addr
 
 from spacy.spacy cimport Language
 from spacy.tokens cimport Tokens
diff --git a/spacy/en.pyx b/spacy/en.pyx
index 0e30771b8..1775d097c 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -9,7 +9,6 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 
-from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr
 
 from . import util
@@ -37,12 +36,15 @@ cdef class English(spacy.Language):
 cdef bint is_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
+        # ...Unless we're at 0
+        return i == 0
+    if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
         return False
     # Don't count commas as punct if the next char is a number
     if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
         return False
-    # Don't count periods as punct if the next char is a number
-    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
+    # Don't count periods as punct if the next char is not whitespace
+    if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
         return False
     return not word[i].isalnum()
diff --git a/spacy/en_ptb.pxd b/spacy/en_ptb.pxd
index c1e3a2ec9..eaa0f8471 100644
--- a/spacy/en_ptb.pxd
+++ b/spacy/en_ptb.pxd
@@ -2,8 +2,8 @@ from libcpp.vector cimport vector
 
 from spacy.spacy cimport StringHash
 from spacy.spacy cimport Language
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme
+from spacy.spacy cimport Lexeme_addr
 
 from spacy.tokens cimport Tokens
diff --git a/spacy/en_ptb.pyx b/spacy/en_ptb.pyx
index 0dd8a41cf..b407cd4ed 100644
--- a/spacy/en_ptb.pyx
+++ b/spacy/en_ptb.pyx
@@ -4,11 +4,11 @@ boldly assume no collisions.
 '''
 from __future__ import unicode_literals
+
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 
-from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr
 from spacy.spacy cimport Language
 from . import util
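(Illustration, not part of the patch: the revised punctuation rules from `spacy/en.pyx` above, rewritten as plain Python so the intended behaviour is easy to check. The standalone function and the example strings are assumptions for demonstration only.)

```python
def is_punct(word, i, length):
    # Apostrophe followed by a letter only counts as punctuation at position 0
    # (so "'The" splits, "can't" does not).
    if word[i] == "'" and i < (length - 1) and word[i + 1].isalpha():
        return i == 0
    # A '-' immediately followed by another '-' is not treated as punctuation here.
    if word[i] == "-" and i < (length - 1) and word[i + 1] == "-":
        return False
    # Commas before digits, and periods before any non-whitespace, stay attached.
    if word[i] == "," and i < (length - 1) and word[i + 1].isdigit():
        return False
    if word[i] == "." and i < (length - 1) and not word[i + 1].isspace():
        return False
    return not word[i].isalnum()

assert is_punct("'The", 0, 4)        # leading apostrophe is split off
assert not is_punct("can't", 3, 5)   # internal apostrophe is kept
assert not is_punct("3.14", 1, 4)    # period inside a number is kept
```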
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 46912de93..50417e65a 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -4,39 +4,53 @@ from libc.stdint cimport uint64_t
 ctypedef int ClusterID
 ctypedef uint64_t StringHash
 ctypedef size_t Lexeme_addr
+ctypedef char Bits8
+ctypedef uint64_t Bits64
+
 from spacy.spacy cimport Language
 
+
+cdef struct Orthography:
+    StringHash last3
+    StringHash shape
+    StringHash norm
+
+    Py_UNICODE first
+    Bits8 flags
+
+
+cdef struct Distribution:
+    double prob
+    ClusterID cluster
+    Bits64 tagdict
+    Bits8 flags
+
+
 cdef struct Lexeme:
     StringHash sic       # Hash of the original string
     StringHash lex       # Hash of the word, with punctuation and clitics split off
-    StringHash normed    # Hash of the normalized version of lex
-    StringHash last3     # Last 3 characters of the token
-    Py_UNICODE first     # First character of the token
-    double prob          # What is the log probability of the lex value?
-    ClusterID cluster    # Brown cluster of the token
-
-    bint oft_upper       # Is the lowered version of the lex value often in all caps?
-    bint oft_title       # Is the lowered version of the lex value often title-cased?
+    Distribution* dist   # Distribution info, lazy loaded
+    Orthography* orth    # Extra orthographic views
 
     Lexeme* tail         # Lexemes are linked lists, to deal with sub-tokens
 
 
-cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
-
-cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
-                         int split, size_t length)
-
-# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
-# has a conditional to pick out the correct item. This allows safe iteration
-# over the Lexeme, via:
-# for field in range(LexAttr.n): get_attr(Lexeme*, field)
-cdef enum HashFields:
-    sic
-    lex
-    normed
-    cluster
-    n
+cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
 
-#cdef uint64_t get_attr(Lexeme* word, HashFields attr)
+cdef enum StringAttr:
+    SIC
+    LEX
+    NORM
+    SHAPE
+    LAST3
+
+
+cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
+
+cpdef StringHash sic_of(size_t lex_id) except 0
+cpdef StringHash lex_of(size_t lex_id) except 0
+cpdef StringHash norm_of(size_t lex_id) except 0
+cpdef StringHash shape_of(size_t lex_id) except 0
+cpdef StringHash last3_of(size_t lex_id) except 0
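(Illustration, not part of the patch: a rough Python picture of the new `Lexeme` layout declared above — the fixed orthographic and distributional fields move into two lazily-created side records. Field names mirror the `.pxd`; the dataclass form is purely for exposition.)

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class Orthography:          # string-view hashes, built once per surface form
    last3: int = 0
    shape: int = 0
    norm: int = 0
    first: str = ""
    flags: int = 0

@dataclass
class Distribution:         # corpus statistics, loaded lazily
    prob: float = 0.0
    cluster: int = 0
    tagdict: int = 0
    flags: int = 0

@dataclass
class Lexeme:
    sic: int = 0                             # hash of the original string
    lex: int = 0                             # hash with punctuation/clitics split off
    dist: Optional[Distribution] = None      # lazy: may be absent
    orth: Optional[Orthography] = None       # lazy: may be absent
    tail: Optional["Lexeme"] = None          # linked list of sub-tokens
```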
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 96482e930..e769a6bee 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -11,49 +11,29 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 
+from spacy.spacy cimport StringHash
 
-cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
-                         int split, size_t length):
-    assert split <= length
-    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+# Reiterate the enum, for python
+#SIC = StringAttr.sic
+#LEX = StringAttr.lex
+#NORM = StringAttr.norm
+#SHAPE = StringAttr.shape
+#LAST3 = StringAttr.last3
 
-    word.first = <Py_UNICODE>(string[0] if string else 0)
-    word.sic = hashed
-
-    cdef unicode tail_string
-    cdef unicode lex
-    if split != 0 and split < length:
-        lex = substr(string, 0, split, length)
-        tail_string = substr(string, split, length, length)
+
+cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
+    if attr == SIC:
+        return sic_of(lex_id)
+    elif attr == LEX:
+        return lex_of(lex_id)
+    elif attr == NORM:
+        return norm_of(lex_id)
+    elif attr == SHAPE:
+        return shape_of(lex_id)
+    elif attr == LAST3:
+        return last3_of(lex_id)
     else:
-        lex = string
-        tail_string = ''
-    assert lex
-    #cdef unicode normed = normalize_word_string(lex)
-    cdef unicode normed = '?'
-    cdef unicode last3 = substr(string, length - 3, length, length)
-
-    assert normed
-    assert len(normed)
-
-    word.lex = lang.hash_string(lex, len(lex))
-    word.normed = lang.hash_string(normed, len(normed))
-    word.last3 = lang.hash_string(last3, len(last3))
-
-    lang.bacov[word.lex] = lex
-    lang.bacov[word.normed] = normed
-    lang.bacov[word.last3] = last3
-
-    # These are loaded later
-    word.prob = 0
-    word.cluster = 0
-    word.oft_upper = False
-    word.oft_title = False
-
-    # Now recurse, and deal with the tail
-    if tail_string:
-        word.tail = lang.lookup(-1, tail_string, len(tail_string))
-    return word
+        raise StandardError
 
 
 cpdef StringHash sic_of(size_t lex_id) except 0:
@@ -82,6 +62,35 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
     return (<Lexeme*>lex_id).lex
 
 
+cpdef StringHash norm_of(size_t lex_id) except 0:
+    '''Access the `lex' field of the Lexeme pointed to by lex_id.
+
+    The lex field is the hash of the string you would expect to get back from
+    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
+    delimited tokens split off. The other fields refer to properties of the
+    string that the lex field stores a hash of, except sic and tail.
+
+    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
+    [u'Hi', u'!', u'world']
+    '''
+    return (<Lexeme*>lex_id).orth.norm
+
+
+cpdef StringHash shape_of(size_t lex_id) except 0:
+    return (<Lexeme*>lex_id).orth.shape
+
+
+cpdef StringHash last3_of(size_t lex_id) except 0:
+    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
+    the hash of the last three characters of the word:
+
+    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
+    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
+    [u'llo', u'!']
+    '''
+    return (<Lexeme*>lex_id).orth.last3
+
+
 cpdef ClusterID cluster_of(size_t lex_id):
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which gives
     an integer representation of the cluster ID of the word,
@@ -98,7 +107,7 @@ cpdef ClusterID cluster_of(size_t lex_id):
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     '''
-    return (<Lexeme*>lex_id).cluster
+    return (<Lexeme*>lex_id).dist.cluster
 
 
 cpdef Py_UNICODE first_of(size_t lex_id):
@@ -109,7 +118,7 @@
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    return (<Lexeme*>lex_id).first
+    return (<Lexeme*>lex_id).orth.first
 
 
 cpdef double prob_of(size_t lex_id):
@@ -122,18 +131,7 @@
     >>> prob_of(lookup(u'world'))
     -20.10340371976182
     '''
-    pass
-
-
-cpdef StringHash last3_of(size_t lex_id):
-    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
-    the hash of the last three characters of the word:
-
-    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
-    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
-    [u'llo', u'!']
-    '''
-    return (<Lexeme*>lex_id).last3
+    return (<Lexeme*>lex_id).dist.prob
 
 
 cpdef bint is_oft_upper(size_t lex_id):
@@ -148,7 +146,12 @@
     >>> is_oft_upper(lookup(u'aBc'))  # This must get the same answer
     True
     '''
-    return (<Lexeme*>lex_id).oft_upper
+    return False
+    #cdef Lexeme* w = <Lexeme*>lex_id
+    #return w.orth.last3 if w.orth != NULL else 0
+
+
+    #return (<Lexeme*>lex_id).oft_upper
 
 
 cpdef bint is_oft_title(size_t lex_id):
@@ -163,4 +166,5 @@
     >>> is_oft_title(lookup(u'MARCUS'))  # This must get the same value
     True
     '''
-    return (<Lexeme*>lex_id).oft_title
+    return False
+    #return (<Lexeme*>lex_id).oft_title
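(Illustration, not part of the patch: how the new accessors are meant to compose, as a hypothetical session. It assumes `lookup`/`unhash` from `spacy.en` and the module-level `LEX`/`LAST3` constants behave as the docstrings and `__init__.py` above suggest; the expected output is an inference, not a tested result.)

```python
from spacy.en import lookup, unhash
from spacy.lexeme import attr_of, lex_of, last3_of
from spacy import LEX, LAST3

word = lookup(u'pineapple')          # returns a Lexeme address (size_t)

# attr_of dispatches on the StringAttr constant to the matching accessor,
# so these two calls should return the same StringHash.
assert attr_of(word, LEX) == lex_of(word)
assert attr_of(word, LAST3) == last3_of(word)

# unhash maps a StringHash back to the string it was computed from.
print(unhash(last3_of(word)))        # expected: u'ple'
```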
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index 5328d1411..6189f37a3 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -7,16 +7,27 @@ from ext.sparsehash cimport dense_hash_map
 
 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
-ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
-ctypedef int (*Splitter)(unicode word, size_t length)
+ctypedef dense_hash_map[StringHash, size_t] Vocab
+
+from spacy.lexeme cimport Lexeme
+
+from spacy.tokens cimport Tokens
+
+# Put these above import to avoid circular import problem
+ctypedef char Bits8
+ctypedef uint64_t Bits64
+ctypedef int ClusterID
+
 from spacy.lexeme cimport Lexeme
-from spacy.tokens cimport Tokens
+from spacy.lexeme cimport Distribution
+from spacy.lexeme cimport Orthography
+
 
 cdef class Language:
     cdef object name
     cdef Vocab* vocab
+    cdef Vocab* distri
+    cdef Vocab* ortho
     cdef dict bacov
 
     cdef int find_split(self, unicode word, size_t length)
@@ -26,3 +37,8 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
 
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
+    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
+                             int split, size_t length)
+    cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
+
+
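(Illustration, not part of the patch: the caching pattern implied by the three `Vocab*` tables added to `Language` above, sketched with plain dicts standing in for `dense_hash_map[StringHash, size_t]`. The class and helper names here are hypothetical stand-ins, not the real API.)

```python
class LanguageSketch:
    def __init__(self, name):
        self.name = name
        self.bacov = {}   # StringHash -> original string ("backwards vocab")
        self.vocab = {}   # chunk hash -> Lexeme record
        self.ortho = {}   # lex hash   -> Orthography record, shared per form
        self.distri = {}  # lex hash   -> Distribution record, lazy-loaded

    def hash_string(self, s):
        return hash(s)    # stand-in for MurmurHash64A

    def get_orth(self, lex_string):
        # init_lexeme looks the Orthography up by the lex hash and only
        # builds it the first time a surface form is seen.
        key = self.hash_string(lex_string)
        if key not in self.ortho:
            self.ortho[key] = {"first": lex_string[0], "last3": lex_string[-3:]}
            self.bacov[key] = lex_string
        return self.ortho[key]
```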
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 2fab0de06..25c7c823b 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -6,22 +6,65 @@ from libc.stdlib cimport calloc, free
 from ext.murmurhash cimport MurmurHash64A
 from ext.murmurhash cimport MurmurHash64B
 
-from spacy.lexeme cimport init_lexeme
+from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
-from spacy.string_tools cimport is_whitespace
+from spacy.string_tools cimport substr
+
 from . import util
 from os import path
 
 cimport cython
 
+
+def get_normalized(unicode lex, size_t length):
+    if lex.isalpha() and lex.islower():
+        return lex
+    else:
+        return get_word_shape(lex, length)
+
+
+def get_word_shape(lex, length):
+    shape = ""
+    last = ""
+    shape_char = ""
+    seq = 0
+    for c in lex:
+        if c.isalpha():
+            if c.isupper():
+                shape_char = "X"
+            else:
+                shape_char = "x"
+        elif c.isdigit():
+            shape_char = "d"
+        else:
+            shape_char = c
+        if shape_char == last:
+            seq += 1
+        else:
+            seq = 0
+        last = shape_char
+        if seq < 3:
+            shape += shape_char
+    assert shape
+    return shape
+
+
+def set_orth_flags(lex, length):
+    return 0
+
+
 cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
         self.vocab = new Vocab()
+        self.ortho = new Vocab()
+        self.distri = new Vocab()
         self.vocab[0].set_empty_key(0)
+        self.distri[0].set_empty_key(0)
+        self.ortho[0].set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
 
     def load_tokenization(self, token_rules=None):
@@ -80,7 +123,7 @@ cdef class Language:
         return word_ptr
 
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
-        word = init_lexeme(self, string, hashed, split, length)
+        word = self.init_lexeme(string, hashed, split, length)
         self.vocab[0][hashed] = <size_t>word
         self.bacov[hashed] = string
         return word
@@ -121,6 +164,55 @@ cdef class Language:
 
     cdef int find_split(self, unicode word, size_t length):
         return -1
 
+    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
+                             int split, size_t length):
+        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+
+        word.sic = hashed
+
+        cdef unicode tail_string
+        cdef unicode lex
+        if split != 0 and split < length:
+            lex = substr(string, 0, split, length)
+            tail_string = substr(string, split, length, length)
+        else:
+            lex = string
+            tail_string = ''
+
+        word.lex = self.hash_string(lex, len(lex))
+        self.bacov[word.lex] = lex
+        word.orth = <Orthography*>self.ortho[0][word.lex]
+        if word.orth == NULL:
+            word.orth = self.init_orth(word.lex, lex)
+        word.dist = <Distribution*>self.distri[0][word.lex]
+
+        # Now recurse, and deal with the tail
+        if tail_string:
+            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
+        return word
+
+    cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
+        cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
+        orth.first = lex[0]
+
+        cdef int length = len(lex)
+
+        orth.flags = set_orth_flags(lex, length)
+
+        cdef unicode last3 = substr(lex, length - 3, length, length)
+        cdef unicode norm = get_normalized(lex, length)
+        cdef unicode shape = get_word_shape(lex, length)
+
+        orth.last3 = self.hash_string(last3, len(last3))
+        orth.shape = self.hash_string(shape, len(shape))
+        orth.norm = self.hash_string(norm, len(norm))
+
+        self.bacov[orth.last3] = last3
+        self.bacov[orth.shape] = shape
+        self.bacov[orth.norm] = norm
+
+        self.ortho[0][hashed] = <size_t>orth
+        return orth
 
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
@@ -137,7 +229,7 @@
 
 cpdef vector[size_t] expand_chunk(size_t addr) except *:
     cdef vector[size_t] tokens = vector[size_t]()
     word = <Lexeme*>addr
-    while word is not NULL:
+    while word != NULL:
         tokens.push_back(<size_t>word)
         word = word.tail
     return tokens
diff --git a/spacy/string_tools.pyx b/spacy/string_tools.pyx
index 5397fd647..2f199766f 100644
--- a/spacy/string_tools.pyx
+++ b/spacy/string_tools.pyx
@@ -1,5 +1,6 @@
 # cython: profile=True
 
+
 cpdef unicode substr(unicode string, int start, int end, size_t length):
     if end >= length:
         end = -1
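(Illustration, not part of the patch: the new `get_word_shape`/`get_normalized` helpers from `spacy/spacy.pyx` above, transcribed as plain Python with a few example inputs so the shape scheme is visible. The example strings are illustrative only.)

```python
def get_word_shape(lex, length):
    # Same algorithm as the Cython version above: letters map to x/X, digits
    # to d, other characters are kept, and identical runs are capped at three.
    shape = ""
    last = ""
    shape_char = ""
    seq = 0
    for c in lex:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
        last = shape_char
        if seq < 3:
            shape += shape_char
    assert shape
    return shape

def get_normalized(lex, length):
    # Lower-case alphabetic words normalise to themselves; everything else
    # falls back to the word shape.
    if lex.isalpha() and lex.islower():
        return lex
    return get_word_shape(lex, length)

print(get_word_shape(u"C3P0", 4))      # XdXd
print(get_word_shape(u"Hello!!", 7))   # Xxxx!!
print(get_normalized(u"world", 5))     # world
```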
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 84411982a..5359761c0 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -1,12 +1,9 @@
 from libcpp.vector cimport vector
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme_addr
 from cython.operator cimport dereference as deref
 
 from spacy.spacy cimport Language
-
-cdef enum Field:
-    lex
+from spacy.lexeme cimport StringAttr
 
 
 cdef class Tokens:
@@ -17,5 +14,5 @@ cdef class Tokens:
     cpdef int append(self, Lexeme_addr token)
     cpdef int extend(self, Tokens other) except -1
 
-    cpdef list group_by(self, Field attr)
-    cpdef dict count_by(self, Field attr)
+    cpdef object group_by(self, StringAttr attr)
+    cpdef dict count_by(self, StringAttr attr)
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 67461ac44..3e26b1cea 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -2,6 +2,11 @@ from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as inc
 
+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport attr_of, norm_of, shape_of
+from spacy.spacy cimport StringHash
+
+
 cdef class Tokens:
     def __cinit__(self, Language lang):
         self.lang = lang
@@ -32,17 +37,53 @@ cdef class Tokens:
         for el in other:
             self.append(el)
 
-    cpdef list group_by(self, Field attr):
-        pass
+    cpdef object group_by(self, StringAttr attr):
+        '''Group tokens that share the property attr into Tokens instances, and
+        return a list of them. Returns a tuple of three lists:
+
+        (string names, hashes, tokens)
 
-    cpdef dict count_by(self, Field attr):
+        The lists are aligned, so the ith entry in string names is the string
+        that the ith entry in hashes unhashes to, which the Tokens instance
+        is grouped by.
+
+        You can then use count_by or group_by on the Tokens
+        for further processing. Calling group_by and then asking the length
+        of the Tokens objects is equivalent to count_by, but somewhat slower.
+        '''
+        # Implementation here is working around some of the constraints in
+        # Cython about what type of thing can go in what type of container.
+        # Long story short, it's pretty hard to get a Python object like
+        # Tokens into a vector or array. If we really need this to run faster,
+        # we can be tricky and get the Python list access out of the loop. What
+        # we'd do is store pointers to the underlying vectors.
+        # So far, speed isn't mattering here.
+        cdef dict indices = {}
+        cdef list groups = []
+        cdef list names = []
+        cdef list hashes = []
+
+        cdef StringHash key
+        cdef Lexeme_addr t
+        for t in self.vctr[0]:
+            key = attr_of(t, attr)
+            if key in indices:
+                groups[indices[key]].append(t)
+            else:
+                indices[key] = len(groups)
+                groups.append(Tokens(self.lang))
+                names.append(self.lang.unhash(key))
+                hashes.append(key)
+                groups[-1].append(t)
+        return names, hashes, groups
+
+    cpdef dict count_by(self, StringAttr attr):
         counts = {}
         cdef Lexeme_addr t
-        cdef Lexeme* word
+        cdef StringHash key
         for t in self.vctr[0]:
-            word = <Lexeme*>t
-            if word.lex not in counts:
-                counts[word.lex] = 0
-            counts[word.lex] += 1
+            key = attr_of(t, attr)
+            if key not in counts:
+                counts[key] = 0
+            counts[key] += 1
         return counts
-
diff --git a/tests/test_group_by.py b/tests/test_group_by.py
new file mode 100644
index 000000000..2f9dd6ce0
--- /dev/null
+++ b/tests/test_group_by.py
@@ -0,0 +1,35 @@
+from __future__ import unicode_literals
+
+import pytest
+
+from spacy import en
+from spacy.lexeme import lex_of
+
+from spacy import SIC, LEX, NORM, SHAPE, LAST3
+
+
+def test_group_by_lex():
+    tokens = en.tokenize("I like the red one and I like the blue one")
+    names, hashes, groups = tokens.group_by(LEX)
+
+    assert len(groups[0]) == 2
+    assert en.unhash(lex_of(groups[0][0])) == 'I'
+    assert names[0] == 'I'
+    assert len(groups[1]) == 2
+    assert en.unhash(lex_of(groups[1][0])) == 'like'
+    assert names[1] == "like"
+    assert len(groups[2]) == 2
+    assert len(groups[3]) == 1
+
+
+def test_group_by_last3():
+    tokens = en.tokenize("I the blithe swarthy mate ate on the filthy deck")
+    names, hashes, groups = tokens.group_by(LAST3)
+
+    assert len(groups[0]) == 1
+    assert en.unhash(lex_of(groups[0][0])) == 'I'
+    assert len(groups[1]) == 3
+    assert en.unhash(lex_of(groups[1][0])) == 'the'
+    assert len(groups[2]) == 2
+    assert len(groups[3]) == 2
+    assert len(groups[4]) == 1
diff --git a/tests/test_orth.py b/tests/test_orth.py
new file mode 100644
index 000000000..8d9939f4c
--- /dev/null
+++ b/tests/test_orth.py
@@ -0,0 +1,16 @@
+from __future__ import unicode_literals
+
+import pytest
+
+from spacy.en import lookup, unhash
+
+from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of
+from spacy.lexeme import shape_of
+
+
+@pytest.fixture
+def C3P0():
+    return lookup("C3P0")
+
+
+def test_shape(C3P0):
+    assert unhash(shape_of(C3P0)) == "XdXd"
diff --git a/tests/test_pre_punct.py b/tests/test_pre_punct.py
index 88da4f595..d1cd10bf6 100644
--- a/tests/test_pre_punct.py
+++ b/tests/test_pre_punct.py
@@ -48,3 +48,10 @@ def test_three_same_open(open_puncts):
     assert len(tokens) == 4
     assert unhash(lex_of(tokens[0])) == p
     assert unhash(lex_of(tokens[3])) == word_str
+
+
+def test_open_appostrophe():
+    string = "'The"
+    tokens = expand_chunk(lookup(string))
+    assert len(tokens) == 2
+    assert unhash(lex_of(tokens[0])) == "'"
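(Illustration, not part of the patch: the end-to-end workflow the new tests exercise, written out as a hypothetical session. The sentence and printed values are illustrative; only the final assertion mirrors what `tests/test_group_by.py` actually checks.)

```python
from spacy import en, LEX
from spacy.lexeme import lex_of

tokens = en.tokenize(u"I like the red one and I like the blue one")

# count_by returns {StringHash: count} for whichever string view you ask for.
counts = tokens.count_by(LEX)
print(len(counts))                    # number of distinct lex forms

# group_by returns three aligned lists: readable names, their hashes, and one
# Tokens instance per group.
names, hashes, groups = tokens.group_by(LEX)
for name, group in zip(names, groups):
    print(name, len(group))           # e.g. u'I' 2, u'like' 2, u'the' 2, ...

# Group members are still Lexeme addresses, so the accessors keep working.
assert en.unhash(lex_of(groups[0][0])) == names[0]
```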