mirror of https://github.com/explosion/spaCy.git (synced 2025-01-27 01:34:30 +03:00)

Commit 150cf6dd3b: Merge string views feature branch

docs/Makefile (177 lines, deleted)
@@ -1,177 +0,0 @@
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
PAPER         =
BUILDDIR      = _build

# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif

# Internal variables.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  xml        to make Docutils-native XML files"
	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"

clean:
	rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/spacy.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/spacy.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/spacy"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/spacy"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	make -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

xml:
	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."

pseudoxml:
	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
docs/conf.py (269 lines, deleted)
@@ -1,269 +0,0 @@
# -*- coding: utf-8 -*-
#
# spacy documentation build configuration file, created by
# sphinx-quickstart on Thu Jul 3 21:54:08 2014.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

import sys
import os

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.join(os.path.abspath('.'), 'spacy'))

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.doctest',
    'sphinx.ext.intersphinx',
    'sphinx.ext.coverage',
    'sphinx.ext.viewcode',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix of source filenames.
source_suffix = '.rst'

# The encoding of source files.
#source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = u'spacy'
copyright = u'2014, Matthew Honnibal'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '0.0'
# The full version, including alpha/beta/rc tags.
release = '0.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']

# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []

# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []

# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None

# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None

# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []

# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}

# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}

# If false, no module index is generated.
#html_domain_indices = True

# If false, no index is generated.
#html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False

# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None

# Output file base name for HTML help builder.
htmlhelp_basename = 'spacydoc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #'preamble': '',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    ('index', 'spacy.tex', u'spacy Documentation',
     u'Matthew Honnibal', 'manual'),
]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False

# If true, show page references after internal links.
#latex_show_pagerefs = False

# If true, show URL addresses after external links.
#latex_show_urls = False

# Documents to append as an appendix to all manuals.
#latex_appendices = []

# If false, no module index is generated.
#latex_domain_indices = True


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    ('index', 'spacy', u'spacy Documentation',
     [u'Matthew Honnibal'], 1)
]

# If true, show URL addresses after external links.
#man_show_urls = False


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    ('index', 'spacy', u'spacy Documentation',
     u'Matthew Honnibal', 'spacy', 'One line description of project.',
     'Miscellaneous'),
]

# Documents to append as an appendix to all manuals.
#texinfo_appendices = []

# If false, no module index is generated.
#texinfo_domain_indices = True

# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'

# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False


# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {'http://docs.python.org/': None}
@@ -1,26 +0,0 @@
.. spacy documentation master file, created by
   sphinx-quickstart on Thu Jul 3 21:54:08 2014.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to spacy's documentation!
=================================

Contents:

.. toctree::
   :maxdepth: 2

.. py:function:: enumerate(sequence[, start=0])

   Return an iterator that yields tuples of an index and an item of the
   *sequence*. (And so on.)


Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
@@ -1,8 +1,17 @@
 from .lexeme import lex_of
 from .lexeme import sic_of

+from .tokens import Tokens

-__all__ = [lex_of, sic_of]
+# Don't know how to get the enum Python visible :(
+
+SIC = 0
+LEX = 1
+NORM = 2
+SHAPE = 3
+LAST3 = 4
+
+__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]

 """
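As the new comment notes, a cdef enum defined in Cython is not directly importable from Python, so the package mirrors it with plain integers. An illustrative sketch of the pattern (not part of the commit; the values are assumed to follow the StringAttr declaration order):

    # Module-level ints standing in for the Cython StringAttr enum
    SIC, LEX, NORM, SHAPE, LAST3 = range(5)

    # Python callers pass these ints through the API added in this branch,
    # e.g. tokens.count_by(LEX) or tokens.group_by(LAST3); the Cython layer
    # interprets them as StringAttr values.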
@@ -1,8 +1,8 @@
 from libcpp.vector cimport vector

 from spacy.spacy cimport StringHash
-from spacy.lexeme cimport Lexeme
+from spacy.spacy cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme_addr

 from spacy.spacy cimport Language
 from spacy.tokens cimport Tokens
@@ -9,7 +9,6 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

-from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr

 from . import util

@@ -37,12 +36,15 @@ cdef class English(spacy.Language):
 cdef bint is_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
+        # ...Unless we're at 0
+        return i == 0
+    if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
         return False
     # Don't count commas as punct if the next char is a number
     if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
         return False
-    # Don't count periods as punct if the next char is a number
+    # Don't count periods as punct if the next char is not whitespace
-    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
+    if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
         return False
     return not word[i].isalnum()
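Restated in plain Python, the updated punctuation rules above read as follows. This is an illustrative transcription for readers, not the shipped Cython function, and the sample strings are assumptions:

    def is_punct(word, i, length):
        # An apostrophe before a letter is punctuation only at position 0 ("'The")
        if word[i] == "'" and i < (length - 1) and word[i + 1].isalpha():
            return i == 0
        # A "--" sequence is not treated as punctuation
        if word[i] == "-" and i < (length - 1) and word[i + 1] == '-':
            return False
        # A comma followed by a digit stays inside the token (e.g. "10,000")
        if word[i] == "," and i < (length - 1) and word[i + 1].isdigit():
            return False
        # A period followed by any non-whitespace now stays inside the token
        if word[i] == "." and i < (length - 1) and not word[i + 1].isspace():
            return False
        return not word[i].isalnum()

    assert is_punct("'The", 0, 4)        # leading apostrophe is split off
    assert not is_punct("don't", 3, 5)   # clitic apostrophe is kept
    assert not is_punct("U.K.", 1, 4)    # internal period is kept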
@@ -2,8 +2,8 @@ from libcpp.vector cimport vector

 from spacy.spacy cimport StringHash
 from spacy.spacy cimport Language
-from spacy.lexeme cimport Lexeme
+from spacy.spacy cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme_addr
 from spacy.tokens cimport Tokens
@@ -4,11 +4,11 @@ boldly assume no collisions.
 '''
 from __future__ import unicode_literals


 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

-from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr
 from spacy.spacy cimport Language
 from . import util
@@ -4,39 +4,53 @@ from libc.stdint cimport uint64_t
 ctypedef int ClusterID
 ctypedef uint64_t StringHash
 ctypedef size_t Lexeme_addr
+ctypedef char Bits8
+ctypedef uint64_t Bits64

 from spacy.spacy cimport Language

+
+cdef struct Orthography:
+    StringHash last3
+    StringHash shape
+    StringHash norm
+
+    Py_UNICODE first
+    Bits8 flags
+
+
+cdef struct Distribution:
+    double prob
+    ClusterID cluster
+    Bits64 tagdict
+    Bits8 flags
+
+
 cdef struct Lexeme:
     StringHash sic       # Hash of the original string
     StringHash lex       # Hash of the word, with punctuation and clitics split off
-    StringHash normed    # Hash of the normalized version of lex
-    StringHash last3     # Last 3 characters of the token
-    Py_UNICODE first     # First character of the token

-    double prob          # What is the log probability of the lex value?
+    Distribution* dist   # Distribution info, lazy loaded
-    ClusterID cluster    # Brown cluster of the token
+    Orthography* orth    # Extra orthographic views

-    bint oft_upper       # Is the lowered version of the lex value often in all caps?
-    bint oft_title       # Is the lowered version of the lex value often title-cased?
     Lexeme* tail         # Lexemes are linked lists, to deal with sub-tokens


-cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
+cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)

-cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
-                         int split, size_t length)

-# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
-# has a conditional to pick out the correct item. This allows safe iteration
-# over the Lexeme, via:
-# for field in range(LexAttr.n): get_attr(Lexeme*, field)
-cdef enum HashFields:
-    sic
-    lex
-    normed
-    cluster
-    n

-#cdef uint64_t get_attr(Lexeme* word, HashFields attr)
+cdef enum StringAttr:
+    SIC
+    LEX
+    NORM
+    SHAPE
+    LAST3

+cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0

+cpdef StringHash sic_of(size_t lex_id) except 0
+cpdef StringHash lex_of(size_t lex_id) except 0
+cpdef StringHash norm_of(size_t lex_id) except 0
+cpdef StringHash shape_of(size_t lex_id) except 0
+cpdef StringHash last3_of(size_t lex_id) except 0
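The hunk above splits the previously flat Lexeme struct into a small core plus two lazily loaded records. A rough Python picture of the new layout, with dataclasses standing in for C structs and None for NULL; field meanings are taken from the comments in the diff:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Orthography:          # extra orthographic views, built on demand
        last3: int = 0          # hash of the last three characters
        shape: int = 0          # hash of the word shape, e.g. "XdXd"
        norm: int = 0           # hash of the normalized form
        first: str = ""         # first character of the token
        flags: int = 0

    @dataclass
    class Distribution:         # distribution info, lazy loaded
        prob: float = 0.0       # log probability
        cluster: int = 0        # Brown cluster ID
        tagdict: int = 0
        flags: int = 0

    @dataclass
    class Lexeme:
        sic: int = 0                          # hash of the original string
        lex: int = 0                          # hash with punctuation/clitics split off
        dist: Optional[Distribution] = None   # lazily attached statistics
        orth: Optional[Orthography] = None    # lazily attached string views
        tail: Optional["Lexeme"] = None       # linked list of sub-tokens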
spacy/lexeme.pyx (114 lines)

@@ -11,49 +11,29 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

+from spacy.spacy cimport StringHash

-cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
-                         int split, size_t length):
-    assert split <= length
-    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-
-    word.first = <Py_UNICODE>(string[0] if string else 0)
-    word.sic = hashed
-
-    cdef unicode tail_string
-    cdef unicode lex
-    if split != 0 and split < length:
-        lex = substr(string, 0, split, length)
-        tail_string = substr(string, split, length, length)
-    else:
-        lex = string
-        tail_string = ''
-    assert lex
-    #cdef unicode normed = normalize_word_string(lex)
-    cdef unicode normed = '?'
-    cdef unicode last3 = substr(string, length - 3, length, length)
-
-    assert normed
-    assert len(normed)
-
-    word.lex = lang.hash_string(lex, len(lex))
-    word.normed = lang.hash_string(normed, len(normed))
-    word.last3 = lang.hash_string(last3, len(last3))
-
-    lang.bacov[word.lex] = lex
-    lang.bacov[word.normed] = normed
-    lang.bacov[word.last3] = last3
-
-    # These are loaded later
-    word.prob = 0
-    word.cluster = 0
-    word.oft_upper = False
-    word.oft_title = False
-
-    # Now recurse, and deal with the tail
-    if tail_string:
-        word.tail = <Lexeme*>lang.lookup(-1, tail_string, len(tail_string))
-    return word
+# Reiterate the enum, for python
+#SIC = StringAttr.sic
+#LEX = StringAttr.lex
+#NORM = StringAttr.norm
+#SHAPE = StringAttr.shape
+#LAST3 = StringAttr.last3


+cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
+    if attr == SIC:
+        return sic_of(lex_id)
+    elif attr == LEX:
+        return lex_of(lex_id)
+    elif attr == NORM:
+        return norm_of(lex_id)
+    elif attr == SHAPE:
+        return shape_of(lex_id)
+    elif attr == LAST3:
+        return last3_of(lex_id)
+    else:
+        raise StandardError


 cpdef StringHash sic_of(size_t lex_id) except 0:

@@ -82,6 +62,35 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
     return (<Lexeme*>lex_id).lex


+cpdef StringHash norm_of(size_t lex_id) except 0:
+    '''Access the `lex' field of the Lexeme pointed to by lex_id.
+
+    The lex field is the hash of the string you would expect to get back from
+    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
+    delimited tokens split off. The other fields refer to properties of the
+    string that the lex field stores a hash of, except sic and tail.
+
+    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
+    [u'Hi', u'!', u'world']
+    '''
+    return (<Lexeme*>lex_id).orth.norm
+
+
+cpdef StringHash shape_of(size_t lex_id) except 0:
+    return (<Lexeme*>lex_id).orth.shape
+
+
+cpdef StringHash last3_of(size_t lex_id) except 0:
+    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
+    the hash of the last three characters of the word:
+
+    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
+    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
+    [u'llo', u'!']
+    '''
+    return (<Lexeme*>lex_id).orth.last3
+
+
 cpdef ClusterID cluster_of(size_t lex_id):
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word,

@@ -98,7 +107,7 @@ cpdef ClusterID cluster_of(size_t lex_id):
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     '''
-    return (<Lexeme*>lex_id).cluster
+    return (<Lexeme*>lex_id).dist.cluster


 cpdef Py_UNICODE first_of(size_t lex_id):

@@ -109,7 +118,7 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    return (<Lexeme*>lex_id).first
+    return (<Lexeme*>lex_id).orth.first


 cpdef double prob_of(size_t lex_id):

@@ -122,18 +131,7 @@ cpdef double prob_of(size_t lex_id):
     >>> prob_of(lookup(u'world'))
     -20.10340371976182
     '''
-    pass
-
-
-cpdef StringHash last3_of(size_t lex_id):
-    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
-    the hash of the last three characters of the word:
-
-    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
-    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
-    [u'llo', u'!']
-    '''
-    return (<Lexeme*>lex_id).last3
+    return (<Lexeme*>lex_id).dist.prob


 cpdef bint is_oft_upper(size_t lex_id):

@@ -148,7 +146,12 @@ cpdef bint is_oft_upper(size_t lex_id):
     >>> is_oft_upper(lookup(u'aBc'))  # This must get the same answer
     True
     '''
-    return (<Lexeme*>lex_id).oft_upper
+    return False
+    #cdef Lexeme* w = <Lexeme*>lex_id
+    #return w.orth.last3 if w.orth != NULL else 0
+
+    #return (<Lexeme*>lex_id).oft_upper


 cpdef bint is_oft_title(size_t lex_id):

@@ -163,4 +166,5 @@ cpdef bint is_oft_title(size_t lex_id):
     >>> is_oft_title(lookup(u'MARCUS'))  # This must get the same value
     True
     '''
-    return (<Lexeme*>lex_id).oft_title
+    return False
+    #return (<Lexeme*>lex_id).oft_title
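Each accessor above returns the hash of one string view, and attr_of simply dispatches on the StringAttr constant. A hedged usage sketch, assuming the cpdef functions import the same way as the ones exercised in tests/test_orth.py:

    from spacy.en import lookup, unhash
    from spacy.lexeme import attr_of, shape_of, last3_of
    from spacy import SHAPE, LAST3

    lex_id = lookup("Hello")
    print(unhash(shape_of(lex_id)))   # "Xxxx": runs longer than three collapse
    print(unhash(last3_of(lex_id)))   # "llo", per the docstring above

    # attr_of is just a switch over the same accessors
    assert attr_of(lex_id, SHAPE) == shape_of(lex_id)
    assert attr_of(lex_id, LAST3) == last3_of(lex_id)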
@@ -7,16 +7,27 @@ from ext.sparsehash cimport dense_hash_map
 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
-ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
+ctypedef dense_hash_map[StringHash, size_t] Vocab
-ctypedef int (*Splitter)(unicode word, size_t length)
+from spacy.lexeme cimport Lexeme
+from spacy.tokens cimport Tokens
+
+# Put these above import to avoid circular import problem
+ctypedef char Bits8
+ctypedef uint64_t Bits64
+ctypedef int ClusterID

 from spacy.lexeme cimport Lexeme
-from spacy.tokens cimport Tokens
+from spacy.lexeme cimport Distribution
+from spacy.lexeme cimport Orthography


 cdef class Language:
     cdef object name
     cdef Vocab* vocab
+    cdef Vocab* distri
+    cdef Vocab* ortho
     cdef dict bacov
     cdef int find_split(self, unicode word, size_t length)

@@ -26,3 +37,8 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
+    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
+                             int split, size_t length)
+    cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
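The Language object now carries three hash tables keyed by string hash, plus a reverse map. A dict-based sketch of that storage (an assumption for illustration; the real code uses dense_hash_map, and 'bacov' is presumably the hash-to-string reverse vocabulary):

    class Language:
        def __init__(self, name):
            self.name = name
            self.vocab = {}    # StringHash -> Lexeme address
            self.ortho = {}    # StringHash -> shared Orthography record
            self.distri = {}   # StringHash -> shared Distribution record
            self.bacov = {}    # StringHash -> original unicode string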
spacy/spacy.pyx (100 lines)

@@ -6,22 +6,65 @@ from libc.stdlib cimport calloc, free
 from ext.murmurhash cimport MurmurHash64A
 from ext.murmurhash cimport MurmurHash64B

-from spacy.lexeme cimport init_lexeme
+from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD

-from spacy.string_tools cimport is_whitespace
+from spacy.string_tools cimport substr

 from . import util
 from os import path
 cimport cython

+
+def get_normalized(unicode lex, size_t length):
+    if lex.isalpha() and lex.islower():
+        return lex
+    else:
+        return get_word_shape(lex, length)
+
+
+def get_word_shape(lex, length):
+    shape = ""
+    last = ""
+    shape_char = ""
+    seq = 0
+    for c in lex:
+        if c.isalpha():
+            if c.isupper():
+                shape_char = "X"
+            else:
+                shape_char = "x"
+        elif c.isdigit():
+            shape_char = "d"
+        else:
+            shape_char = c
+        if shape_char == last:
+            seq += 1
+        else:
+            seq = 0
+            last = shape_char
+        if seq < 3:
+            shape += shape_char
+    assert shape
+    return shape
+
+
+def set_orth_flags(lex, length):
+    return 0
+
+
 cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
         self.vocab = new Vocab()
+        self.ortho = new Vocab()
+        self.distri = new Vocab()
         self.vocab[0].set_empty_key(0)
+        self.distri[0].set_empty_key(0)
+        self.ortho[0].set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))

     def load_tokenization(self, token_rules=None):

@@ -80,7 +123,7 @@ cdef class Language:
         return <Lexeme_addr>word_ptr

     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
-        word = init_lexeme(self, string, hashed, split, length)
+        word = self.init_lexeme(string, hashed, split, length)
         self.vocab[0][hashed] = <Lexeme_addr>word
         self.bacov[hashed] = string
         return word

@@ -121,6 +164,55 @@ cdef class Language:
     cdef int find_split(self, unicode word, size_t length):
         return -1

+    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
+                             int split, size_t length):
+        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+
+        word.sic = hashed
+
+        cdef unicode tail_string
+        cdef unicode lex
+        if split != 0 and split < length:
+            lex = substr(string, 0, split, length)
+            tail_string = substr(string, split, length, length)
+        else:
+            lex = string
+            tail_string = ''
+
+        word.lex = self.hash_string(lex, len(lex))
+        self.bacov[word.lex] = lex
+        word.orth = <Orthography*>self.ortho[0][word.lex]
+        if word.orth == NULL:
+            word.orth = self.init_orth(word.lex, lex)
+        word.dist = <Distribution*>self.distri[0][word.lex]
+
+        # Now recurse, and deal with the tail
+        if tail_string:
+            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
+        return word
+
+    cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
+        cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
+        orth.first = <Py_UNICODE>lex[0]
+
+        cdef int length = len(lex)
+
+        orth.flags = set_orth_flags(lex, length)
+
+        cdef unicode last3 = substr(lex, length - 3, length, length)
+        cdef unicode norm = get_normalized(lex, length)
+        cdef unicode shape = get_word_shape(lex, length)
+
+        orth.last3 = self.hash_string(last3, len(last3))
+        orth.shape = self.hash_string(shape, len(shape))
+        orth.norm = self.hash_string(norm, len(norm))
+
+        self.bacov[orth.last3] = last3
+        self.bacov[orth.shape] = shape
+        self.bacov[orth.norm] = norm
+
+        self.ortho[0][hashed] = <size_t>orth
+        return orth
+

 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:

@@ -137,7 +229,7 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
 cpdef vector[size_t] expand_chunk(size_t addr) except *:
     cdef vector[size_t] tokens = vector[size_t]()
     word = <Lexeme*>addr
-    while word is not NULL:
+    while word != NULL:
         tokens.push_back(<size_t>word)
         word = word.tail
     return tokens
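get_word_shape in the hunk above is a plain Python def inside the .pyx file, so it can be lifted out as a lightly trimmed copy to see what the SHAPE view contains (length is unused, as in the original; the sample inputs are illustrative):

    def get_word_shape(lex, length):
        shape = ""
        last = ""
        shape_char = ""
        seq = 0
        for c in lex:
            if c.isalpha():
                shape_char = "X" if c.isupper() else "x"
            elif c.isdigit():
                shape_char = "d"
            else:
                shape_char = c
            if shape_char == last:
                seq += 1
            else:
                seq = 0
                last = shape_char
            if seq < 3:   # character runs are truncated after three repeats
                shape += shape_char
        return shape

    print(get_word_shape("C3P0", 4))        # "XdXd", as asserted in tests/test_orth.py
    print(get_word_shape("Pineapple", 9))   # "Xxxx": the long lowercase run collapses
    print(get_word_shape("10,000", 6))      # "dd,ddd"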
@@ -1,5 +1,6 @@
 # cython: profile=True
+

 cpdef unicode substr(unicode string, int start, int end, size_t length):
     if end >= length:
         end = -1
@@ -1,12 +1,9 @@
 from libcpp.vector cimport vector
-from spacy.lexeme cimport Lexeme
+from spacy.spacy cimport Lexeme_addr
-from spacy.lexeme cimport Lexeme_addr

 from cython.operator cimport dereference as deref
 from spacy.spacy cimport Language
+from spacy.lexeme cimport StringAttr

-cdef enum Field:
-    lex


 cdef class Tokens:

@@ -17,5 +14,5 @@ cdef class Tokens:
     cpdef int append(self, Lexeme_addr token)
     cpdef int extend(self, Tokens other) except -1

-    cpdef list group_by(self, Field attr)
+    cpdef object group_by(self, StringAttr attr)
-    cpdef dict count_by(self, Field attr)
+    cpdef dict count_by(self, StringAttr attr)
@@ -2,6 +2,11 @@ from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as inc


+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport attr_of, norm_of, shape_of
+from spacy.spacy cimport StringHash
+

 cdef class Tokens:
     def __cinit__(self, Language lang):
         self.lang = lang

@@ -32,17 +37,53 @@ cdef class Tokens:
         for el in other:
             self.append(el)

-    cpdef list group_by(self, Field attr):
-        pass
+    cpdef object group_by(self, StringAttr attr):
+        '''Group tokens that share the property attr into Tokens instances, and
+        return a list of them. Returns a tuple of three lists:
+
+        (string names, hashes, tokens)
+
+        The lists are aligned, so the ith entry in string names is the string
+        that the ith entry in hashes unhashes to, which the Tokens instance
+        is grouped by.
+
+        You can then use count_by or group_by on the Tokens
+        for further processing. Calling group_by and then asking the length
+        of the Tokens objects is equivalent to count_by, but somewhat slower.
+        '''
+        # Implementation here is working around some of the constraints in
+        # Cython about what type of thing can go in what type of container.
+        # Long story short, it's pretty hard to get a Python object like
+        # Tokens into a vector or array. If we really need this to run faster,
+        # we can be tricky and get the Python list access out of the loop. What
+        # we'd do is store pointers to the underlying vectors.
+        # So far, speed isn't mattering here.
+        cdef dict indices = {}
+        cdef list groups = []
+        cdef list names = []
+        cdef list hashes = []
+
+        cdef StringHash key
+        cdef Lexeme_addr t
+        for t in self.vctr[0]:
+            key = attr_of(t, attr)
+            if key in indices:
+                groups[indices[key]].append(t)
+            else:
+                indices[key] = len(groups)
+                groups.append(Tokens(self.lang))
+                names.append(self.lang.unhash(key))
+                hashes.append(key)
+                groups[-1].append(t)
+        return names, hashes, groups

-    cpdef dict count_by(self, Field attr):
+    cpdef dict count_by(self, StringAttr attr):
         counts = {}
         cdef Lexeme_addr t
-        cdef Lexeme* word
+        cdef StringHash key
         for t in self.vctr[0]:
-            word = <Lexeme*>t
+            key = attr_of(t, attr)
-            if word.lex not in counts:
+            if key not in counts:
-                counts[word.lex] = 0
+                counts[key] = 0
-            counts[word.lex] += 1
+            counts[key] += 1
         return counts
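The new grouping API is easiest to read next to the tests added below. An assumed end-to-end usage, following those tests:

    from spacy import en, LEX, LAST3
    from spacy.lexeme import lex_of

    tokens = en.tokenize("I like the red one and I like the blue one")

    # count_by: {hash of the chosen view: number of tokens sharing it}
    counts = tokens.count_by(LEX)

    # group_by: three aligned lists; each group is itself a Tokens instance
    names, hashes, groups = tokens.group_by(LEX)
    assert names[0] == 'I' and len(groups[0]) == 2
    assert en.unhash(lex_of(groups[0][0])) == 'I'

    # Grouping by LAST3 buckets tokens by suffix instead
    names, hashes, groups = tokens.group_by(LAST3)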
tests/test_group_by.py (35 lines, new file)
@@ -0,0 +1,35 @@
from __future__ import unicode_literals

import pytest

from spacy import en
from spacy.lexeme import lex_of

from spacy import SIC, LEX, NORM, SHAPE, LAST3


def test_group_by_lex():
    tokens = en.tokenize("I like the red one and I like the blue one")
    names, hashes, groups = tokens.group_by(LEX)

    assert len(groups[0]) == 2
    assert en.unhash(lex_of(groups[0][0])) == 'I'
    assert names[0] == 'I'
    assert len(groups[1]) == 2
    assert en.unhash(lex_of(groups[1][0])) == 'like'
    assert names[1] == "like"
    assert len(groups[2]) == 2
    assert len(groups[3]) == 1


def test_group_by_last3():
    tokens = en.tokenize("I the blithe swarthy mate ate on the filthy deck")
    names, hashes, groups = tokens.group_by(LAST3)

    assert len(groups[0]) == 1
    assert en.unhash(lex_of(groups[0][0])) == 'I'
    assert len(groups[1]) == 3
    assert en.unhash(lex_of(groups[1][0])) == 'the'
    assert len(groups[2]) == 2
    assert len(groups[3]) == 2
    assert len(groups[4]) == 1
tests/test_orth.py (16 lines, new file)
@@ -0,0 +1,16 @@
from __future__ import unicode_literals

import pytest

from spacy.en import lookup, unhash

from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of
from spacy.lexeme import shape_of


@pytest.fixture
def C3P0():
    return lookup("C3P0")


def test_shape(C3P0):
    assert unhash(shape_of(C3P0)) == "XdXd"
@@ -48,3 +48,10 @@ def test_three_same_open(open_puncts):
     assert len(tokens) == 4
     assert unhash(lex_of(tokens[0])) == p
     assert unhash(lex_of(tokens[3])) == word_str
+
+
+def test_open_appostrophe():
+    string = "'The"
+    tokens = expand_chunk(lookup(string))
+    assert len(tokens) == 2
+    assert unhash(lex_of(tokens[0])) == "'"