Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)

Initial commit. Tests passing for punctuation handling. Need contractions, file transport, tokenize function, etc.
commit 556f6a18ca
parent 5c1705d5be
							
								
								
									
spacy/__init__.py (new file, 28 lines)
@@ -0,0 +1,28 @@
from .lexeme import lex_of
from .lexeme import sic_of


__all__ = [lex_of, sic_of]


"""
from .tokens import ids_from_string
from .tokens import group_by

from .lex import sic_of
from .lex import lex_of
from .lex import normed_of
from .lex import first_of
from .lex import last_three_of

from .lex import cluster_of
from .lex import prob_of

from .lex import is_oft_upper
from .lex import is_oft_title

from .lex import can_noun
from .lex import can_verb
from .lex import can_adj
from .lex import can_adv
"""
							
								
								
									
spacy/en.cpp (new file, 4529 lines)
(File diff suppressed because it is too large.)
							
								
								
									
spacy/en.pxd (new file, 17 lines)
@@ -0,0 +1,17 @@
from ext.sparsehash cimport dense_hash_map
from spacy.lexeme cimport StringHash
from spacy.lexeme cimport Lexeme


ctypedef Py_UNICODE* string_ptr
ctypedef size_t Lexeme_addr # For python interop
ctypedef Lexeme* Lexeme_ptr


cdef dense_hash_map[StringHash, Lexeme_ptr] LEXEMES


cpdef Lexeme_addr lookup(unicode word) except 0
cpdef Lexeme_addr lookup_chunk(unicode chunk, int start, int end) except 0
cdef StringHash hash_string(unicode s, size_t length) except 0
cpdef unicode unhash(StringHash hash_value)
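A minimal usage sketch of the API declared above, as seen from Python space (illustrative only, not part of the commit; it assumes the Cython extensions have been built, and mirrors what the tests further down do):

    # Illustrative sketch; assumes spacy.en and spacy.lexeme compile and import.
    from spacy import lex_of, sic_of
    from spacy.en import lookup, unhash

    addr = lookup(u'Hello!')                  # Lexeme_addr: address of a Lexeme struct
    assert unhash(lex_of(addr)) == u'Hello'   # lex: the word with punctuation split off
    assert unhash(sic_of(addr)) == u'Hello!'  # sic: the original chunk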
							
								
								
									
spacy/en.pyx (new file, 165 lines)
@@ -0,0 +1,165 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes.  Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals

from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t

from spacy.lexeme cimport Lexeme
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B


STRINGS = {}
LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]()
LEXEMES.set_empty_key(0)


cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)


cpdef Lexeme_addr lookup(unicode string) except 0:
    '''.. function:: enumerate(sequence[, start=0])
    Fetch a Lexeme representing a word string. If the word has not been seen,
    construct one, splitting off any attached punctuation or clitics.  A
    reference to BLANK_WORD is returned for the empty string.

    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
    '''
    if string == '':
        return <Lexeme_addr>&BLANK_WORD
    cdef size_t length = len(string)
    cdef StringHash hashed = hash_string(string, length)
    cdef Lexeme* word_ptr = LEXEMES[hashed]
    cdef size_t n
    if word_ptr == NULL:
        word_ptr = _add(hashed, string, _find_split(string, length), length)
    return <Lexeme_addr>word_ptr


cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0:
    '''Fetch a Lexeme representing a word string. If the word has not been seen,
    construct one, given the specified start and end indices.  A negative index
    signifies 0 for start, and the string length for end --- i.e. the string
    will not be sliced if start == -1 and end == -1.

    A reference to BLANK_WORD is returned for the empty string.
    '''
    if string == '':
        return <Lexeme_addr>&BLANK_WORD
    cdef size_t length = len(string)
    cdef StringHash hashed = hash_string(string, length)
    cdef Lexeme* chunk_ptr = LEXEMES[hashed]
    if chunk_ptr == NULL:
        chunk_ptr = _add(hashed, string, start, length)
    return <Lexeme_addr>chunk_ptr


cdef StringHash hash_string(unicode s, size_t length) except 0:
    '''Hash unicode with MurmurHash64A'''
    assert length
    return MurmurHash64A(<string_ptr>s, length * sizeof(Py_UNICODE), 0)


cpdef unicode unhash(StringHash hash_value):
    '''Fetch a string from the reverse index, given its hash value.'''
    cdef string_ptr string = STRINGS[hash_value]
    if string == NULL:
        raise ValueError(hash_value)

    return string


cdef unicode normalize_word_string(unicode word):
    '''Return a normalized version of the word, mapping:
    - 4 digit strings into !YEAR
    - Other digit strings into !DIGITS
    - All other strings into lower-case
    '''
    cdef unicode s
    if word.isdigit() and len(word) == 4:
        return '!YEAR'
    elif word[0].isdigit():
        return '!DIGITS'
    else:
        return word.lower()


cpdef unicode _substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    if start <= 0 and end < 0:
        return string
    elif start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]


cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL:
    assert string
    assert split <= length
    word = _init_lexeme(string, hashed, split, length)
    LEXEMES[hashed] = word
    STRINGS[hashed] = string
    return word


cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
                          int split, size_t length) except NULL:
    assert split <= length
    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

    word.first = <Py_UNICODE>(string[0] if string else 0)
    word.sic = hashed

    cdef unicode tail_string
    cdef unicode lex
    if split != 0 and split < length:
        lex = _substr(string, 0, split, length)
        tail_string = _substr(string, split, length, length)
    else:
        lex = string
        tail_string = ''
    assert lex
    cdef unicode normed = normalize_word_string(lex)
    cdef unicode last3 = _substr(string, length - 3, length, length)

    assert normed
    assert len(normed)

    word.lex = hash_string(lex, len(lex))
    word.normed = hash_string(normed, len(normed))
    word.last3 = hash_string(last3, len(last3))

    STRINGS[word.lex] = lex
    STRINGS[word.normed] = normed
    STRINGS[word.last3] = last3

    # These are loaded later
    word.prob = 0
    word.cluster = 0
    word.oft_upper = False
    word.oft_title = False

    # Now recurse, and deal with the tail
    if tail_string:
        word.tail = <Lexeme*>lookup(tail_string)
    return word


cdef size_t _find_split(unicode word, size_t length):
    cdef size_t i = 0
    if word[0].isalnum():
        while i < length and word[i].isalnum():
            i += 1
    else:
        # Split off a punctuation character, or a sequence of the same punctuation character
        while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]):
            i += 1
    return i
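A pure-Python transcription of _find_split above makes the splitting rule easy to check by hand (the function name and the example values below are illustrative additions, not part of the commit):

    def find_split(word):
        # Mirrors _find_split: length of the leading alphanumeric run, or of the
        # leading run of one repeated punctuation character.
        i = 0
        length = len(word)
        if word[0].isalnum():
            while i < length and word[i].isalnum():
                i += 1
        else:
            while i < length and not word[i].isalnum() and (i == 0 or word[i - 1] == word[i]):
                i += 1
        return i

    assert find_split(u'Hello!') == 5   # 'Hello' + '!'
    assert find_split(u'!!!') == 3      # a run of the same punctuation stays together
    assert find_split(u'("Hi') == 1     # '(' is split off alone before '"'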
							
								
								
									
spacy/lexeme.cpp (new file, 2433 lines)
(File diff suppressed because it is too large.)
							
								
								
									
spacy/lexeme.pxd (new file, 35 lines)
@@ -0,0 +1,35 @@
from libc.stdint cimport uint64_t


ctypedef int ClusterID
ctypedef uint64_t StringHash


cdef struct Lexeme:
    StringHash sic # Hash of the original string
    StringHash lex # Hash of the word, with punctuation and clitics split off
    StringHash normed # Hash of the normalized version of lex
    StringHash last3 # Last 3 characters of the token
    Py_UNICODE first # First character of the token

    double prob # What is the log probability of the lex value?
    ClusterID cluster # Brown cluster of the token

    bint oft_upper # Is the lowered version of the lex value often in all caps?
    bint oft_title # Is the lowered version of the lex value often title-cased?
    Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens


# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
# has a conditional to pick out the correct item.  This allows safe iteration
# over the Lexeme, via:
# for field in range(LexAttr.n): get_attr(Lexeme*, field)
cdef enum HashFields:
    sic
    lex
    normed
    cluster
    n


#cdef uint64_t get_attr(Lexeme* word, HashFields attr)
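For readers following along from Python, a rough pure-Python mirror of the struct above (an approximation for illustration only; hash fields become plain ints and the tail pointer becomes an object reference):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class PyLexeme:                        # illustrative stand-in for the C struct
        sic: int                           # hash of the original string
        lex: int                           # hash of the word, punctuation/clitics split off
        normed: int                        # hash of the normalized version of lex
        last3: int                         # hash of the last three characters
        first: str                         # first character of the token
        prob: float                        # log probability of the lex value
        cluster: int                       # Brown cluster ID
        oft_upper: bool                    # often written in all caps?
        oft_title: bool                    # often written title-cased?
        tail: Optional["PyLexeme"] = None  # sub-tokens form a linked list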
							
								
								
									
spacy/lexeme.pyx (new file, 114 lines)
@@ -0,0 +1,114 @@
'''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
Mostly useful from Python-space. From Cython-space, you can just cast to
Lexeme* yourself.
'''


cpdef StringHash sic_of(size_t lex_id) except 0:
    '''Access the `sic' field of the Lexeme pointed to by lex_id.

    The sic field stores the hash of the whitespace-delimited string-chunk used to
    construct the Lexeme.

    >>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')]
    [u'Hi!', u'', u'world']
    '''
    return (<Lexeme*>lex_id).sic


cpdef StringHash lex_of(size_t lex_id) except 0:
    '''Access the `lex' field of the Lexeme pointed to by lex_id.

    The lex field is the hash of the string you would expect to get back from
    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
    delimited tokens split off.  The other fields refer to properties of the
    string that the lex field stores a hash of, except sic and tail.

    >>> [unhash(lex_of(lex_id)) for lex_id in from_string(u'Hi! world')]
    [u'Hi', u'!', u'world']
    '''
    return (<Lexeme*>lex_id).lex


cpdef ClusterID cluster_of(size_t lex_id):
    '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
    gives an integer representation of the cluster ID of the word,
    which should be understood as a binary address:

    >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
    >>> token_ids = [lookup(s) for s in strings]
    >>> clusters = [cluster_of(t) for t in token_ids]
    >>> print ["{0:b}".format(cluster_of(t)) for t in token_ids]
    ["100111110110", "100111100100", "01010111011001", "100111110110"]

    The clusterings are unideal, but often slightly useful.
    "pineapple" and "apple" share a long prefix, indicating a similar meaning,
    while "dapple" is totally different. On the other hand, "scalable" receives
    the same cluster ID as "pineapple", which is not what we'd like.
    '''
    return (<Lexeme*>lex_id).cluster


cpdef Py_UNICODE first_of(size_t lex_id):
    '''Access the `first' field of the Lexeme pointed to by lex_id, which
    stores the first character of the lex string of the word.

    >>> lex_id = lookup(u'Hello')
    >>> unhash(first_of(lex_id))
    u'H'
    '''
    return (<Lexeme*>lex_id).first


cpdef double prob_of(size_t lex_id):
    '''Access the `prob' field of the Lexeme pointed to by lex_id, which stores
    the smoothed unigram log probability of the word, as estimated from a large
    text corpus.  By default, probabilities are based on counts from Gigaword,
    smoothed using Kneser-Ney; but any probabilities file can be supplied to
    load_probs.

    >>> prob_of(lookup(u'world'))
    -20.10340371976182
    '''
    pass


cpdef StringHash last3_of(size_t lex_id):
    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
    the hash of the last three characters of the word:

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).last3


cpdef bint is_oft_upper(size_t lex_id):
    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is found
    in all-upper case frequently in a large sample of text.  Users are free
    to load different data; by default we use a sample from Wikipedia, with
    a threshold of 0.95, picked to maximize mutual information for POS tagging.

    >>> is_oft_upper(lookup(u'abc'))
    True
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    return (<Lexeme*>lex_id).oft_upper


cpdef bint is_oft_title(size_t lex_id):
    '''Access the `oft_title' field of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is found
    title-cased frequently in a large sample of text.  Users are free
    to load different data; by default we use a sample from Wikipedia, with
    a threshold of 0.3, picked to maximize mutual information for POS tagging.

    >>> is_oft_title(lookup(u'marcus'))
    True
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    return (<Lexeme*>lex_id).oft_title
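The cluster_of docstring treats cluster IDs as binary addresses whose shared prefixes signal similarity; a small sketch of that comparison, using the bit-strings quoted in the docstring (the helper itself is an illustrative addition, not part of the commit):

    def shared_prefix(a, b):
        # Count the leading bits two Brown-cluster bit-strings have in common.
        n = 0
        while n < min(len(a), len(b)) and a[n] == b[n]:
            n += 1
        return n

    shared_prefix("100111110110", "100111100100")    # 6 shared bits: 'pineapple' vs 'apple'
    shared_prefix("100111110110", "01010111011001")  # 0 shared bits: 'pineapple' vs 'dapple'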
							
								
								
									
spacy/spacy.cpp (new file, 2064 lines)
(File diff suppressed because it is too large.)
							
								
								
									
spacy/spacy.pxd (new file, 5 lines)
@@ -0,0 +1,5 @@
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme


cpdef vector[size_t] expand_chunk(size_t addr) except *
							
								
								
									
spacy/spacy.pyx (new file, 72 lines)
@@ -0,0 +1,72 @@
from __future__ import unicode_literals
from spacy.lexeme cimport Lexeme


cpdef vector[size_t] expand_chunk(size_t addr) except *:
    cdef vector[size_t] tokens = vector[size_t]()
    word = <Lexeme*>addr
    while word is not NULL:
        tokens.push_back(<size_t>word)
        word = word.tail
    return tokens


"""
cpdef vector[size_t] ids_from_text(unicode text) except *:
    cdef size_t length = len(text)
    cdef Py_UNICODE* characters = <Py_UNICODE*>text

    cdef size_t i
    cdef Py_UNICODE c

    cdef vector[size_t] tokens = vector[size_t]()
    cdef unicode current = u''
    cdef Lexeme* token
    cdef int alnum_end = -1
    cdef size_t alnum_start = 0
    cdef bint seen_alnum = False
    for i in range(length):
        c = characters[i]
        if is_whitespace(c):
            token = <Lexeme*>lookup(current)
            tokens.push_back(<size_t>token)
            clitic = 0
            while token.clitics[clitic]:
                tokens.push_back(token.clitics[clitic])
                clitic += 1
            current = u''
            alnum_start = 0
            alnum_end = -1
            seen_alnum = False
        else:
            if not seen_alnum and c.isalnum():
                alnum_start = i
                seen_alnum = True
            elif seen_alnum and alnum_end == -1 and not c.isalnum():
                alnum_end = i
            current += c
    if current:
        token = <Lexeme*>lookup(current)
        tokens.push_back(<size_t>token)
        clitic = 0
        while token.clitics[clitic]:
            tokens.push_back(token.clitics[clitic])
            clitic += 1
    return tokens
"""

#cdef vector[Tokens] group_by(Tokens tokens, LexAttr field) except *:
#    pass


cdef inline bint is_whitespace(Py_UNICODE c):
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    if c == u' ':
        return True
    elif c == u'\n':
        return True
    elif c == u'\t':
        return True
    else:
        return False
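expand_chunk simply walks the tail pointers set up in _init_lexeme, so a chunk address expands into the addresses of its sub-tokens. A usage sketch mirroring the tests below (illustrative only; it assumes the extensions are built):

    from spacy import lex_of
    from spacy.spacy import expand_chunk
    from spacy.en import lookup, unhash

    tokens = expand_chunk(lookup(u'Hello!'))
    assert [unhash(lex_of(t)) for t in tokens] == [u'Hello', u'!']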
							
								
								
									
spacy/util.py (new file, 75 lines)
@@ -0,0 +1,75 @@
def utf8open(loc, mode='r'):
    return codecs.open(loc, mode, 'utf8')


def load_case_stats(data_dir):
    case_loc = path.join(data_dir, 'english.case')
    case_stats = {}
    with utf8open(case_loc) as cases_file:
        for line in cases_file:
            word, upper, title = line.split()
            case_stats[word] = (float(upper), float(title))
    return case_stats


def load_clitics(data_dir):
    clitics_loc = path.join(data_dir, 'clitics.txt')
    entries = []
    seen = set()
    with utf8open(clitics_loc) as clitics_file:
        for line in clitics_file:
            line = line.strip()
            if line.startswith('#'):
                continue
            if not line:
                continue
            clitics = line.split()
            word = clitics.pop(0)
            norm_form = clitics.pop(0)
            assert word not in seen, word
            seen.add(word)
            entries.append((word, norm_form, clitics))
    return entries


"""
    def load_browns(self, data_dir):
        cdef Lexeme* w
        case_stats = load_case_stats(data_dir)
        brown_loc = path.join(data_dir, 'bllip-clusters')
        assert path.exists(brown_loc)
        cdef size_t start
        cdef int end
        with utf8open(brown_loc) as browns_file:
            for i, line in enumerate(browns_file):
                cluster_str, word, freq_str = line.split()
                # Decode as a little-endian string, so that we can do & 15 to get
                # the first 4 bits. See redshift._parse_features.pyx
                cluster = int(cluster_str[::-1], 2)
                upper_pc, title_pc = case_stats.get(word.lower(), (0.0, 0.0))
                start = 0
                end = -1
                find_slice(&start, &end, word)
                print "Load", repr(word), start, end
                w = <Lexeme*>init_word(word, start, end, cluster,
                                       upper_pc, title_pc, int(freq_str))
                self.words[_hash_str(word)] = <size_t>w
                self.strings[<size_t>w] = word

    def load_clitics(self, data_dir):
        cdef unicode orig_str
        cdef unicode clitic
        for orig_str, norm_form, clitic_strs in util.load_clitics(data_dir):
            w = init_clitic(orig_str, <Lexeme*>self.lookup_slice(norm_form, 0, -1))
            self.words[w.orig] = <size_t>w
            self.strings[<size_t>w] = orig_str
            assert len(clitic_strs) < MAX_CLITICS
            assert clitic_strs
            for i, clitic in enumerate(clitic_strs):
                # If we write punctuation here, assume we want to keep it,
                # so tell it the slice boundaries (the full string)
                w.clitics[i] = self.lookup_slice(clitic, 0, -1)
            # Ensure we null terminate
            w.clitics[i+1] = 0
"""
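load_clitics expects whitespace-separated lines of the form "word norm_form clitic1 clitic2 ...", skipping blank lines and lines starting with #. A sketch of that parsing on made-up entries (the entries are hypothetical; note that util.py as committed uses codecs and path without importing them, so this sketch stays self-contained):

    # Hypothetical clitics entries, parsed the same way load_clitics parses each line.
    lines = [u"isnt   is    is n't",
             u"youre  you   you 're"]
    entries = []
    for line in lines:
        pieces = line.split()
        word, norm_form, clitics = pieces[0], pieces[1], pieces[2:]
        entries.append((word, norm_form, clitics))
    # entries[0] == (u'isnt', u'is', [u'is', u"n't"])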
							
								
								
									
										
tests/.test_tokenizer.py.swo (new file, binary)
(Binary file not shown.)

tests/my_test.py (new file, 0 lines)
							
								
								
									
tests/test_post_punct.py (new file, 48 lines)
@@ -0,0 +1,48 @@
from __future__ import unicode_literals

from spacy import lex_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash

import pytest


@pytest.fixture
def close_puncts():
    return [')', ']', '}', '*']


def test_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p
        token = lookup(string)
        assert unhash(lex_of(token)) == word_str
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p


def test_two_different_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p + "'"
        token = lookup(string)
        assert unhash(lex_of(token)) == word_str
        tokens = expand_chunk(token)
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p
        assert unhash(lex_of(tokens[2])) == "'"


def test_three_same_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p + p + p
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p + p + p
							
								
								
									
tests/test_pre_punct.py (new file, 50 lines)
@@ -0,0 +1,50 @@
from __future__ import unicode_literals

from spacy import lex_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash

import pytest


@pytest.fixture
def open_puncts():
    return ['(', '[', '{', '*']


def test_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == p
        assert unhash(lex_of(tokens[1])) == word_str


def test_two_different_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + "`" + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p
        tokens = expand_chunk(token)
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == p
        assert unhash(lex_of(tokens[1])) == "`"
        assert unhash(lex_of(tokens[2])) == word_str


def test_three_same_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + p + p + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p + p + p
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == p + p + p
        assert unhash(lex_of(tokens[1])) == word_str
							
								
								
									
tests/test_surround_punct.py (new file, 39 lines)
@@ -0,0 +1,39 @@
from __future__ import unicode_literals

from spacy import lex_of, sic_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash

import pytest


@pytest.fixture
def paired_puncts():
    return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


def test_token(paired_puncts):
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = open_ + word_str + close_
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == open_
        assert unhash(lex_of(tokens[1])) == word_str
        assert unhash(lex_of(tokens[2])) == close_
        assert unhash(sic_of(tokens[0])) == string


def test_two_different(paired_puncts):
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = "`" + open_ + word_str + close_ + "'"
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 5
        assert unhash(lex_of(tokens[0])) == "`"
        assert unhash(lex_of(tokens[1])) == open_
        assert unhash(lex_of(tokens[2])) == word_str
        assert unhash(lex_of(tokens[2])) == word_str
        assert unhash(lex_of(tokens[3])) == close_
        assert unhash(lex_of(tokens[4])) == "'"
							
								
								
									
tests/test_vocab.py (new file, 30 lines)
@@ -0,0 +1,30 @@
from __future__ import unicode_literals

from spacy import lex_of
from spacy.en import lookup
from spacy.en import unhash


def test_neq():
    addr = lookup('Hello')
    assert lookup('bye') != addr


def test_eq():
    addr = lookup('Hello')
    assert lookup('Hello') == addr


def test_round_trip():
    hello = lookup('Hello')
    assert unhash(lex_of(hello)) == 'Hello'


def test_case_neq():
    addr = lookup('Hello')
    assert lookup('hello') != addr


def test_punct_neq():
    addr = lookup('Hello')
    assert lookup('Hello,') != addr