* Pass OOV probability around

2025-07-13 01:32:32 +03:00 · 2015-07-25 23:29:51 +02:00 · 2015-07-25 23:29:51 +02:00 · fd525f0675
commit fd525f0675
parent 5b6bf4d4a6
3 changed files with 19 additions and 16 deletions
--- a/bin/init_model.py
+++ b/bin/init_model.py
@ -110,8 +110,8 @@ def _read_freqs(loc):
            smooth_count = counts.smoother(int(freq))
            log_smooth_count = math.log(smooth_count)
            probs[word] = math.log(smooth_count) - log_total
-    probs['-OOV-'] = math.log(counts.smoother(0)) - log_total
-    return probs
+    oov_prob = math.log(counts.smoother(0)) - log_total
+    return probs, oov_prob


 def _read_senses(loc):
@ -144,29 +144,30 @@ def setup_vocab(src_dir, dst_dir):
        print("Warning: Word vectors file not found")
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
-    probs = _read_probs(src_dir / 'words.sgt.prob')
+    probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
    if not probs:
-        probs = _read_freqs(src_dir / 'freqs.txt')
+        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
    if not probs:
-        min_prob = 0.0
+        oov_prob = 0.0
    else:
-        min_prob = min(probs.values())
+        oov_prob = min(probs.values())
    for word in clusters:
        if word not in probs:
-            probs[word] = min_prob
+            probs[word] = oov_prob

    lexicon = []
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
        entry = get_lex_props(word)
-        if word in clusters:
-            entry['prob'] = float(prob)
-            cluster = clusters.get(word, '0')
-            # Decode as a little-endian string, so that we can do & 15 to get
-            # the first 4 bits. See _parse_features.pyx
-            entry['cluster'] = int(cluster[::-1], 2)
-            vocab[word] = entry
+        entry['prob'] = float(prob)
+        cluster = clusters.get(word, '0')
+        # Decode as a little-endian string, so that we can do & 15 to get
+        # the first 4 bits. See _parse_features.pyx
+        entry['cluster'] = int(cluster[::-1], 2)
+        vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
+    with (dst_dir / 'oov_prob').open('w') as file_:
+        file_.write('%f' % oov_prob)


 def main(lang_data_dir, corpora_dir, model_dir):
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@ -31,6 +31,7 @@ cdef class Vocab:
    cdef readonly int length
    cdef public object _serializer
    cdef public object data_dir
+    cdef public float oov_prob

    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -37,7 +37,7 @@ cdef class Vocab:
    '''A map container for a language's LexemeC structs.
    '''
    def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
-                 pos_tags=None):
+                 pos_tags=None, oov_prob=-30):
        self.mem = Pool()
        self._by_hash = PreshMap()
        self._by_orth = PreshMap()
@ -61,6 +61,7 @@ cdef class Vocab:

        self._serializer = None
        self.data_dir = data_dir
+        self.oov_prob = oov_prob

    property serializer:
        def __get__(self):
@ -90,7 +91,7 @@ cdef class Vocab:
        if len(string) < 3:
            mem = self.mem
        lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
-        props = self.lexeme_props_getter(string)
+        props = self.lexeme_props_getter(string, self.oov_prob)
        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
        if is_oov:
            lex.id = 0