diff options
| author | TJ DeVries <devries.timothyj@gmail.com> | 2020-07-31 00:05:22 -0400 |
|---|---|---|
| committer | TJ DeVries <devries.timothyj@gmail.com> | 2020-07-31 00:05:22 -0400 |
| commit | fa0382d93e73b66e7ec769cec27b9fbb21020641 (patch) | |
| tree | 624d5dc3de80426956a1c46447f1f26443a87a64 /scratch | |
| parent | ababfbfd88334ca6d94d5d0a8b6324dd6600d602 (diff) | |
Streamed some refactoring. More work to do
Diffstat (limited to 'scratch')
| -rw-r--r-- | scratch/bk_tree.lua | 270 | ||||
| -rw-r--r-- | scratch/file_finder.lua | 13 | ||||
| -rw-r--r-- | scratch/fuzzy_tester.lua | 8 | ||||
| -rw-r--r-- | scratch/ngrams.lua | 87 | ||||
| -rw-r--r-- | scratch/picker_locations.lua | 4 | ||||
| -rwxr-xr-x | scratch/slow_proc.sh | 8 |
6 files changed, 380 insertions, 10 deletions
diff --git a/scratch/bk_tree.lua b/scratch/bk_tree.lua new file mode 100644 index 0000000..3957224 --- /dev/null +++ b/scratch/bk_tree.lua @@ -0,0 +1,270 @@ + +--------------------------- +-- bk-tree datastructure +-- +-- http://en.wikipedia.org/wiki/BK-tree +-- @module bk-tree +-- @author Robin Hübner +-- robinhubner@gmail.com +-- @release version 1.0.2 +-- @license MIT + +local bk_tree = {} + + +local function lazy_copy(t1) + + local cp = {} + + for k, v in pairs(t1) do + cp[k] = v + end + + return cp + +end + +local function min(a, b, c) + + local min_val = a + + if b < min_val then min_val = b end + if c < min_val then min_val = c end + + return min_val + +end + +---------------------------------- +--- Levenshtein distance function. +-- @tparam string s1 +-- @tparam string s2 +-- @treturn number the levenshtein distance +-- @within Metrics +function bk_tree.levenshtein_dist(s1, s2) + + if s1 == s2 then return 0 end + if s1:len() == 0 then return s2:len() end + if s2:len() == 0 then return s1:len() end + if s1:len() < s2:len() then s1, s2 = s2, s1 end + + t = {} + for i=1, #s1+1 do + t[i] = {i-1} + end + + for i=1, #s2+1 do + t[1][i] = i-1 + end + + local cost + for i=2, #s1+1 do + + for j=2, #s2+1 do + cost = (s1:sub(i-1,i-1) == s2:sub(j-1,j-1) and 0) or 1 + t[i][j] = min( + t[i-1][j] + 1, + t[i][j-1] + 1, + t[i-1][j-1] + cost) + end + + end + + return t[#s1+1][#s2+1] + +end + +function bk_tree.hook(param) + + local name, callee = debug.getlocal(2, 1) + local f = debug.getinfo(2, "f").func + local p = debug.getinfo(3, "f").func + --[[ previous function in the callstack, if called from the same place, + don't add to the insert/remove counters. ]]-- + + if f == bk_tree.insert and p ~= bk_tree.insert then + callee.stats.nodes = callee.stats.nodes + 1 + elseif f == bk_tree.remove and p ~= bk_tree.remove then + callee.stats.nodes = callee.stats.nodes - 1 + elseif f == bk_tree.query and p == bk_tree.query then + callee.stats.queries = callee.stats.queries + 1 + end + +end + +--- Hooks debugging into tree execution. +-- Keeps track of number of nodes created, queries made, +-- note that this must be run directly after tree is created +-- in order to get correct information. +-- @within Debug +--- @usage +-- local bktree = require "bk-tree" +-- local tree = bktree:new("word") +-- tree:debug() +-- tree:insert("perceive") +-- tree:insert("beautiful") +-- tree:insert("definitely") +-- local result = tree:query("definately", 3) +-- tree:print_stats() +-- +-- -- output +-- Nodes: 4 +-- Queries: 3 +-- Nodes Queried: 75% +function bk_tree:debug() + + local nc = 0 + if self.root then nc = 1 end + self.stats = { nodes = nc, queries = 0 } + debug.sethook(self.hook, "c") + +end + +--- Print execution stats. +-- Prints nodes queried and total nodes, as well as a fraction of +-- nodes visited to satisfy the query, resets the counter of nodes queried when called. +-- @within Debug +-- @see debug +function bk_tree:print_stats() + + print("\nNodes: " .. self.stats.nodes) + print("Queries: " .. self.stats.queries) + print("Nodes Queried: " .. self.stats.queries/self.stats.nodes*100 .. "%\n") + self.stats.queries = 0 + +end + +--- Fetch execution stats. +-- Returns a copy of the execution stats that @{print_stats} would print, requires debug to have been enabled +-- to not just return defaults. Useful if you want to profile things. +-- @within Debug +-- @return {key = value,...} +function bk_tree:get_stats() + + return lazy_copy(self.stats) + +end + +--------------------------- +--- Creates a new bk-tree. +-- @constructor +-- @string[opt] root_word the root of the new tree +-- @tparam[opt=levenshtein_dist] function dist_func the distance function used +-- @see levenshtein_dist +-- @return the new bk-tree instance +--- @usage +-- local bktree = require "bk-tree" +-- local tree = bktree:new("word") +function bk_tree:new(root_word, dist_func) + + local n_obj = {} + if root_word then n_obj.root = { str = root_word, children = {} } end + n_obj.dist_func = dist_func or self.levenshtein_dist + + setmetatable(n_obj, self) + self.__index = self + + return n_obj + +end + +-------------------------------- +--- Inserts word into the tree. +-- @string word +-- @treturn bool true if inserted, false if word already exists in tree +--- @usage +-- local bktree = require "bk-tree" +-- local tree = bktree:new("root") +-- local success = tree:insert("other_word") +function bk_tree:insert(word, node) + + node = node or self.root + + if not node then + self.root = { str = word, children = {} } + return true + end + + local dist = self.dist_func(word, node.str) + if dist == 0 then return false end + + local some_node = node.children[dist] + + if not some_node then + node.children[dist] = { str = word, children = {} } + return true + end + + return self:insert(word, some_node) + +end + +-------------------------------- +--- Query the tree for matches. +-- @string word +-- @tparam number n max edit distance to use when querying +-- @treturn {{str=string,distance=number},....} table of tables with matching words, empty table if no matches +--- @usage +-- local bktree = require "bk-tree" +-- local tree = bktree:new("word") +-- tree:insert("hello") +-- tree:insert("goodbye") +-- tree:insert("woop") +-- local result = tree:query("woop", 1) +function bk_tree:query(word, n, node, matches) + + node = node or self.root + matches = matches or {} + + if not node then return matches end + + local dist = self.dist_func(word, node.str) + if dist <= n then matches[#matches+1] = {str = node.str, distance = dist} end + + for k, child in pairs(node.children) do + if child ~= nil then + if k >= dist-n and k <= dist+n then + self:query(word, n, child, matches) + end + end + end + + return matches + +end + +--------------------------------------------------------- +--- Queries the the tree for a match, sorts the results. +-- Calls @{query} and returns the results sorted. +-- @string word +-- @tparam number n max edit distance to use when querying +-- @treturn {{str=string,distance=number},....} table of tables with matching words sorted by distance, empty table if no matches +--- @usage +-- local bktree = require "bk-tree" +-- local tree = bktree:new("word") +-- tree:insert("woop") +-- tree:insert("worp") +-- tree:insert("warp") +-- local result = tree:query_sorted("woop", 3) +function bk_tree:query_sorted(word, n) + + local result = self:query(word, n) + + table.sort(result, function(a,b) return a.distance < b.distance end) + + return result + +end + + +local tree = bk_tree:new("word") +tree:insert("hello") +tree:insert("welp") +tree:insert("function") +tree:insert("this long line what") +tree:insert("what this long line what") +print(vim.inspect(tree)) +print(vim.inspect(tree:query_sorted("what", 3))) + + +return bk_tree diff --git a/scratch/file_finder.lua b/scratch/file_finder.lua index 8b5bb7b..ae84e2f 100644 --- a/scratch/file_finder.lua +++ b/scratch/file_finder.lua @@ -1,3 +1,4 @@ +package.loaded['telescope.pickers'] = nil local telescope = require('telescope') -- Goals: @@ -61,15 +62,15 @@ local file_sorter = telescope.sorters.new { if prompt == '' then return 0 end if not line then return -1 end - local dist = string_distance(prompt, line) - -- if dist > (0.75 * #line) and #prompt > 3 then - -- return -1 - -- end - - return dist + return tonumber(vim.fn.systemlist(string.format( + "echo '%s' | ~/tmp/fuzzy_test/target/debug/fuzzy_test '%s'", + line, + prompt + ))[1]) end } + local file_previewer = telescope.previewers.vim_buffer local file_picker = telescope.pickers.new { diff --git a/scratch/fuzzy_tester.lua b/scratch/fuzzy_tester.lua new file mode 100644 index 0000000..0971381 --- /dev/null +++ b/scratch/fuzzy_tester.lua @@ -0,0 +1,8 @@ + +line = "hello" +prompt = "h" +print(vim.inspect(vim.fn.systemlist(string.format( + "echo '%s' | ~/tmp/fuzzy_test/target/debug/fuzzy_test '%s'", + line, + prompt +)))) diff --git a/scratch/ngrams.lua b/scratch/ngrams.lua new file mode 100644 index 0000000..8b763a8 --- /dev/null +++ b/scratch/ngrams.lua @@ -0,0 +1,87 @@ + +local function ngrams(counts, doc) + local DEPTH = 5 + local docLen = #doc + local min, concat = math.min, table.concat + for i = 1, docLen - 1 do + for j = i, min(i + DEPTH - 1, docLen) do + if not doc[j] then break end + local k = concat(doc, " ", i, j) + counts[k] = (counts[k] or 0) + 1 + end + end +end + + +local bz = io.popen('bzcat /home/tj/Downloads/pages.xml.bz2') +local title, content = "", "" +local inText = false + +local numDocs = 0 +local globalCounts = {} + +local function set(t) + local s = {} + for _, v in pairs(t) do s[v] = true end + return s +end + +local bad = set({ + 'after', 'also', 'article', 'date', 'defaultsort', 'external', 'first', 'from', + 'have', 'html', 'http', 'image', 'infobox', 'links', 'name', 'other', 'preserve', + 'references', 'reflist', 'space', 'that', 'this', 'title', 'which', 'with', + 'quot', 'ref', 'name', 'http', 'amp', 'ndash', 'www', 'cite', 'nbsp', + 'style', 'text', 'align', 'center', 'background' + }) + +local function isnumber(w) + s, e = w:find("[0-9]+") + return s +end + +for line in bz:lines() do + local _, _, mTitle = line:find("<title>(.*)</title>") + local _, _, bText = line:find("<text[^>]*>([^<]*)") + local eText, _ = line:find("</text>") + + if mTitle then + title = mTitle + elseif bText then + content = bText + inText = true + elseif inText then + content = content .. line + end + + if eText then + words = {} + for v in content:gmatch("%w+") do + v = v:lower() + if #v >= 3 and #v < 12 and not bad[v] and not isnumber(v) then + table.insert(words, v) + else + table.insert(words, nil) + end + end + + ngrams(globalCounts, words) + inText = false + + numDocs = numDocs + 1 + if numDocs % 10 == 0 then + io.write(string.format("Working... %d documents processed.\r", numDocs)) + io.flush() + end + + if numDocs == 500 then + local f = io.open('/tmp/freqs.lua.txt', 'w') + for k, v in pairs(globalCounts) do + f:write(k, '\t', v, '\n') + end + f:close() + + globalCounts = {} + os.exit(0) + end + end +end diff --git a/scratch/picker_locations.lua b/scratch/picker_locations.lua new file mode 100644 index 0000000..82fe7f7 --- /dev/null +++ b/scratch/picker_locations.lua @@ -0,0 +1,4 @@ +package.loaded['telescope'] = nil +package.loaded['telescope.init'] = nil +package.loaded['telescope.picker'] = nil +package.loaded['telescope.finder'] = nil diff --git a/scratch/slow_proc.sh b/scratch/slow_proc.sh index bbcf8bf..580e617 100755 --- a/scratch/slow_proc.sh +++ b/scratch/slow_proc.sh @@ -1,10 +1,10 @@ echo "hello" sleep 1 -echo "cool" +echo "help" sleep 1 -echo "world" +echo "hi" sleep 1 -echo "x" +echo "husband" sleep 1 -echo "y" +echo "helper" |
