diff options
| author | TJ DeVries <devries.timothyj@gmail.com> | 2020-07-31 00:05:22 -0400 |
|---|---|---|
| committer | TJ DeVries <devries.timothyj@gmail.com> | 2020-07-31 00:05:22 -0400 |
| commit | fa0382d93e73b66e7ec769cec27b9fbb21020641 (patch) | |
| tree | 624d5dc3de80426956a1c46447f1f26443a87a64 /scratch/ngrams.lua | |
| parent | ababfbfd88334ca6d94d5d0a8b6324dd6600d602 (diff) | |
Streamed some refactoring. More work to do
Diffstat (limited to 'scratch/ngrams.lua')
| -rw-r--r-- | scratch/ngrams.lua | 87 |
1 files changed, 87 insertions, 0 deletions
-- scratch/ngrams.lua
--
-- Streams a bzip2-compressed XML dump through `bzcat`, tokenizes the body of
-- each <text> element, counts every n-gram of 1..DEPTH consecutive kept words
-- and writes the counts as "ngram<TAB>count" lines to OUTPUT_PATH.
-- Processing stops after MAX_DOCS documents or at end of stream, whichever
-- comes first.

local DUMP_PATH = '/home/tj/Downloads/pages.xml.bz2'
local OUTPUT_PATH = '/tmp/freqs.lua.txt'
local DEPTH = 5            -- longest n-gram, in words
local MAX_DOCS = 500       -- stop after this many <text> elements
local PROGRESS_EVERY = 10  -- print a progress line every N documents

--- Count all n-grams of up to DEPTH words in a token list.
-- @param counts table mapping n-gram string -> count; updated in place
-- @param doc    sequence of tokens; a falsy entry (`false`) marks a position
--               where a word was filtered out, and no n-gram may span it
local function ngrams(counts, doc)
  local doc_len = #doc
  local min, concat = math.min, table.concat
  -- Start positions run through doc_len so the final word is counted too
  -- (the previous bound of doc_len - 1 dropped it).
  for i = 1, doc_len do
    for j = i, min(i + DEPTH - 1, doc_len) do
      -- Stop extending at a filtered-out word; this also guarantees that
      -- concat below only ever sees strings.
      if not doc[j] then break end
      local key = concat(doc, " ", i, j)
      counts[key] = (counts[key] or 0) + 1
    end
  end
end

--- Build a lookup set from a list of values.
-- @param t list of values
-- @return table mapping each value to `true`
local function set(t)
  local s = {}
  for _, v in pairs(t) do s[v] = true end
  return s
end

-- Words that are never counted (markup residue and very common words).
local bad = set({
  'after', 'also', 'article', 'date', 'defaultsort', 'external', 'first',
  'from', 'have', 'html', 'http', 'image', 'infobox', 'links', 'name',
  'other', 'preserve', 'references', 'reflist', 'space', 'that', 'this',
  'title', 'which', 'with', 'quot', 'ref', 'amp', 'ndash', 'www', 'cite',
  'nbsp', 'style', 'text', 'align', 'center', 'background',
})

--- Report whether a word contains at least one decimal digit.
-- @param w string
-- @return truthy (index of the first digit) if a digit is present, else nil
local function isnumber(w)
  -- `local` is required: a bare assignment here would leak globals.
  local first = w:find("[0-9]+")
  return first
end

--- Split text into lower-cased alphanumeric tokens.
-- Words that are too short, too long, in the `bad` set or contain a digit are
-- replaced by `false` so that `ngrams` does not join the words around them.
-- (`table.insert(t, nil)` stores nothing, so nil cannot serve as the marker.)
-- @param content string
-- @return sequence of strings and `false` markers
local function tokenize(content)
  local words = {}
  for word in content:gmatch("%w+") do
    word = word:lower()
    local keep = #word >= 3 and #word < 12
      and not bad[word] and not isnumber(word)
    words[#words + 1] = keep and word or false
  end
  return words
end

--- Write the n-gram counts as tab-separated lines.
-- @param path   output file path
-- @param counts table mapping n-gram string -> count
local function write_counts(path, counts)
  local f = assert(io.open(path, 'w'))
  for k, v in pairs(counts) do
    f:write(k, '\t', v, '\n')
  end
  f:close()
end

local bz = assert(io.popen('bzcat ' .. DUMP_PATH))
local title = ""      -- most recent <title>; recorded but not otherwise used
local chunks = {}     -- pieces of the current <text> body, one per input line
local inText = false  -- true while between <text ...> and </text>

local numDocs = 0
local globalCounts = {}

for line in bz:lines() do
  local mTitle = line:match("<title>(.*)</title>")
  local bText = line:match("<text[^>]*>([^<]*)")
  local eText = line:find("</text>", 1, true)

  if mTitle then
    title = mTitle
  elseif bText then
    chunks = { bText }
    inText = true
  elseif inText then
    -- Keep only what precedes the closing tag on the final line.
    chunks[#chunks + 1] = eText and line:sub(1, eText - 1) or line
  end

  if eText then
    -- Join with a newline: lines() strips line ends, and gluing lines
    -- together directly would merge the words on either side of the break.
    ngrams(globalCounts, tokenize(table.concat(chunks, "\n")))
    chunks = {}
    inText = false

    numDocs = numDocs + 1
    if numDocs % PROGRESS_EVERY == 0 then
      io.write(string.format("Working... %d documents processed.\r", numDocs))
      io.flush()
    end

    if numDocs >= MAX_DOCS then break end
  end
end

bz:close()

-- Written on both exits from the loop, so a dump with fewer than MAX_DOCS
-- documents still produces output.
write_counts(OUTPUT_PATH, globalCounts)
