diff options
Diffstat (limited to 'scratch/ngrams.lua')
| -rw-r--r-- | scratch/ngrams.lua | 87 |
1 files changed, 0 insertions, 87 deletions
-- scratch/ngrams.lua
--
-- Streams a bzip2-compressed XML page dump through `bzcat`, extracts the body
-- of each <text> element, and counts word n-grams (1 to 5 words long) over the
-- first MAX_DOCS documents. The counts are written to OUTPUT_PATH as
-- tab-separated "ngram<TAB>count" lines, in no particular order.

local INPUT_CMD = 'bzcat /home/tj/Downloads/pages.xml.bz2'
local OUTPUT_PATH = '/tmp/freqs.lua.txt'
local MAX_DOCS = 500

local min, concat = math.min, table.concat

--- Add every n-gram of up to DEPTH consecutive words in `doc` to `counts`.
-- @param counts table mapping space-joined n-gram -> count; updated in place
-- @param doc    sequence of words; a `false` entry stands for a filtered-out
--               word, and no n-gram may contain or span such an entry
local function ngrams(counts, doc)
  local DEPTH = 5
  local docLen = #doc
  -- Start positions run through the final word so that it is also counted
  -- as a unigram.
  for i = 1, docLen do
    for j = i, min(i + DEPTH - 1, docLen) do
      -- Stop extending at a filtered-out word. Everything in doc[i..j-1] is
      -- a string at this point, so concat never sees the placeholder.
      if not doc[j] then break end
      local k = concat(doc, " ", i, j)
      counts[k] = (counts[k] or 0) + 1
    end
  end
end

--- Build a lookup set (value -> true) from the values of `t`.
local function set(t)
  local s = {}
  for _, v in pairs(t) do s[v] = true end
  return s
end

-- Words that are never counted (the list contains a few harmless duplicates).
local bad = set({
  'after', 'also', 'article', 'date', 'defaultsort', 'external', 'first', 'from',
  'have', 'html', 'http', 'image', 'infobox', 'links', 'name', 'other', 'preserve',
  'references', 'reflist', 'space', 'that', 'this', 'title', 'which', 'with',
  'quot', 'ref', 'name', 'http', 'amp', 'ndash', 'www', 'cite', 'nbsp',
  'style', 'text', 'align', 'center', 'background'
})

--- Return true when `w` contains at least one ASCII digit.
local function isnumber(w)
  return w:find("[0-9]") ~= nil
end

--- Split `content` into lower-cased alphanumeric words.
-- Words shorter than 3 or longer than 11 characters, words in `bad`, and
-- words containing a digit are replaced by `false`. The placeholder keeps the
-- result a proper sequence (appending nil would add nothing) and lets
-- `ngrams` break at that position.
-- @param content string
-- @return sequence of strings and `false` placeholders
local function tokenize(content)
  local words = {}
  for v in content:gmatch("%w+") do
    v = v:lower()
    if #v >= 3 and #v < 12 and not bad[v] and not isnumber(v) then
      words[#words + 1] = v
    else
      words[#words + 1] = false
    end
  end
  return words
end

--- Write `counts` to `path`, one "ngram<TAB>count" line per entry.
-- Raises an error if the file cannot be opened for writing.
local function writeCounts(path, counts)
  local f = assert(io.open(path, 'w'))
  for k, v in pairs(counts) do
    f:write(k, '\t', v, '\n')
  end
  f:close()
end

local bz = assert(io.popen(INPUT_CMD))
local parts = {}        -- lines of the <text> element currently being read
local inText = false
local numDocs = 0
local globalCounts = {}

for line in bz:lines() do
  local bText = line:match("<text[^>]*>([^<]*)")

  if line:find("<title>.*</title>") then
    -- Title lines are never appended to the document text.
  elseif bText then
    parts = { bText }
    inText = true
  elseif inText then
    parts[#parts + 1] = line
  end

  if line:find("</text>", 1, true) then
    -- lines() strips the line terminator, so join with "\n" to keep the last
    -- word of one line from fusing with the first word of the next.
    ngrams(globalCounts, tokenize(concat(parts, "\n")))
    inText = false

    numDocs = numDocs + 1
    if numDocs % 10 == 0 then
      io.write(string.format("Working... %d documents processed.\r", numDocs))
      io.flush()
    end

    if numDocs == MAX_DOCS then break end
  end
end

bz:close()

-- Written both when the document limit is reached and when the input ends
-- before the limit, so a short input still produces output.
writeCounts(OUTPUT_PATH, globalCounts)
