summaryrefslogtreecommitdiff
path: root/scratch/ngrams.lua
blob: 8b763a87cae4f9afe45241e5018fc5305e9cf8cf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87

--- Count every word n-gram of a document, from unigrams up to `depth` words.
-- Each n-gram is keyed by its words joined with single spaces and its
-- counter in `counts` is incremented in place.
--
-- A falsy entry in `doc` acts as a break marker: no n-gram may start on it
-- or extend across it. Because the length operator is only well defined
-- for sequences, break markers must be `false`, not `nil`.
--
-- @tparam table counts map from n-gram string to occurrence count (mutated)
-- @tparam table doc sequence of words (strings) and `false` break markers
-- @tparam[opt=5] number depth maximum number of words per n-gram
local function ngrams(counts, doc, depth)
  depth = depth or 5
  local docLen = #doc
  local min = math.min
  -- Start positions run through the final word so that it is counted as a
  -- unigram like every other word.
  for i = 1, docLen do
    local key
    for j = i, min(i + depth - 1, docLen) do
      local word = doc[j]
      if not word then break end
      -- Grow the key one word at a time instead of re-joining doc[i..j].
      key = key and (key .. " " .. word) or word
      counts[key] = (counts[key] or 0) + 1
    end
  end
end


-- Decompressed dump, streamed from a bzcat child process. assert() makes a
-- failed popen stop the script with the system error message instead of a
-- later "attempt to index a nil value".
local bz = assert(io.popen('bzcat /home/tj/Downloads/pages.xml.bz2'))

-- Parser state shared with the main loop below:
--   title   - most recently seen <title> value (stored, never read here)
--   content - text of the <text> element currently being collected
--   inText  - true while between an opening <text> tag and its </text>
local title, content = "", ""
local inText = false

-- Number of documents processed so far, and the n-gram counts accumulated
-- across all of them.
local numDocs = 0
local globalCounts = {}

--- Build a membership table from the values of `t`.
-- @tparam table t table whose values become the members
-- @treturn table new table mapping each value of `t` to `true`
local function set(t)
  local members = {}
  for _, item in pairs(t) do
    members[item] = true
  end
  return members
end

-- Words that the main loop refuses to accept as n-gram tokens.
-- Lookup is by lower-cased word: bad[word] is true for every entry below.
local bad = set({
  'after', 'also', 'article', 'date',
  'defaultsort', 'external', 'first', 'from',
  'have', 'html', 'http', 'image',
  'infobox', 'links', 'name', 'other',
  'preserve', 'references', 'reflist', 'space',
  'that', 'this', 'title', 'which',
  'with', 'quot', 'ref', 'amp',
  'ndash', 'www', 'cite', 'nbsp',
  'style', 'text', 'align', 'center',
  'background',
})

--- Report whether a word contains at least one ASCII digit.
-- Despite the name, the word does not have to be entirely numeric; a
-- single digit anywhere is enough.
-- @tparam string w word to inspect
-- @return index of the first digit in `w` (truthy), or nil if there is none
local function isnumber(w)
  -- Keep the result local: assigning to undeclared names would create
  -- globals on every call.
  local first = w:find("[0-9]")
  return first
end

-- Main driver: stream the dump line by line, collect the body of each
-- <text> element, tokenize it, and fold its n-grams into globalCounts.
-- Processing stops after MAX_DOCS documents or at end of input, whichever
-- comes first, and the accumulated counts are then written to OUTPUT_PATH
-- as "ngram<TAB>count" lines.
local MAX_DOCS = 500
local OUTPUT_PATH = '/tmp/freqs.lua.txt'

-- Lines of the <text> element currently being read. They are joined once
-- at the closing tag, which avoids quadratic string concatenation and keeps
-- a separator between lines so words on adjacent lines cannot fuse.
local textLines = {}

for line in bz:lines() do
  local mTitle = line:match("<title>(.*)</title>")
  local bText = line:match("<text[^>]*>([^<]*)")
  local eText = line:find("</text>", 1, true)

  if mTitle then
    title = mTitle
  elseif bText then
    -- Opening tag: start a fresh document with whatever follows the tag.
    textLines = { bText }
    inText = true
  elseif inText then
    -- Continuation line; on the closing line keep only what precedes the tag.
    textLines[#textLines + 1] = eText and line:sub(1, eText - 1) or line
  end

  -- Only finish a document that was actually opened, so a stray closing
  -- tag cannot re-count the previous document.
  if eText and inText then
    content = table.concat(textLines, "\n")

    local words = {}
    for v in content:gmatch("%w+") do
      v = v:lower()
      if #v >= 3 and #v < 12 and not bad[v] and not isnumber(v) then
        words[#words + 1] = v
      else
        -- Rejected tokens become `false` break markers so that n-grams do
        -- not span them. Storing nil would be a no-op and silently drop
        -- the break.
        words[#words + 1] = false
      end
    end

    ngrams(globalCounts, words)
    inText = false

    numDocs = numDocs + 1
    if numDocs % 10 == 0 then
      io.write(string.format("Working... %d documents processed.\r", numDocs))
      io.flush()
    end

    if numDocs == MAX_DOCS then
      break
    end
  end
end

bz:close()

-- Written after the loop so that a dump with fewer than MAX_DOCS documents
-- still produces output.
local f = assert(io.open(OUTPUT_PATH, 'w'))
for k, v in pairs(globalCounts) do
  f:write(k, '\t', v, '\n')
end
f:close()