#!/usr/bin/env ruby
require 'find'
require 'kconv'
# 'ftools' (File.makedirs) was removed from the standard library in Ruby 1.9;
# FileUtils provides the equivalent mkdir_p.
require 'fileutils'

# Extracts the lines of a file that should be fed to the word-frequency
# counter. Mail messages get special treatment (only selected headers plus
# the body are kept); every other file is returned verbatim.
class TextRetriever
  # A path whose last component is all digits, somewhere below a /Mail/ directory.
  MAIL_PATH = %r{/Mail/.*/\d+$}
  # Only these headers are kept from a mail message.
  KEPT_HEADER = /^(Subject|To|From)/
  # Reading stops after a line announcing a base* transfer encoding.
  ENCODED_PART = /Content-Transfer-Encoding.*base/

  # Appends +s+, converted to JIS, to the array +a+ when it is one of the
  # kept headers. Anything else is ignored.
  def self.add(a, s)
    a.push(s.tojis) if s =~ KEPT_HEADER
  end

  # Returns the interesting lines of the mail message +file+: the
  # Subject/To/From headers (continuation lines joined, converted to JIS)
  # followed by the body lines, without line terminators. Body reading
  # stops right after the first line matching ENCODED_PART.
  def self.mailtext(file)
    buf = []
    # Binary mode: messages may hold bytes that are not valid in the default
    # external encoding, and regexp matching raises on such strings.
    File.open(file, 'rb') do |f|
      pending = ''      # header currently being assembled
      in_header = true
      f.each_line do |line|
        line.chomp!
        if !in_header
          buf.push(line)
          break if line =~ ENCODED_PART
        elsif line.empty?
          # Blank line ends the header section. Flush exactly once (the
          # earlier code flushed the final header twice here).
          add(buf, pending)
          pending = ''
          in_header = false
        elsif line =~ /^\s+(.*)$/
          # Folded continuation line: glue it onto the pending header.
          pending += Regexp.last_match(1)
        else
          add(buf, pending)
          pending = line
        end
      end
      # A file that ends inside the header section still has one header pending.
      add(buf, pending) if in_header
    end
    buf
  end

  # Returns every line of +file+, line terminators included.
  def self.normaltext(file)
    File.open(file, 'rb', &:readlines)
  end

  # Returns the lines to index for +file+, choosing the mail reader for
  # paths that match MAIL_PATH and the plain reader otherwise.
  def self.text(file)
    file =~ MAIL_PATH ? mailtext(file) : normaltext(file)
  end
end

# Walks a directory below $HOME and, for each indexable file, writes a
# "frequency file" (one "<count> <word>" line per distinct token produced
# by mecab) under <getaroot>/freqfiles/, mirroring the path relative to $HOME.
class FreqfileGenerator
  # Files that are never indexed even though their extension qualifies.
  EXCLUDED = %r{/(cw\.c|xr\.c)$}
  # Files below a /PIM/ directory named with 14 digits and a .txt extension.
  PIM_PATH = %r{/PIM/.*/\d{14}\.txt$}
  # Extensions that are indexed.
  TEXT_EXT = /\.(c|h|pl|rb|tex|txt|html)$/
  # Shell pipeline that converts its input to EUC-JP and tokenizes it.
  MECAB_COMMAND = '/usr/local/bin/nkf -e | /usr/local/bin/mecab'

  # +getaroot+ is the directory under which the freqfiles/ tree is written.
  # Raises KeyError when the HOME environment variable is not set.
  def initialize(getaroot)
    @getaroot = getaroot
    @home = ENV.fetch('HOME')
    @offset = @home.length + 1 # skip "<home>/" when computing relative paths
  end

  # Returns a truthy value when +file+ is a regular file that should be
  # indexed, and a falsy value otherwise.
  def validtext(file)
    return nil if file =~ EXCLUDED

    File.file?(file) &&
      [TextRetriever::MAIL_PATH, PIM_PATH, TEXT_EXT].any? { |re| file =~ re }
  end

  # Regenerates the frequency file of every indexable file below
  # $HOME/+dir+ whose frequency file is missing or older than the file
  # itself. Prints the path of each frequency file it (re)creates.
  def generate(dir)
    Find.find("#{@home}/#{dir}") do |file|
      next unless validtext(file)

      # Whole relative path; the old 1000-character slice truncated long paths.
      freqfile = "#{@getaroot}/freqfiles/#{file[@offset..-1]}"
      next if File.file?(freqfile) && File.mtime(freqfile) >= File.mtime(file)

      puts freqfile
      create_freqfile(file, freqfile)
    end
  end

  # Tokenizes +file+ with mecab and writes the token counts to +freqfile+,
  # creating the parent directories as needed. The first output line is
  # "@<file>", followed by one "<count> <word>" line per distinct token.
  def create_freqfile(file, freqfile)
    # NOTE(review): the name is predictable and lives in a shared directory;
    # kept as-is for compatibility, consider Tempfile.
    tmpfile = "/tmp/mecabtmp#{Process.pid}"
    begin
      tokenize(TextRetriever.text(file), tmpfile)
      FileUtils.mkdir_p(File.dirname(freqfile))
      freq = File.open(tmpfile, 'rb') { |input| count_words(input) }
      write_freqfile(file, freqfile, freq)
    ensure
      # Remove the temp file even when a step above raised.
      File.delete(tmpfile) if File.exist?(tmpfile)
    end
  end

  private

  # Pipes +lines+ through MECAB_COMMAND, redirecting its output to +tmpfile+.
  def tokenize(lines, tmpfile)
    IO.popen("#{MECAB_COMMAND} > #{tmpfile}", 'w') do |mecab|
      lines.each { |line| mecab.puts line }
    end
  end

  # Counts the first whitespace-delimited field of every line of +io+,
  # skipping the "EOS" marker lines. Returns a Hash of word => count.
  def count_words(io)
    freq = Hash.new(0)
    io.each_line do |line|
      next if line =~ /^EOS$/

      freq[line.gsub(/\s.*$/, '')] += 1
    end
    freq
  end

  # Writes the "@<file>" header and the "<count> <word>" lines to +freqfile+.
  def write_freqfile(file, freqfile, freq)
    File.open(freqfile, 'wb') do |out|
      out.puts "@#{file}"
      freq.each do |word, count|
        # Ported from the Ruby 1.8 check `word[0] == 0x0a`, which compared
        # the first byte; on 1.9+ word[0] is a String and never equals 0x0a.
        next if word.getbyte(0) == 0x0a

        out.puts "#{count} #{word}"
      end
    end
  end
end
# Entry point: the first command-line argument is the root directory under
# which the freqfiles/ tree is written. Without it there is nothing to do.
irroot = ARGV[0]
exit unless irroot

fg = FreqfileGenerator.new(irroot)
# Only the DOC tree is indexed; the other trees are currently disabled.
# fg.generate('Mail/inbox')
fg.generate('DOC')
# fg.generate('PIM')