 # Analyzing the stdlib

I took my lexical complexity estimator and ran it against the standard library again. I aggregated the results by "functional units" (e.g. all the files under test/, which belong to Test::Unit, go into the "test" group) and generated a few graphs...

[Graph: functional units ordered by their average complexity]

[Graph: complexity vs. amount of code]

### Improved code

I simplified the lexical complexity estimator by getting rid of the part that computed stationary distributions from the transition matrix, using the observed token frequencies instead. It's a pity, because in doing so I left behind most of the maths, however simple (/me hears sighs of relief). Some consequences:

• it runs almost an order of magnitude faster
• results differ the most for small files, due to the edge effect associated with the last token; but this doesn't really matter, because those are the files where the Markov process is more likely to be periodic anyway (see the sketch below)...
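
For the record, this is roughly the kind of computation that got dropped: the stationary distribution μ of an aperiodic, irreducible Markov chain satisfies μP = μ and can be approximated by power iteration on the transition matrix P. A minimal sketch, not the original code (the `stationary_distribution` name and the convergence parameters are mine):

```
require 'matrix'

# Approximate the stationary distribution of a Markov chain by power
# iteration: start from the uniform distribution and multiply repeatedly
# by the transition matrix until the vector stops changing. Converges
# only when the chain is aperiodic and irreducible -- exactly what tiny
# files tend to violate.
def stationary_distribution(t_matrix, max_iterations = 1000, eps = 1e-9)
  n = t_matrix.row_size
  mu = Array.new(n, 1.0 / n)
  max_iterations.times do
    nxt = (0...n).map do |j|
      (0...n).inject(0.0){|s, i| s + mu[i] * t_matrix[i, j] }
    end
    converged = mu.zip(nxt).all?{|a, b| (a - b).abs < eps}
    mu = nxt
    break if converged
  end
  mu
end
```

The simplified estimator just uses the observed token frequencies where μ would have appeared. The full (simplified) script follows: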
```
require 'matrix'

# Build the token transition matrix for a Ruby source file by scraping the
# token trace that ruby emits when compiling with -y (yacc debug output).
def transition_matrix(filename)
  indices = Hash.new{|h,k| h[k] = h.size} # token type -> matrix index
  probs = []  # transition counts, normalized to probabilities below
  last_tok = nil
  ntokens = 0
  freqs = []  # token counts, normalized to observed frequencies below
  IO.popen("ruby -y -c #{filename} 2>&1") do |f|
    f.each do |line|
      if md = /^Reading a token: Next token is token (\S+)/.match(line)
        tok = md[1]
        ntokens += 1
        freqs[indices[tok]] ||= 0
        freqs[indices[tok]] += 1
        if last_tok
          probs[indices[last_tok]] ||= []
          probs[indices[last_tok]][indices[tok]] ||= 0
          probs[indices[last_tok]][indices[tok]] += 1
        end
        last_tok = tok
      end
    end
  end
  # empty (and single-token) files yield no transitions: skip them
  return [nil, nil, nil, 0] if probs.empty?
  probs.map! do |row|
    sum = row.inject(0){|s,x| s + (x || 0)}
    row.map{|x| 1.0 * (x || 0) / sum }
  end

  freqs = freqs.map{|p| 1.0 * p / ntokens }
  # pad to a square matrix; a type first seen as the very last token
  # has no outgoing transitions, so it gets a row of zeros
  cols = [probs.size, probs.map{|x| x.size}.max].max
  probs << [0] * cols if probs.size < cols
  [Matrix.rows(probs.map{|row| row + [0] * (cols - row.size)}), freqs,
   indices, ntokens]
end

LOG2 = Math.log(2)
# Shannon entropy, in bits, of a probability distribution
def H(probs)
  probs.to_a.inject(0) do |s,x|
    if x < 1e-6 # for continuity
      s
    else
      s - 1.0 * x * Math.log(x) / LOG2
    end
  end
end

if __FILE__ == $0

require 'find'
require 'rbconfig'
require 'yaml'

dir = ARGV[0] || Config::CONFIG["rubylibdir"]

$stdout.sync = true
begin
  statistics = {}
  Dir.chdir(dir) do
    Find.find(".") do |file|
      next unless /\.rb$/ =~ file
      #next unless %r{^\./rss\.rb$} =~ file
      # group by top-level directory, or by basename for toplevel files
      groupname = File.dirname(file).split(%r{/})[1]
      groupname = File.basename(file)[/[^.]+/] if groupname.nil?
      puts "Processing #{file} (under #{groupname})"

      t_matrix, freqs, indices, tokens = transition_matrix(file)
      next unless t_matrix # skip empty files

      h_rate = freqs.inject([0,0]) do |(s,idx), p|
        [s + p * H(t_matrix.row(idx)), idx + 1]
      end.first

      h_iid = H(freqs)
      h_eq1 = freqs.inject([0,0]) do |(s,idx), p|
        non_zero = t_matrix.row(idx).to_a.select{|x| x > 0}.size
        non_zero = [1, non_zero].max
        [s + p * Math.log(non_zero) / LOG2, idx + 1]
      end.first
      h_eq2 = Math.log(t_matrix.row_size) / LOG2

      (statistics[groupname] ||= []) << [file, h_rate, h_eq1, h_iid, h_eq2, tokens]
      #puts t_matrix
      p [file, h_rate, h_eq1, h_iid, h_eq2, tokens]
      # [file,
      #  h_rate  the entropy rate of the Markov chain: the weighted sum of
      #          the entropies of the rows of the transition matrix, using
      #          the observed frequencies of the previous token type in
      #          place of the stationary distribution
      #  h_eq1   entropy if all the choices observed after the previous
      #          token are taken to be equiprobable
      #  h_iid   we ignore the Markov process and treat this as a
      #          stochastic process of independent, identically distributed
      #          random variables (all tokens considered independent)
      #  h_eq2   entropy per token if all token types are equally probable
      #  tokens  number of tokens
      # ]
    end
  end

ensure
  File.open("lexical.entropy", "w"){|f| Marshal.dump(statistics, f) }
  puts "=" * 80
  puts statistics.to_yaml
  raise if $!
end

end
```
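
As a sanity check on the `H` helper, the values it returns for a few hand-made distributions are easy to verify on paper (these numbers are mine, not part of the stdlib results):

```
# assumes H and LOG2 from the script above are loaded
p H([0.5, 0.5])         # => 1.0   (a fair coin: one bit per outcome)
p H([0.25, 0.25, 0.5])  # => 1.5
p H([1.0])              # => 0.0   (a constant source carries no information)
```

The script takes an optional directory argument and defaults to `rubylibdir`, so something like `ruby lexical_entropy.rb /usr/lib/ruby/1.8` (the file name is whatever you saved it as) reproduces the run; it dumps the raw statistics to `lexical.entropy` and prints a YAML summary at the end.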

Last modified: 2005/12/20 12:10:12
Keyword(s): [blog] [ruby] [lexical] [complexity] [stdlib] [standard] [library]
References: [Ruby]