8: def run
9:
10:
11: logger.info "Filling the queue with apache requests (this could take a while)"
12: queue = RequestQueue.new
13: logger.info "No apache requests found, you may need to run the apache_importer before reconciling page views." && break unless queue.size > 0
14:
15: invalid_date = repository.adapter.query('select distinct created_at from page_views order by created_at asc limit 1').first + 1/60000.0
16:
17:
18: page_view_size = repository.adapter.query('select count(*) from page_views where created_at <= ?', invalid_date).first
19: block_size = 10000
20: loops = (page_view_size / block_size) + 1
21:
22: query = "select *,ctid from page_views\nwhere created_at <= ?\norder by ctid asc\nlimit ?\n"
23:
24: non_matches = 0
25: matches = 0
26: loops.times do |i|
27: logger.info "Batch #{i}:"
28: logger.info "Matches: #{matches}"
29: logger.info "NonMatches: #{non_matches}"
30: logger.info "Match %: #{matches / (matches + non_matches).to_f * 100}%"
31: page_views = repository.adapter.query(query, invalid_date, block_size)
32:
33:
34: page_views.map { |v| PageViewRequest.new(v.ctid, nil, nil, v.uri, v.referrer == "/" ? "-" : v.referrer, v.created_at, v.session_id) }.each_with_index do |page_view, j|
35: match = queue.expanded_search(page_view)
36: if match
37: logger.info "\tMatch found for page view #{i}-#{j} with CTID #{page_view.id} -- #{match.id} -- QueueSize: #{queue.size}"
38: page_view.update!(match.remote_ip, match.request_date)
39: match.mark_as_processed!
40: matches += 1
41: else
42: logger.info "\tNo match found for page view #{i}-#{j} with CTID #{page_view.id}"
43: non_matches += 1
44: end
45: end
46: end
47:
48: logger.info "Matches: #{matches}"
49: logger.info "NonMatches: #{non_matches}"
50: logger.info "Match %: #{matches / (matches + non_matches).to_f * 100}%"
51:
52: exit!(0)
53:
54: end