forked from covidatlas/coronadatascraper
Commit
* Add scraper as first arg to all fetch methods.
* Script to add 'this' as first arg to all fetch calls.
* Clarify tape test messaging.
* Throw on invalid scraper.
* Add test:integration yarn script.
* Add cacheKey to fetch API public methods.
* Script to add 'default' cache key to all fetch calls.
* Add 'default' cache key to all scraper fetch calls.
* Add scraper fetch call format check unit test.
* Add option to log cache calls if LOG_CACHE_CALLS is set.
* First draft, migration of cache hits to temp dir.
* Fix cache key collisions.
* Zip migrated file.
* Update GB to use latest fetch API.
* Differentiating cache keys for timeseries, even though they're ignored.
* Add timeseries to report compare.
* Move cache migration to separate module to simplify code review.
* Allow comma-delimited list for --location
* Update new/changed scrapers to use new fetch API.
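The two migration scripts in this commit rewrite every scraper fetch call in two passes: one adds the scraper object ('this') as the first argument, the other inserts the 'default' cache key after the URL. A minimal sketch of the net effect on a typical call (the sample line and these simplified substitutions are illustrative only; the scripts below use more careful regexes):

    line = "const $ = await fetch.page(url);"
    line = line.sub(/(await\s+fetch\.page\s*\()/, '\1this, ')  # pass 1: scraper as first arg
    line = line.sub(/\)/, ", 'default')")                      # pass 2: cache key after the URL
    puts line   # => const $ = await fetch.page(this, url, 'default');

For calls that pass extra options, the cache key is inserted between the URL and the remaining arguments rather than appended at the end.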
Showing 154 changed files with 984 additions and 411 deletions.
@@ -0,0 +1,181 @@
# Hack all of the scraper files.
#
# USAGE:
#
# - cd into this directory,
# - run `ruby this-script.rb WRITE [FILENAME]`
#
# WRITE is one of:
# - 'save' to overwrite source files
# - 'print' to dump to console
# - 'mute' to not print or save (useful to see the changes that would happen)
#
# FILENAME: name of the file to work with. If missing,
# do all files.
#
# eg.,
# ruby <this-script.rb> print DEU/_shared.js

if (ARGV.size < 1) then
  puts "usage: ruby <this_script.rb> WRITE [FILENAME]"
  puts "where WRITE = save/print/mute"
  return
end

WRITE = ARGV[0].to_s.downcase
FILENAME = (ARGV.size > 1) ? ARGV[1] : nil

# Skip some files.
# Not bothering to try to determine these programmatically.
# IGNORE_FILES = %w(
# AUS/_shared/get-data-with-tested-negative-applied.js
# AUS/_shared/get-key.js
# )

METHODS = 'page|fetch|raw|json|jsonAndCookies|csv|tsv|pdf|headless'

# FETCH_RE splits a fetch call into three parts: everything up to and
# including the first argument and its trailing ', ', the next argument
# (the URL), and the rest of the line, so the 'default' cache key can be
# inserted right after the URL.
FETCH_RE = /(await\s+.*?\.(?:#{METHODS})\s*\(.*?, )([^,)]*)(.*)/

# Manual tests to verify
# lin = "const $ = await fetch.page(this, healthUrl);
# const $ = await fetch.page(this, healthUrl, some_other_stuff);
# const $ = await fetch.csv(this, 'something.com', some_other_stuff);
# "
# lin.scan(FETCH_RE).each do |m|
# puts '----'
# puts m.inspect
# puts "#{m[0]}#{m[1]}, 'default'#{m[2]}"
# end


# Print warnings only for each file f in scraper_dir.
def validate(scraper_dir, f)
  fpath = File.join(scraper_dir, f)
  src = File.read(fpath)
  [ FETCH_RE ].each do |re|
    puts "WARN: No match for #{re} in #{f}" if (src !~ re)
  end
end

def add_cacheKey_to_fetch_calls(src)
  original_src = "CLONE: #{src}"

  matches = src.scan(FETCH_RE)
  # puts "add cacheKey: #{matches.inspect}"
  matches.uniq.each do |m|
    raise "bad re? #{m}" if m.size != 3
    before, url, after = m
    wholeline = [before, url, after].join('')
    newline = "#{before}#{url}, 'default'#{after}"
    puts " + \"#{wholeline}\" => \"#{newline}\""
    src = src.gsub(wholeline, newline)
  end

  src
end
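
# Illustrative example (not part of the original script): given a call that
# already has the scraper as its first argument, the 'default' cache key is
# inserted right after the URL:
#   add_cacheKey_to_fetch_calls("const d = await fetch.csv(this, url);")
#   # => "const d = await fetch.csv(this, url, 'default');"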

# # Specific hack
# def postmigration_AU_QLD_stuff(src)
# old = "async function getCurrentArticlePage(obj) {
# const $ = await fetch.page(this, obj.url);
# const anchors = $('#content h3:first-of-type > a');
# const currentArticleUrl = anchors[0].attribs.href;
# return fetch.page(currentArticleUrl);
# }"
# new = "async function getCurrentArticlePage(obj) {
# const $ = await fetch.page(obj, obj.url);
# const anchors = $('#content h3:first-of-type > a');
# const currentArticleUrl = anchors[0].attribs.href;
# return fetch.page(obj, currentArticleUrl);
# }"
# src = src.gsub(old, new)
# src
# end


# def post_migration_check(src)
# matches = src.scan(FETCH_RE)
# # puts "add this: #{matches.inspect}"
# matches.each do |m|
# raise "bad re? #{m}" if m.size != 3
# wholeline, before, after = m
# if (after !~ /this, /) then
# puts " ??? Missing 'this' in fetch call in \"#{wholeline}\" ???"
# end
# end
# end

########################################

scraper_dir = File.join(__dir__, '..', '..', 'src', 'shared', 'scrapers')

files = []
Dir.chdir(scraper_dir) do
  files = Dir.glob(File.join('**', '*.js'))
end
# puts "Pre remove count: #{files.count}"
# files -= IGNORE_FILES
# puts "Post remove count: #{files.count}"
puts "#{files.size} scraper files."


if (!FILENAME.nil?) then
  if (!files.include?(FILENAME)) then
    puts "#{FILENAME} is not in the list of scraper files:"
    puts files.sort.map { |s| " #{s}" }
    return
  else
    files = [FILENAME]
  end
end

files.sort!

puts "VALIDATION ========================================"
files.each do |f|
  validate(scraper_dir, f)
end
puts "END VALIDATION ===================================="


# During dev, just do one file.
# add_filename_to_scraper_this(scraper_dir, files[0])
# files = [files[0]]
# files = ['DEU/_shared.js']

puts "MUTATION ========================================"
files.each do |f|
  puts
  puts '=' * 50
  puts f
  puts '-' * 50
  fpath = File.join(scraper_dir, f)
  src = File.read(fpath)

  src = add_cacheKey_to_fetch_calls(src)

  # post_migration_check(src)
  puts

  case(WRITE)
  when 'save' then
    File.open(fpath, 'w') { |p| p.puts(src) }
  when 'print' then
    puts
    puts "Result:"
    puts "-" * 50
    puts src
    puts "-" * 50
    puts
  when 'mute' then
    puts ''
  end

end
puts "END MUTATION ===================================="
puts
scripts/change-fetch-api/migrate-scrapers-add-this-to-fetch.rb
205 changes: 205 additions & 0 deletions
@@ -0,0 +1,205 @@
# Hack all of the scraper files.
#
# USAGE:
#
# - cd into this directory,
# - run `ruby this-script.rb WRITE [FILENAME]`
#
# WRITE is one of:
# - 'save' to overwrite source files
# - 'print' to dump to console
# - 'mute' to not print or save (useful to see the changes that would happen)
#
# FILENAME: name of the file to work with. If missing,
# do all files.
#
# eg.,
# ruby <this-script.rb> print DEU/_shared.js

if (ARGV.size < 1) then
  puts "usage: ruby <this_script.rb> WRITE [FILENAME]"
  puts "where WRITE = save/print/mute"
  return
end

WRITE = ARGV[0].to_s.downcase
FILENAME = (ARGV.size > 1) ? ARGV[1] : nil

# Skip some files.
# Not bothering to try to determine these programmatically.
IGNORE_FILES = %w(
  AUS/_shared/get-data-with-tested-negative-applied.js
  AUS/_shared/get-key.js
)

LOCATION_RE = /(\s*)(city|county|state|country):/
METHODS = 'page|fetch|raw|json|jsonAndCookies|csv|tsv|pdf|headless|getArcGISCSVURLFromOrgId|getArcGISCSVURL'

# The fancy RE below splits a line like "await fetch.csv(this.url)"
# into ["await fetch.csv(this.url)", "await fetch.csv(", "this.url)"].
# It can produce a doubled "this, this," in some cases, so
# add_this_to_fetch_calls includes a cleanup hack for that.
FETCH_RE = /((await\s+.*?\.(?:#{METHODS})\s*\()(.*))/

# Print warnings only for each file f in scraper_dir.
def validate(scraper_dir, f)
  fpath = File.join(scraper_dir, f)
  src = File.read(fpath)
  [ LOCATION_RE, FETCH_RE ].each do |re|
    puts "WARN: No match for #{re} in #{f}" if (src !~ re)
  end
end

# DISABLING THIS -- scrapers should already have _path.
# def add_filename_to_scraper_this(src)
# m = src.match(LOCATION_RE)
# # puts "add filename: #{m.inspect}"
# if (m.nil?) then
# puts " - skipping adding filepath (no match for RE)"
# return src
# end
#
# if (src =~ /filepath: __filename/)
# puts " - skipping adding _filepath, already added"
# return src
# end
#
# spaces = m[1].gsub("\n", '')
# loctype = m[2]
# puts " + adding filepath above #{loctype}"
# add_code = "
# #{spaces}_filepath: __filename,
# #{spaces}#{loctype}:"
# src = src.sub(LOCATION_RE, add_code)
# src
# end

def add_this_to_fetch_calls(src)
  original_src = "CLONE: #{src}"

  matches = src.scan(FETCH_RE)
  puts "add this: #{matches.inspect}"
  matches.uniq.each do |m|
    raise "bad re? #{m}" if m.size != 3
    wholeline, before, after = m
    if (after =~ /this, /) then
      puts " - 'this, ' already in \"#{wholeline}\", skipping"
    else
      newline = "#{before}this, #{after}"
      puts " + \"#{wholeline}\" => \"#{newline}\""
      src = src.gsub(wholeline, newline)
    end
  end

  # The insertion above can leave a doubled "this, this," -- collapse it,
  # but only if the original source didn't already contain that pattern.
  if (original_src !~ /this,\s*this,/ && src =~ /this,\s*this,/) then
    src = src.gsub(/this,\s*this,/, 'this, ')
  end
  raise "still have 'this, this'" if (src =~ /this,\s*this,/)

  src
end
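
# Illustrative example (not part of the original script): a call without the
# scraper argument gets 'this' inserted as the first argument:
#   add_this_to_fetch_calls("const $ = await fetch.page(url);")
#   # => "const $ = await fetch.page(this, url);"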

# Specific hack
def postmigration_AU_QLD_stuff(src)
  old = "async function getCurrentArticlePage(obj) {
const $ = await fetch.page(this, obj.url);
const anchors = $('#content h3:first-of-type > a');
const currentArticleUrl = anchors[0].attribs.href;
return fetch.page(currentArticleUrl);
}"
  new = "async function getCurrentArticlePage(obj) {
const $ = await fetch.page(obj, obj.url);
const anchors = $('#content h3:first-of-type > a');
const currentArticleUrl = anchors[0].attribs.href;
return fetch.page(obj, currentArticleUrl);
}"
  src = src.gsub(old, new)
  src
end


def post_migration_check(src)
  matches = src.scan(FETCH_RE)
  # puts "add this: #{matches.inspect}"
  matches.each do |m|
    raise "bad re? #{m}" if m.size != 3
    wholeline, before, after = m
    if (after !~ /this, /) then
      puts " ??? Missing 'this' in fetch call in \"#{wholeline}\" ???"
    end
  end
end

########################################

scraper_dir = File.join(__dir__, '..', '..', 'src', 'shared', 'scrapers')

files = []
Dir.chdir(scraper_dir) do
  files = Dir.glob(File.join('**', '*.js'))
end
# puts "Pre remove count: #{files.count}"
files -= IGNORE_FILES
# puts "Post remove count: #{files.count}"
puts "#{files.size} scraper files."


if (!FILENAME.nil?) then
  if (!files.include?(FILENAME)) then
    puts "#{FILENAME} is not in the list of scraper files:"
    puts files.sort.map { |s| " #{s}" }
    return
  else
    files = [FILENAME]
  end
end

files.sort!

puts "VALIDATION ========================================"
files.each do |f|
  validate(scraper_dir, f)
end
puts "END VALIDATION ===================================="


# During dev, just do one file.
# add_filename_to_scraper_this(scraper_dir, files[0])
# files = [files[0]]
# files = ['DEU/_shared.js']

puts "MUTATION ========================================"
files.each do |f|
  puts
  puts '=' * 50
  puts f
  puts '-' * 50
  fpath = File.join(scraper_dir, f)
  src = File.read(fpath)
  # src = add_filename_to_scraper_this(src)
  src = add_this_to_fetch_calls(src)
  raise "BAD this, this" if (src =~ /this, this,/)
  src = postmigration_AU_QLD_stuff(src)

  post_migration_check(src)
  puts

  case(WRITE)
  when 'save' then
    File.open(fpath, 'w') { |p| p.puts(src) }
  when 'print' then
    puts
    puts "Result:"
    puts "-" * 50
    puts src
    puts "-" * 50
    puts
  when 'mute' then
    puts ''
  end

end
puts "END MUTATION ===================================="
puts