Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AO3-6664 Imported_from_url search is too slow #4709

Merged
merged 22 commits into from
Jan 10, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions app/models/story_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -798,10 +798,7 @@ def download_with_timeout(location, limit = 10)
begin
# we do a little cleanup here in case the user hasn't included the 'http://'
# or if they've used capital letters or an underscore in the hostname
uri = URI.parse(location)
uri = URI.parse('http://' + location) if uri.class.name == "URI::Generic"
uri.host.downcase!
uri.host.tr!(" ", "-")
uri = UrlFormatter.new(location).standardized
response = Net::HTTP.get_response(uri)
case response
when Net::HTTPSuccess
Expand Down
16 changes: 11 additions & 5 deletions app/models/work.rb
Original file line number Diff line number Diff line change
Expand Up @@ -379,13 +379,19 @@ def self.find_by_url_cache_key(url)
def self.find_by_url_uncached(url)
url = UrlFormatter.new(url)
Work.where(imported_from_url: url.original).first ||
Work.where(imported_from_url: [url.minimal, url.no_www, url.with_www, url.encoded, url.decoded]).first ||
Work.where("imported_from_url LIKE ?", "%#{url.minimal_no_http}%").select { |w|
Work.where(imported_from_url: [url.minimal,
url.with_http, url.with_https,
url.no_www, url.with_www,
url.encoded, url.decoded,
url.minimal_no_protocol_no_www]).first ||
Work.where("imported_from_url LIKE ? or imported_from_url LIKE ?",
"http://#{url.minimal_no_protocol_no_www}%",
"https://#{url.minimal_no_protocol_no_www}%").select do |w|
work_url = UrlFormatter.new(w.imported_from_url)
['original', 'minimal', 'no_www', 'with_www', 'encoded', 'decoded'].any? { |method|
%w[original minimal no_www with_www with_http with_https encoded decoded].any? do |method|
work_url.send(method) == url.send(method)
}
}.first
end
end.first
end

def self.find_by_url(url)
Expand Down
38 changes: 23 additions & 15 deletions lib/url_formatter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,34 @@ def minimal (input = url)
uri = Addressable::URI.parse(input)
queries = CGI::parse(uri.query) unless uri.query.nil?
if queries.nil?
return input.gsub(/(\?|#).*$/, '')
input.gsub(/[?#].*$/, "")
else
queries.keep_if { |k,v| ['sid'].include? k }
querystring = ('?' + URI.encode_www_form(queries)) unless queries.empty?
return input.gsub(/(\?|#).*$/, '') << querystring.to_s
queries.keep_if { |k, _| ["sid"].include? k }
querystring = "?#{URI.encode_www_form(queries)}" unless queries.empty?
input.gsub(/[?#].*$/, "") << querystring.to_s
end
end

def minimal_no_http
minimal.gsub(/https?:\/\/www\./, "")
# Strips the scheme ("http://" or "https://") and, when it immediately
# follows the scheme, the "www." prefix from the minimal url.
# e.g. "https://www.ao3.org/123?x=1#y" => "ao3.org/123"
def minimal_no_protocol_no_www
  minimal.gsub(%r{https?://(?:www\.)?}, "")
end

def no_www
minimal.gsub(/http:\/\/www\./, "http://")
minimal.gsub(%r{(https?)://www\.}, "\\1://")
end

def with_www
minimal.gsub(/http:\/\//, "http://www.")
minimal.gsub(%r{(https?)://}, "\\1://www.")
end


# Forces the minimal url onto plain http: every "http://"/"https://"
# occurrence is removed, then a single "http://" is prefixed.
def with_http
  without_protocol = minimal.gsub(%r{https?://}, "")
  "http://#{without_protocol}"
end

# Forces the minimal url onto https: every "http://"/"https://"
# occurrence is removed, then a single "https://" is prefixed.
def with_https
  without_protocol = minimal.gsub(%r{https?://}, "")
  "https://#{without_protocol}"
end

def encoded
minimal URI::Parser.new.escape(url)
end
Expand All @@ -47,13 +55,13 @@ def decoded
URI::Parser.new.unescape(minimal)
end

# Adds http if not present and downcases the host
# Adds http if not present, downcases the host and hyphenates spaces
# Extracted from story parser class
# Returns a Generic::URI
def standardized
clean_url = URI.parse(url)
clean_url = URI.parse('http://' + url) if clean_url.class.name == "URI::Generic"
clean_url.host = clean_url.host.downcase
clean_url.to_s
uri = URI.parse(url)
uri = URI.parse("http://#{url}") if uri.instance_of?(URI::Generic)
uri.host = uri.host.downcase.tr(" ", "-")
uri
end

end
63 changes: 52 additions & 11 deletions spec/lib/url_formatter_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
end
end

describe '#minimal' do
it "should remove anchors and query parameters from url" do
describe "#minimal" do
it "should remove anchors and query parameters from url containing no 'www'" do
url = "http://ao3.org?evil=false#monkeys"
expect(UrlFormatter.new(url).minimal).to eq("http://ao3.org")
end
Expand All @@ -20,18 +20,37 @@
end
end

describe '#no_www' do
# Covers all four protocol/www combinations. Fix: the "https containing
# 'www'" example previously used a url WITHOUT "www.", duplicating the
# "NOT containing" case and leaving https+www untested. The describe
# string also gains the "#" prefix for consistency with sibling describes.
describe "#minimal_no_protocol_no_www" do
  it "should handle http url containing 'www'" do
    url = "http://www.ao3.org?evil=false#monkeys"
    expect(UrlFormatter.new(url).minimal_no_protocol_no_www).to eq("ao3.org")
  end
  it "should handle http url NOT containing 'www'" do
    url = "http://ao3.org?evil=false#monkeys"
    expect(UrlFormatter.new(url).minimal_no_protocol_no_www).to eq("ao3.org")
  end
  it "should handle httpS url containing 'www'" do
    url = "https://www.ao3.org?evil=false#monkeys"
    expect(UrlFormatter.new(url).minimal_no_protocol_no_www).to eq("ao3.org")
  end
  it "should handle httpS url NOT containing 'www'" do
    url = "https://ao3.org?evil=false#monkeys"
    expect(UrlFormatter.new(url).minimal_no_protocol_no_www).to eq("ao3.org")
  end
end

describe "#no_www" do
it "should remove www from the url" do
url = "http://www.ao3.org"
expect(UrlFormatter.new(url).no_www).to eq("http://ao3.org")
end
it "should remove www, query parameters and anchors from the url" do
url = "http://www.ao3.org?evil=false#monkeys"
expect(UrlFormatter.new(url).no_www).to eq("http://ao3.org")
url = "http://www.ao3.org/123?evil=false#monkeys"
expect(UrlFormatter.new(url).no_www).to eq("http://ao3.org/123")
end
end

describe '#with_www' do
describe "#with_www" do
it "should add www to the url" do
url = "http://ao3.org"
expect(UrlFormatter.new(url).with_www).to eq("http://www.ao3.org")
Expand All @@ -42,27 +61,49 @@
end
end

describe '#encoded' do
# Specs for UrlFormatter#with_http: the url is forced onto plain http
# and minimal-url cleanup (queries/anchors) still applies.
describe "#with_http" do
  it "should add http:// to the url" do
    expect(UrlFormatter.new("ao3.org").with_http).to eq("http://ao3.org")
  end

  it "should switch https to http and remove query parameters and anchors" do
    expect(UrlFormatter.new("https://ao3.org?evil=false#monkeys").with_http).to eq("http://ao3.org")
  end
end

# Specs for UrlFormatter#with_https: the url is forced onto https
# and minimal-url cleanup (queries/anchors) still applies.
describe "#with_https" do
  it "should add https:// to the url" do
    expect(UrlFormatter.new("ao3.org").with_https).to eq("https://ao3.org")
  end

  it "should switch http to https and remove query parameters and anchors" do
    expect(UrlFormatter.new("http://ao3.org?evil=false#monkeys").with_https).to eq("https://ao3.org")
  end
end

describe "#encoded" do
it "should URI encode the url" do
url = "http://ao3.org/why would you do this"
expect(UrlFormatter.new(url).encoded).to eq("http://ao3.org/why%20would%20you%20do%20this")
end
end

describe '#decoded' do
describe "#decoded" do
it "should URI decode the url" do
url = "http://ao3.org/why%20would%20you%20do%20this"
expect(UrlFormatter.new(url).decoded).to eq("http://ao3.org/why would you do this")
end
end

describe '#standardized' do
describe "#standardized" do
it "should add http" do
expect(UrlFormatter.new('ao3.org').standardized).to eq("http://ao3.org")
expect(UrlFormatter.new("ao3.org").standardized.to_s).to eq("http://ao3.org")
end
it "should downcase the domain" do
url = "http://YAYCAPS.COM/ILOVECAPS"
expect(UrlFormatter.new(url).standardized).to eq("http://yaycaps.com/ILOVECAPS")
expect(UrlFormatter.new(url).standardized.to_s).to eq("http://yaycaps.com/ILOVECAPS")
end
end

Expand Down
29 changes: 20 additions & 9 deletions spec/models/work_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -417,12 +417,11 @@

describe "#find_by_url" do
it "should find imported works with various URL formats" do
[
'http://foo.com/bar.html',
'http://foo.com/bar',
'http://lj-site.com/bar/foo?color=blue',
'http://www.foo.com/bar'
].each do |url|
%w[http://foo.com/bar.html
http://foo.com/bar
http://lj-site.com/bar/foo?color=blue
https://www.lj-site.com/bar/foo?color=blue
http://www.foo.com/bar https://www.foo.com/bar].each do |url|
work = create(:work, imported_from_url: url)
expect(Work.find_by_url(url)).to eq(work)
work.destroy
Expand All @@ -431,9 +430,9 @@

it "should not mix up imported works with similar URLs or significant query parameters" do
{
'http://foo.com/12345' => 'http://foo.com/123',
'http://efiction-site.com/viewstory.php?sid=123' => 'http://efiction-site.com/viewstory.php?sid=456',
'http://www.foo.com/i-am-something' => 'http://foo.com/i-am-something/else'
"http://foo.com/12345" => "http://foo.com/123",
"http://efiction-site.com/viewstory.php?sid=123" => "http://efiction-site.com/viewstory.php?sid=456",
"http://www.foo.com/i-am-something" => "http://foo.com/i-am-something/else"
}.each do |import_url, find_url|
work = create(:work, imported_from_url: import_url)
expect(Work.find_by_url(find_url)).to_not eq(work)
Expand All @@ -447,6 +446,18 @@
work.destroy
end

it "finds works imported with HTTP protocol and irrelevant query parameters" do
  # Lookup must match across protocol (http vs https) and ignore
  # non-significant query parameters.
  imported_work = create(:work, imported_from_url: "http://lj-site.com/thing1?style=mine")
  expect(Work.find_by_url("https://lj-site.com/thing1?style=other")).to eq(imported_work)
  imported_work.destroy
end

it "finds works imported with HTTPS protocol and irrelevant query parameters" do
  # Mirror of the HTTP case: an https import must be found via an
  # http lookup with different non-significant query parameters.
  imported_work = create(:work, imported_from_url: "https://lj-site.com/thing1?style=mine")
  expect(Work.find_by_url("http://lj-site.com/thing1?style=other")).to eq(imported_work)
  imported_work.destroy
end

it "gets the work from cache when searching for an imported work by URL" do
url = "http://lj-site.com/thing2"
work = create(:work, imported_from_url: url)
Expand Down
Loading