From 2f3afafb738ae848a8a2d164780571cf9a7eb6ce Mon Sep 17 00:00:00 2001 From: Mitsuhiro Shibuya Date: Sun, 2 Apr 2023 18:37:18 +0900 Subject: [PATCH] Try to preserve the original URI as much as possible Closes #2631 --- lib/carrierwave/downloader/base.rb | 21 +++++++------ lib/carrierwave/utilities/uri.rb | 22 +++++++------ spec/downloader/base_spec.rb | 50 ++++++++++++++++++++++++------ 3 files changed, 64 insertions(+), 29 deletions(-) diff --git a/lib/carrierwave/downloader/base.rb b/lib/carrierwave/downloader/base.rb index 1c2b95eea..8a1f7112e 100644 --- a/lib/carrierwave/downloader/base.rb +++ b/lib/carrierwave/downloader/base.rb @@ -6,6 +6,8 @@ module CarrierWave module Downloader class Base + include CarrierWave::Utilities::Uri + attr_reader :uploader def initialize(uploader) @@ -61,17 +63,16 @@ def download(url, remote_headers = {}) # # [url (String)] The URL where the remote file is stored # - def process_uri(uri) - uri_parts = uri.split('?') - encoded_uri = Addressable::URI.parse(uri_parts.shift).normalize.to_s - query = uri_parts.any? ? "?#{uri_parts.join('?')}" : '' - begin - URI.parse("#{encoded_uri}#{query}") - rescue URI::InvalidURIError - URI.parse("#{encoded_uri}#{URI::DEFAULT_PARSER.escape(query)}") - end + def process_uri(source) + uri = Addressable::URI.parse(source) + uri.host = uri.normalized_host + # Perform decode first, as the path is likely to be already encoded + uri.path = encode_path(decode_uri(uri.path)) if uri.path =~ CarrierWave::Utilities::Uri::PATH_UNSAFE + uri.query = encode_non_ascii(uri.query) if uri.query + uri.fragment = encode_non_ascii(uri.fragment) if uri.fragment + URI.parse(uri.to_s) rescue URI::InvalidURIError, Addressable::URI::InvalidURIError - raise CarrierWave::DownloadError, "couldn't parse URL: #{uri}" + raise CarrierWave::DownloadError, "couldn't parse URL: #{source}" end ## diff --git a/lib/carrierwave/utilities/uri.rb b/lib/carrierwave/utilities/uri.rb index 432348b29..81b7bc6bd 100644 --- a/lib/carrierwave/utilities/uri.rb +++ b/lib/carrierwave/utilities/uri.rb @@ -4,20 +4,22 @@ module CarrierWave module Utilities module Uri # based on Ruby < 2.0's URI.encode - SAFE_STRING = URI::REGEXP::PATTERN::UNRESERVED + '\/' - UNSAFE = Regexp.new("[^#{SAFE_STRING}]", false) + PATH_SAFE = URI::REGEXP::PATTERN::UNRESERVED + '\/' + PATH_UNSAFE = Regexp.new("[^#{PATH_SAFE}]", false) + NON_ASCII = /[^[:ascii:]]/.freeze private def encode_path(path) - path.to_s.gsub(UNSAFE) do - us = $& - tmp = '' - us.each_byte do |uc| - tmp << sprintf('%%%02X', uc) - end - tmp - end + URI::DEFAULT_PARSER.escape(path, PATH_UNSAFE) + end + + def encode_non_ascii(str) + URI::DEFAULT_PARSER.escape(str, NON_ASCII) + end + + def decode_uri(str) + URI::DEFAULT_PARSER.unescape(str) end end # Uri end # Utilities diff --git a/spec/downloader/base_spec.rb b/spec/downloader/base_spec.rb index 20d40ad5d..eb4fa5c3e 100644 --- a/spec/downloader/base_spec.rb +++ b/spec/downloader/base_spec.rb @@ -5,7 +5,7 @@ let(:uploader) { uploader_class.new } let(:file) { File.read(file_path("test.jpg")) } let(:filename) { "test.jpg" } - let(:uri) { "http://www.example.com/#{CGI.escape(filename)}" } + let(:uri) { "http://www.example.com/#{URI::DEFAULT_PARSER.escape(filename)}" } subject { CarrierWave::Downloader::Base.new(uploader) } @@ -42,6 +42,29 @@ end end + context "with internationalized domain name" do + let(:uri) { "https://ドメイン名例.jp/test.jpg" } + before do + stub_request(:get, 'https://xn--eckwd4c7cu47r2wf.jp/test.jpg').to_return(body: file) + allow(Resolv).to receive(:getaddresses).with('xn--eckwd4c7cu47r2wf.jp').and_return(['1.2.3.4']) + end + + it "downloads a file" do + expect(subject.download(uri).file.read).to eq file + end + end + + context "with non-ascii characters in the query and the fragment" do + let(:uri) { "http://example.com/test.jpg?q=А#あああ" } + before do + stub_request(:get, "http://example.com/test.jpg?q=%D0%90").to_return(body: file) + end + + it "downloads a file" do + expect(subject.download(uri).file.read).to eq file + end + end + context 'with request headers' do let(:authentication_headers) do { @@ -172,62 +195,71 @@ end describe '#process_uri' do + it "returns an URI instance" do + uri = "http://example.com/" + expect(subject.process_uri(uri)).to be_an_instance_of(URI::HTTP) + end + it "converts a URL with internationalized domain name to Punycode URI" do uri = "http://ドメイン名例.jp/#{CGI.escape(filename)}" processed = subject.process_uri(uri) - expect(processed.class).to eq(URI::HTTP) expect(processed.to_s).to eq 'http://xn--eckwd4c7cu47r2wf.jp/test.jpg' end it "parses but not escape already escaped uris" do uri = 'http://example.com/%5B.jpg' processed = subject.process_uri(uri) - expect(processed.class).to eq(URI::HTTP) + expect(processed.to_s).to eq(uri) + end + + it "does not perform normalization on path when not necessary" do + uri = 'http://example.com/o%CC%88.png' + processed = subject.process_uri(uri) expect(processed.to_s).to eq(uri) end it "parses but not escape uris with query-string-only characters not needing escaping" do uri = 'http://example.com/?foo[]=bar' processed = subject.process_uri(uri) - expect(processed.class).to eq(URI::HTTP) expect(processed.to_s).to eq(uri) end it "escapes and parse unescaped uris" do uri = 'http://example.com/ %[].jpg' processed = subject.process_uri(uri) - expect(processed.class).to eq(URI::HTTP) expect(processed.to_s).to eq('http://example.com/%20%25%5B%5D.jpg') end it "parses but not escape uris with query-string characters representing urls not needing escaping " do uri = 'http://example.com/?src0=https%3A%2F%2Fi.vimeocdn.com%2Fvideo%2F1234_1280x720.jpg' processed = subject.process_uri(uri) - expect(processed.class).to eq(URI::HTTP) expect(processed.to_s).to eq(uri) end it "escapes and parse brackets in uri paths without harming the query string" do uri = 'http://example.com/].jpg?test[]' processed = subject.process_uri(uri) - expect(processed.class).to eq(URI::HTTP) expect(processed.to_s).to eq('http://example.com/%5D.jpg?test[]') end it "escapes and parse unescaped characters in path" do uri = 'http://example.com/あああ.jpg' processed = subject.process_uri(uri) - expect(processed.class).to eq(URI::HTTP) expect(processed.to_s).to eq('http://example.com/%E3%81%82%E3%81%82%E3%81%82.jpg') end it "escapes and parse unescaped characters in query string" do uri = 'http://example.com/?q=あああ' processed = subject.process_uri(uri) - expect(processed.class).to eq(URI::HTTP) expect(processed.to_s).to eq('http://example.com/?q=%E3%81%82%E3%81%82%E3%81%82') end + it "escapes and parse unescaped characters in the fragment" do + uri = 'http://example.com/#あああ' + processed = subject.process_uri(uri) + expect(processed.to_s).to eq('http://example.com/#%E3%81%82%E3%81%82%E3%81%82') + end + it "throws an exception on bad uris" do uri = '~http:' expect { subject.process_uri(uri) }.to raise_error(CarrierWave::DownloadError)