From 474c40cff5ff34648cb9cca955f4ca55590d5ab1 Mon Sep 17 00:00:00 2001 From: Steven Daniels Date: Wed, 8 Apr 2015 12:28:04 +0800 Subject: [PATCH 1/2] Extract sheets in the correct order --- lib/roo/excelx.rb | 139 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 110 insertions(+), 29 deletions(-) diff --git a/lib/roo/excelx.rb b/lib/roo/excelx.rb index fec18959..345db728 100644 --- a/lib/roo/excelx.rb +++ b/lib/roo/excelx.rb @@ -505,39 +505,120 @@ def clean_sheet(sheet) @cleaned[sheet] = true end + # Internal: extracts the worksheet_ids from the workbook.xml file. xlsx + # documents require a workbook.xml file, so a if the file is missing + # it is not a valid xlsx file. In these cases, an ArgumentError is + # raised. + # + # wb - a Zip::Entry for the workbook.xml file. + # path - A String for Zip::Entry's destination path. + # + # Examples + # + # extract_worksheet_ids(, 'tmpdir/roo_workbook.xml') + # # => ["rId1", "rId2", "rId3"] + # + # Returns an Array of Strings. + def extract_worksheet_ids(wb, path) + fail ArgumentError 'missing required workbook file' if wb.nil? + + wb.extract(path) + workbook_doc = Roo::Utils.load_xml(path).remove_namespaces! + workbook_doc.xpath('//sheet').map{ |s| s.attributes['id'].value } + end + + # Internal + # + # wb_rels - A Zip::Entry for the workbook.xml.rels file. + # path - A String for the Zip::Entry's destination path. + # + # Examples + # + # extract_worksheets(, 'tmpdir/roo_workbook.xml.rels') + # # => { + # "rId1"=>"worksheets/sheet1.xml", + # "rId2"=>"worksheets/sheet2.xml", + # "rId3"=>"worksheets/sheet3.xml" + # } + # + # Returns a Hash. + def extract_worksheets(wb_rels, path) + fail ArgumentError 'missing required workbook file' if wb_rels.nil? + + wb_rels.extract(path) + rels_doc = Roo::Utils.load_xml(path).remove_namespaces! + worksheet_type ='http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet' + + relationships = rels_doc.xpath('//Relationship').select do |relationship| + relationship.attributes['Type'].value == worksheet_type + end + + relationships.inject({}) do |hash, relationship| + attributes = relationship.attributes + id = attributes['Id']; + hash[id.value] = attributes['Target'].value + hash + end + end + # Extracts all needed files from the zip file def process_zipfile(tmpdir, zipfilename) @sheet_files = [] - Zip::File.foreach(zipfilename) do |entry| + entries = Zip::File.open(zipfilename).to_a.sort_by(&:name) + + # NOTE: When Google or Numbers 3.1 exports to xlsx, the worksheet filenames + # are not in order. With Numbers 3.1, the first sheet is always + # sheet.xml, not sheet1.xml. With Google, the order of the worksheets is + # independent of a worksheet's filename (i.e. sheet6.xml can be the + # first worksheet). + # + # workbook.xml lists the correct order of worksheets and + # workbook.xml.rels lists the filenames for those worksheets. + # + # workbook.xml: + # + # + # workbook.xml.rel: + # + # + wb = entries.find{ |e| e.name[/workbook.xml$/] } + wb_rels = entries.find{ |e| e.name[/workbook.xml.rels$/] } + + sheet_ids = extract_worksheet_ids(wb, "#{tmpdir}/roo_workbook.xml") + sheets = extract_worksheets(wb_rels, "#{tmpdir}/roo_workbook.xml.rels") + + sheet_ids.each_with_index do |id, i| + name = sheets[id] + entry = entries.find { |entry| entry.name =~ /#{name}$/ } + path = "#{tmpdir}/roo_sheet#{i + 1}" + @sheet_files << path + entry.extract(path) + end + + entries.each do |entry| path = - case entry.name.downcase - when /workbook.xml$/ - "#{tmpdir}/roo_workbook.xml" - when /sharedstrings.xml$/ - "#{tmpdir}/roo_sharedStrings.xml" - when /styles.xml$/ - "#{tmpdir}/roo_styles.xml" - when /sheet.xml$/ - path = "#{tmpdir}/roo_sheet" - @sheet_files.unshift(path) - path - when /sheet([0-9]+).xml$/ - # Numbers 3.1 exports first sheet without sheet number. Such sheets - # are always added to the beginning of the array which, naturally, - # causes other sheets to be pushed to the next index which could - # lead to sheet references getting overwritten, so we need to - # handle that case specifically. - nr = $1 - sheet_files_index = nr.to_i - 1 - sheet_files_index += 1 if @sheet_files[sheet_files_index] - @sheet_files[sheet_files_index] = "#{tmpdir}/roo_sheet#{nr.to_i}" - when /comments([0-9]+).xml$/ - nr = $1 - @comments_files[nr.to_i-1] = "#{tmpdir}/roo_comments#{nr}" - when /sheet([0-9]+).xml.rels$/ - nr = $1 - @rels_files[nr.to_i-1] = "#{tmpdir}/roo_rels#{nr}" - end + case entry.name.downcase + when /sharedstrings.xml$/ + "#{tmpdir}/roo_sharedStrings.xml" + when /styles.xml$/ + "#{tmpdir}/roo_styles.xml" + when /comments([0-9]+).xml$/ + # FIXME: Most of the time, The order of the comment files are the same + # the sheet order, i.e. sheet1.xml's comments are in comments1.xml. + # In some situations, this isn't true. The true location of a + # sheet's comment file is in the sheet1.xml.rels file. SEE + # ECMA-376 12.3.3 in "Ecma Office Open XML Part 1". + nr = $1 + @comments_files[nr.to_i-1] = "#{tmpdir}/roo_comments#{nr}" + # I suspect these files need to be ordered appropriately, too + when /sheet([0-9]+).xml.rels$/ + # FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but + # it also stores the location for sharedStrings, comments, + # drawings, etc. + nr = $1 + @rels_files[nr.to_i-1] = "#{tmpdir}/roo_rels#{nr}" + end + if path entry.extract(path) end From 4e93b7c76264f66da1cda89320c035faf6c53220 Mon Sep 17 00:00:00 2001 From: Steven Daniels Date: Thu, 9 Apr 2015 18:13:30 +0800 Subject: [PATCH 2/2] Refactored Roo::Excelx#process_zipfile Extracted another method from process_zipfile. I could do more refactoring to reduce the line count, but readability might suffer. e.g. when /sharedstrings.xml$/ ... when /styles\.xml$/ ... becomes when /(sharedstrings|styles)\.xml$/ "#{tmpdir}/roo_#{Regexp.last_match[1]}.xml" --- lib/roo/excelx.rb | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/lib/roo/excelx.rb b/lib/roo/excelx.rb index 345db728..080ee9ac 100644 --- a/lib/roo/excelx.rb +++ b/lib/roo/excelx.rb @@ -519,7 +519,8 @@ def clean_sheet(sheet) # # => ["rId1", "rId2", "rId3"] # # Returns an Array of Strings. - def extract_worksheet_ids(wb, path) + def extract_worksheet_ids(entries, path) + wb = entries.find { |e| e.name[/workbook.xml$/] } fail ArgumentError 'missing required workbook file' if wb.nil? wb.extract(path) @@ -542,7 +543,8 @@ def extract_worksheet_ids(wb, path) # } # # Returns a Hash. - def extract_worksheets(wb_rels, path) + def extract_worksheet_rels(entries, path) + wb_rels = entries.find { |e| e.name[/workbook.xml.rels$/] } fail ArgumentError 'missing required workbook file' if wb_rels.nil? wb_rels.extract(path) @@ -561,6 +563,16 @@ def extract_worksheets(wb_rels, path) end end + def extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir) + sheet_ids.each_with_index do |id, i| + name = sheets[id] + entry = entries.find { |entry| entry.name =~ /#{name}$/ } + path = "#{tmpdir}/roo_sheet#{i + 1}" + @sheet_files << path + entry.extract(path) + end + end + # Extracts all needed files from the zip file def process_zipfile(tmpdir, zipfilename) @sheet_files = [] @@ -581,19 +593,9 @@ def process_zipfile(tmpdir, zipfilename) # workbook.xml.rel: # # - wb = entries.find{ |e| e.name[/workbook.xml$/] } - wb_rels = entries.find{ |e| e.name[/workbook.xml.rels$/] } - - sheet_ids = extract_worksheet_ids(wb, "#{tmpdir}/roo_workbook.xml") - sheets = extract_worksheets(wb_rels, "#{tmpdir}/roo_workbook.xml.rels") - - sheet_ids.each_with_index do |id, i| - name = sheets[id] - entry = entries.find { |entry| entry.name =~ /#{name}$/ } - path = "#{tmpdir}/roo_sheet#{i + 1}" - @sheet_files << path - entry.extract(path) - end + sheet_ids = extract_worksheet_ids(entries, "#{tmpdir}/roo_workbook.xml") + sheets = extract_worksheet_rels(entries, "#{tmpdir}/roo_workbook.xml.rels") + extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir) entries.each do |entry| path = @@ -608,20 +610,17 @@ def process_zipfile(tmpdir, zipfilename) # In some situations, this isn't true. The true location of a # sheet's comment file is in the sheet1.xml.rels file. SEE # ECMA-376 12.3.3 in "Ecma Office Open XML Part 1". - nr = $1 - @comments_files[nr.to_i-1] = "#{tmpdir}/roo_comments#{nr}" - # I suspect these files need to be ordered appropriately, too + nr = Regexp.last_match[1].to_i + @comments_files[nr - 1] = "#{tmpdir}/roo_comments#{nr}" when /sheet([0-9]+).xml.rels$/ # FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but # it also stores the location for sharedStrings, comments, # drawings, etc. - nr = $1 - @rels_files[nr.to_i-1] = "#{tmpdir}/roo_rels#{nr}" + nr = Regexp.last_match[1].to_i + @rels_files[nr - 1] = "#{tmpdir}/roo_rels#{nr}" end - if path - entry.extract(path) - end + entry.extract(path) if path end end