From 474c40cff5ff34648cb9cca955f4ca55590d5ab1 Mon Sep 17 00:00:00 2001
From: Steven Daniels <stevendaniels88@gmail.com>
Date: Wed, 8 Apr 2015 12:28:04 +0800
Subject: [PATCH 1/2] Extract sheets in the correct order

---
 lib/roo/excelx.rb | 139 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 110 insertions(+), 29 deletions(-)
diff --git a/lib/roo/excelx.rb b/lib/roo/excelx.rb
index fec18959..345db728 100644
--- a/lib/roo/excelx.rb
+++ b/lib/roo/excelx.rb
@@ -505,39 +505,120 @@ def clean_sheet(sheet)
     @cleaned[sheet] = true
   end
 
+  # Internal: extracts the worksheet_ids from the workbook.xml file. xlsx
+  #           documents require a workbook.xml file, so a if the file is missing
+  #           it is not a valid xlsx file. In these cases, an ArgumentError is
+  #           raised.
+  #
+  # wb - a Zip::Entry for the workbook.xml file.
+  # path - A String for Zip::Entry's destination path.
+  #
+  # Examples
+  #
+  #   extract_worksheet_ids(<Zip::Entry>, 'tmpdir/roo_workbook.xml')
+  #   # => ["rId1", "rId2", "rId3"]
+  #
+  # Returns an Array of Strings.
+  def extract_worksheet_ids(wb, path)
+      fail ArgumentError 'missing required workbook file' if wb.nil?
+
+      wb.extract(path)
+      workbook_doc = Roo::Utils.load_xml(path).remove_namespaces!
+      workbook_doc.xpath('//sheet').map{ |s| s.attributes['id'].value }
+  end
+
+  # Internal
+  #
+  # wb_rels - A Zip::Entry for the workbook.xml.rels file.
+  # path - A String for the Zip::Entry's destination path.
+  #
+  # Examples
+  #
+  #   extract_worksheets(<Zip::Entry>, 'tmpdir/roo_workbook.xml.rels')
+  #   # => {
+  #         "rId1"=>"worksheets/sheet1.xml",
+  #         "rId2"=>"worksheets/sheet2.xml",
+  #         "rId3"=>"worksheets/sheet3.xml"
+  #        }
+  #
+  # Returns a Hash.
+  def extract_worksheets(wb_rels, path)
+    fail ArgumentError 'missing required workbook file' if wb_rels.nil?
+
+    wb_rels.extract(path)
+    rels_doc = Roo::Utils.load_xml(path).remove_namespaces!
+    worksheet_type ='http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet'
+
+    relationships = rels_doc.xpath('//Relationship').select do |relationship|
+      relationship.attributes['Type'].value == worksheet_type
+    end
+
+    relationships.inject({}) do |hash, relationship|
+      attributes = relationship.attributes
+      id = attributes['Id'];
+      hash[id.value] = attributes['Target'].value
+      hash
+    end
+  end
+
   # Extracts all needed files from the zip file
   def process_zipfile(tmpdir, zipfilename)
     @sheet_files = []
-    Zip::File.foreach(zipfilename) do |entry|
+    entries = Zip::File.open(zipfilename).to_a.sort_by(&:name)
+
+    # NOTE: When Google or Numbers 3.1 exports to xlsx, the worksheet filenames
+    #       are not in order. With Numbers 3.1, the first sheet is always
+    #       sheet.xml, not sheet1.xml. With Google, the order of the worksheets is
+    #       independent of a worksheet's filename (i.e. sheet6.xml can be the
+    #       first worksheet).
+    #
+    #       workbook.xml lists the correct order of worksheets and
+    #       workbook.xml.rels lists the filenames for those worksheets.
+    #
+    #       workbook.xml:
+    #         <sheet state="visible" name="IS" sheetId="1" r:id="rId3"/>
+    #         <sheet state="visible" name="BS" sheetId="2" r:id="rId4"/>
+    #       workbook.xml.rel:
+    #         <Relationship Id="rId4" Target="worksheets/sheet5.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
+    #         <Relationship Id="rId3" Target="worksheets/sheet4.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
+    wb = entries.find{ |e| e.name[/workbook.xml$/] }
+    wb_rels = entries.find{ |e| e.name[/workbook.xml.rels$/] }
+
+    sheet_ids = extract_worksheet_ids(wb, "#{tmpdir}/roo_workbook.xml")
+    sheets = extract_worksheets(wb_rels, "#{tmpdir}/roo_workbook.xml.rels")
+
+    sheet_ids.each_with_index do |id, i|
+      name = sheets[id]
+      entry = entries.find { |entry| entry.name =~ /#{name}$/ }
+      path = "#{tmpdir}/roo_sheet#{i + 1}"
+      @sheet_files << path
+      entry.extract(path)
+    end
+
+    entries.each do |entry|
       path =
-        case entry.name.downcase
-        when /workbook.xml$/
-          "#{tmpdir}/roo_workbook.xml"
-        when /sharedstrings.xml$/
-          "#{tmpdir}/roo_sharedStrings.xml"
-        when /styles.xml$/
-          "#{tmpdir}/roo_styles.xml"
-        when /sheet.xml$/
-          path = "#{tmpdir}/roo_sheet"
-          @sheet_files.unshift(path)
-          path
-        when /sheet([0-9]+).xml$/
-          # Numbers 3.1 exports first sheet without sheet number. Such sheets
-          # are always added to the beginning of the array which, naturally,
-          # causes other sheets to be pushed to the next index which could
-          # lead to sheet references getting overwritten, so we need to
-          # handle that case specifically.
-          nr = $1
-          sheet_files_index = nr.to_i - 1
-          sheet_files_index += 1 if @sheet_files[sheet_files_index]
-          @sheet_files[sheet_files_index] = "#{tmpdir}/roo_sheet#{nr.to_i}"
-        when /comments([0-9]+).xml$/
-          nr = $1
-          @comments_files[nr.to_i-1] = "#{tmpdir}/roo_comments#{nr}"
-        when /sheet([0-9]+).xml.rels$/
-          nr = $1
-          @rels_files[nr.to_i-1] = "#{tmpdir}/roo_rels#{nr}"
-        end
+      case entry.name.downcase
+      when /sharedstrings.xml$/
+        "#{tmpdir}/roo_sharedStrings.xml"
+      when /styles.xml$/
+        "#{tmpdir}/roo_styles.xml"
+      when /comments([0-9]+).xml$/
+        # FIXME: Most of the time, The order of the comment files are the same
+        #       the sheet order, i.e. sheet1.xml's comments are in comments1.xml.
+        #       In some situations, this isn't true. The true location of a
+        #       sheet's comment file is in the sheet1.xml.rels file. SEE
+        #       ECMA-376 12.3.3 in "Ecma Office Open XML Part 1".
+        nr = $1
+        @comments_files[nr.to_i-1] = "#{tmpdir}/roo_comments#{nr}"
+        # I suspect these files need to be ordered appropriately, too
+      when /sheet([0-9]+).xml.rels$/
+        # FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but
+        #        it also stores the location for sharedStrings, comments,
+        #        drawings, etc.
+        nr = $1
+        @rels_files[nr.to_i-1] = "#{tmpdir}/roo_rels#{nr}"
+      end
+
       if path
         entry.extract(path)
       end

From 4e93b7c76264f66da1cda89320c035faf6c53220 Mon Sep 17 00:00:00 2001
From: Steven Daniels <stevendaniels88@gmail.com>
Date: Thu, 9 Apr 2015 18:13:30 +0800
Subject: [PATCH 2/2] Refactored Roo::Excelx#process_zipfile

Extracted another method from process_zipfile. I could do more
refactoring to reduce the line count, but readability might suffer.
e.g.

      when /sharedstrings.xml$/
        ...
      when /styles\.xml$/
        ...

becomes

    when /(sharedstrings|styles)\.xml$/
      "#{tmpdir}/roo_#{Regexp.last_match[1]}.xml"
---
 lib/roo/excelx.rb | 45 ++++++++++++++++++++++-----------------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/lib/roo/excelx.rb b/lib/roo/excelx.rb
index 345db728..080ee9ac 100644
--- a/lib/roo/excelx.rb
+++ b/lib/roo/excelx.rb
@@ -519,7 +519,8 @@ def clean_sheet(sheet)
   #   # => ["rId1", "rId2", "rId3"]
   #
   # Returns an Array of Strings.
-  def extract_worksheet_ids(wb, path)
+  def extract_worksheet_ids(entries, path)
+      wb = entries.find { |e| e.name[/workbook.xml$/] }
       fail ArgumentError 'missing required workbook file' if wb.nil?
 
       wb.extract(path)
@@ -542,7 +543,8 @@ def extract_worksheet_ids(wb, path)
   #        }
   #
   # Returns a Hash.
-  def extract_worksheets(wb_rels, path)
+  def extract_worksheet_rels(entries, path)
+    wb_rels = entries.find { |e| e.name[/workbook.xml.rels$/] }
     fail ArgumentError 'missing required workbook file' if wb_rels.nil?
 
     wb_rels.extract(path)
@@ -561,6 +563,16 @@ def extract_worksheets(wb_rels, path)
     end
   end
 
+  def extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir)
+    sheet_ids.each_with_index do |id, i|
+      name = sheets[id]
+      entry = entries.find { |entry| entry.name =~ /#{name}$/ }
+      path = "#{tmpdir}/roo_sheet#{i + 1}"
+      @sheet_files << path
+      entry.extract(path)
+    end
+  end
+
   # Extracts all needed files from the zip file
   def process_zipfile(tmpdir, zipfilename)
     @sheet_files = []
@@ -581,19 +593,9 @@ def process_zipfile(tmpdir, zipfilename)
     #       workbook.xml.rel:
     #         <Relationship Id="rId4" Target="worksheets/sheet5.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
     #         <Relationship Id="rId3" Target="worksheets/sheet4.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
-    wb = entries.find{ |e| e.name[/workbook.xml$/] }
-    wb_rels = entries.find{ |e| e.name[/workbook.xml.rels$/] }
-
-    sheet_ids = extract_worksheet_ids(wb, "#{tmpdir}/roo_workbook.xml")
-    sheets = extract_worksheets(wb_rels, "#{tmpdir}/roo_workbook.xml.rels")
-
-    sheet_ids.each_with_index do |id, i|
-      name = sheets[id]
-      entry = entries.find { |entry| entry.name =~ /#{name}$/ }
-      path = "#{tmpdir}/roo_sheet#{i + 1}"
-      @sheet_files << path
-      entry.extract(path)
-    end
+    sheet_ids = extract_worksheet_ids(entries, "#{tmpdir}/roo_workbook.xml")
+    sheets = extract_worksheet_rels(entries, "#{tmpdir}/roo_workbook.xml.rels")
+    extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir)
 
     entries.each do |entry|
       path =
@@ -608,20 +610,17 @@ def process_zipfile(tmpdir, zipfilename)
         #       In some situations, this isn't true. The true location of a
         #       sheet's comment file is in the sheet1.xml.rels file. SEE
         #       ECMA-376 12.3.3 in "Ecma Office Open XML Part 1".
-        nr = $1
-        @comments_files[nr.to_i-1] = "#{tmpdir}/roo_comments#{nr}"
-        # I suspect these files need to be ordered appropriately, too
+        nr = Regexp.last_match[1].to_i
+        @comments_files[nr - 1] = "#{tmpdir}/roo_comments#{nr}"
       when /sheet([0-9]+).xml.rels$/
         # FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but
         #        it also stores the location for sharedStrings, comments,
         #        drawings, etc.
-        nr = $1
-        @rels_files[nr.to_i-1] = "#{tmpdir}/roo_rels#{nr}"
+        nr = Regexp.last_match[1].to_i
+        @rels_files[nr - 1] = "#{tmpdir}/roo_rels#{nr}"
       end
 
-      if path
-        entry.extract(path)
-      end
+      entry.extract(path) if path
     end
   end