-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdocx_draft_layout.rb
executable file
·184 lines (137 loc) · 4.56 KB
/
docx_draft_layout.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env ruby -W0
###################################################
###
## File: docx_draft_layout.rb
## Desc: Displays the paragraph style and contents of an MS Word *.docx file
#
require 'amazing_print' # Pretty print Ruby objects with proper indentation and colors
require 'pathname' # STDLIB
require 'docx' # a ruby library/gem for interacting with .docx files
require 'word_wrapper' # Pure ruby word wrapping
# Layout constants: the paragraph-style column is style_width characters wide
# and paragraph text is wrapped at four times that width.
style_width = 20
line_width = 4 * style_width

# Command line switches.  Array#delete returns the removed element, or nil when
# it was absent, so the switches are recognised in any position and in any
# order; whatever remains in ARGV afterwards is treated as a file name.
html_desired = !ARGV.delete('--html').nil?
show_char_styles = ARGV.delete('--no-char').nil?

# Show the usage text when no file was given or help was requested.
if ARGV.empty? || ARGV.include?('-h') || ARGV.include?('--help')
  # Report the name the script was actually invoked as instead of a
  # hard-coded (and possibly stale) file name.
  script_name = File.basename($PROGRAM_NAME)

  puts
  puts "Usage: #{script_name} [options] MS_WORD_DOCX++"
  puts
  puts " Where:"
  puts " MS_WORD_DOCX++ is one or more Microsoft Word DOCX filenames"
  puts
  puts " options are:"
  puts
  puts " -h or --help This usage message is produced"
  puts " --html Produces an HTML file for each input file"
  puts " --no-char Do not show character styles within content"
  puts
  exit
end
# Core extension used to show character styles inline with paragraph text.
class String
  # Surrounds the receiver with a pseudo-tag that names a character style.
  #
  # style             - the name placed inside the opening and closing tag
  # html_preformatted - when true the angle brackets are written as HTML
  #                     entities, so that a browser displays the tag literally
  #                     instead of interpreting it as markup
  #
  # Returns a new String; the receiver is not modified.
  def wrap_with_style(style, html_preformatted = false)
    if html_preformatted
      "&lt;#{style}&gt;#{self}&lt;/#{style}&gt;"
    else
      "<#{style}>#{self}</#{style}>"
    end
  end
end # of class String
# SMELL: This returns a string in which the spaces have been squeezed out
#
# Looks up the style name of a paragraph.
#
# The name is read from the value of the first attribute of the first
# grandchild of the paragraph's XML node (presumably the paragraph style
# element inside the paragraph properties -- confirm against the docx gem).
#
# para - an object responding to #node, as yielded by the document's
#        paragraphs collection
#
# Returns the style name as a String, or "Normal" when the lookup fails
# because the expected structure is missing.  Only StandardError is rescued,
# so Interrupt (Ctrl-C) and SystemExit still propagate.
def paragraph_style(para)
  style_element = para.node.children.children.first.attributes.first.last
  style_element.value.to_s
rescue StandardError
  "Normal"
end # of def paragraph_style(para)
# A paragraph consists of one or more text_runs
# A paragraph has a style
# A text_run has a consistent style
#
# Builds the text of a paragraph, optionally marking each text run with the
# name of its character style.
#
# para       - an object responding to #text_runs; each run responds to
#              #node and #text
# html       - passed through to String#wrap_with_style as its
#              html_preformatted argument
# char_style - when false the run texts are concatenated without any markup
#
# Returns a String holding the text of all runs in order.
def paragraph_contents(para, html = false, char_style = true)
  contents_string = +''

  para.text_runs.each do |tr|
    # The style marker is the first grandchild of the run node when there is
    # one, otherwise the first child itself.  A run with no children at all
    # yields nil and is treated as unstyled text instead of raising.
    first_child = tr.node.children[0]
    marker = first_child&.children&.first || first_child
    name = marker&.name

    style =
      case name
      when 'rStyle' then marker.attributes.first&.last&.value
      when 'text', nil then nil
      else name
      end

    # SMELL: The TAB character is implemented as a character style; this code
    # does not take into account tab stops, etc. May want to insert some
    # spaces or let the tab style through regardless the value of char_style
    text = tr.text
    contents_string << (char_style && style ? text.wrap_with_style(style, html) : text)
  end # of para.text_runs.each do |tr|

  contents_string
end # end of def paragraph_contents(para)
######################################################################
## Main Loop around the ARGV which should contain only file names
##
## For every *.docx file named on the command line a draft layout is written:
## one entry per paragraph, with the paragraph style name in a left-hand
## column followed by the word-wrapped paragraph contents.  With --html the
## layout goes to "<input file name>.html" next to the input file, otherwise
## it goes to STDOUT.  Arguments without a .docx extension are skipped with a
## warning on STDERR.
out_file = STDOUT

ARGV.each do |param|
  given_document = Pathname.new(param)

  unless '.docx' == given_document.extname.downcase
    STDERR.puts
    STDERR.puts "WARNING: Not a *.docx file -- skipping."
    STDERR.puts " File: #{given_document}"
    STDERR.puts
    next
  end

  d = Docx::Document.open(given_document.to_s)

  begin
    if html_desired
      html_pathname = given_document.dirname + (given_document.basename.to_s + ".html")
      out_file = html_pathname.open("w")
      out_file.puts "<html><head><title>#{given_document}</title></head>"
      out_file.puts "<body>"
      out_file.puts '<a href="javascript:window.print()">Send to Printer</a><br /><br />'
      out_file.puts "<pre>"
    end

    out_file.puts
    out_file.puts "MS Word (docx) File Name: #{given_document}"
    out_file.puts "Draft Layout Generated on: #{Time.now}"
    out_file.puts "-" * (30 + given_document.to_s.length)
    out_file.puts

    d.paragraphs.each do |para|
      style = paragraph_style(para)
      contents = paragraph_contents(para, html_desired, show_char_styles)

      # Pad the style name out to the column width; a name that fills or
      # overflows the column is followed by three spaces instead.
      padding = style_width > style.size ? style_width - style.size : 3
      out_file.print style + " " * padding

      lines = WordWrapper::MinimumRaggedness.new(line_width, contents).wrap.split("\n")

      # Each wrapped line is followed by the indentation for the next one, so
      # continuation lines align under the first; after the last line that
      # indentation is terminated by the puts below.
      lines.each do |a_line|
        out_file.puts a_line
        out_file.print " " * style_width
      end

      out_file.puts
    end

    out_file.puts

    if html_desired
      out_file.puts "</pre>"
      out_file.puts "</body>"
      out_file.puts "</html>"
    end
  ensure
    # Close the per-document HTML file, even when rendering raised, so that
    # no file handle is left open per input file.  STDOUT is never closed.
    if html_desired && !out_file.equal?(STDOUT)
      out_file.close
      out_file = STDOUT
    end
  end
end # end of ARGV.each do |param|