forked from HariSekhon/DevOps-Python-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhadoop_hdfs_files_native_checksums.jy
executable file
·193 lines (162 loc) · 6.47 KB
/
hadoop_hdfs_files_native_checksums.jy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env jython
#
# Author: Hari Sekhon
# Date: 2013-06-20 18:21:02 +0100 (Thu, 20 Jun 2013)
#
# https://github.com/harisekhon/devops-python-tools
#
# License: see accompanying LICENSE file
#
# vim:ts=4:sts=4:sw=4:et:filetype=python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
__author__ = 'Hari Sekhon'
__version__ = '0.5.0'
# pylint: disable=wrong-import-position
import os
import sys
# Refusing to use either optparse or argparse since it's annoyingly non-portable across different versions of Python
import getopt
libdir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'pylib'))
sys.path.append(libdir)
try:
from harisekhon.utils import jython_only, die, printerr, ERRORS, \
isJavaOOM, java_oom_fix_msg, log_jython_exception, get_jython_exception
except ImportError as _:
print('module import failed: %s' % _, file=sys.stderr)
sys.exit(4)
jython_only()
try:
from org.apache.hadoop.conf import Configuration
from org.apache.hadoop.fs import Path
#from org.apache.hadoop.hdfs import DistributedFileSystem
from org.apache.hadoop.fs import FileSystem
from org.apache.hadoop.util import StringUtils
#from org.apache.hadoop.security import AccessControlException
except ImportError:
die("Couldn't find Hadoop Java classes, try: jython -J-cp `hadoop classpath` %s <args>" % sys.argv[0])
def usage(*msg):
""" Print usage and exit """
if msg:
printerr("".join(msg))
die("""
Hari Sekhon - https://github.com/harisekhon/devops-python-tools
================================================================================
%s - version %s
================================================================================
Jython program to fetch the HDFS native checksums for given files / all files under given directories.
Quick way of checking for duplicate files but not checking content itself. This is at least ~100x more efficient in terms of data transfer than 'hadoop fs -cat | md5sum'.
Caveat: files with differing block sizes will not match
Will be implemented in hadoop-9209 jira for versions 3.0.0, 0.23.7, 2.1.0-beta using a new command line param: hadoop fs -checksum
USAGE: jython -J-cp `hadoop classpath` %s <list of HDFS files and/or directories>
-n --no-header Don't output headers
""" % (os.path.basename(__file__), __version__, os.path.basename(__file__)), ERRORS["UNKNOWN"])
def main():
""" Parse cli args and launch hdfs_get_file_checksum() """
noheader = False
try:
opts, args = getopt.gnu_getopt(sys.argv[1:], "hn", ["help", "usage", "no-header"])
except getopt.GetoptError as _:
usage("error: %s" % _)
for opt, arg in opts:
if opt in ("-n", "--no-header"):
noheader = True
elif opt in ("-h", "--help", "--usage"):
usage()
else:
usage()
filelist = set()
for arg in args:
filelist.add(arg)
if not filelist:
usage("no file / directory specified")
filelist = sorted(filelist)
try:
HDFSChecksumReader().print_checksums(filelist, noheader)
except KeyboardInterrupt:
printerr("Caught Control-C...", 1)
sys.exit("OK")
except Exception as _:
printerr("Error running HDFSChecksumReader: %s" % _, 1)
if isJavaOOM(_.message):
printerr(java_oom_fix_msg())
#import traceback; traceback.print_exc()
sys.exit("CRITICAL")
except:
log_jython_exception()
sys.exit("CRITICAL")
class HDFSChecksumReader(object):
""" Class to hold HDFS Checksum Read state """
def __init__(self):
""" Instantiate State """
conf = Configuration()
#self.filesystem= DistributedFileSystem.get(conf)
self.filesystem = FileSystem.get(conf)
@staticmethod
def get_path(filename):
""" Return the path object for a given filename """
try:
path = Path(filename)
except Exception:
return None
if path:
return path
else:
return None
def print_checksums(self, filelist, noheader):
""" Recurses directories and calls print_checksum(filename) per file """
if not noheader:
print("=" * 150)
print("%-56s %-25s %-12s %s" % ("Checksum", "Algorithm", "BlockSize", "Filename"))
print("=" * 150)
for filename in filelist:
path = self.get_path(filename)
if not path:
printerr("Failed to get HDFS path object for file " + filename, 1)
continue
if not self.filesystem.exists(path):
#raise IOError, "HDFS File not found: %s" % filename
printerr("HDFS file/dir not found: %s" % filename, 1)
continue
self.print_checksum_recurse(filename, path)
def print_checksum_recurse(self, filename, path):
""" Recurses a path object if directory or passes to print_checksum if file """
if self.filesystem.isFile(path):
self.print_checksum(filename, path)
elif self.filesystem.isDirectory(path):
try:
_ = self.filesystem.listStatus(path)
for i in range(0, len(_)):
path = _[i].getPath()
self.print_checksum_recurse(path.toUri().getPath(), path)
except:
printerr(get_jython_exception().split("\n")[0], 1)
else:
raise IOError, ">>> %s is not a file or directory" % filename
def print_checksum(self, filename, path):
""" Prints the HDFS checksum for the given file + fs path object """
try:
checksum = self.filesystem.getFileChecksum(path)
if checksum:
print("%-56s %-25s %-12s %s" % (
StringUtils.byteToHexString(checksum.getBytes(), 0, checksum.getLength()),
checksum.getAlgorithmName(),
self.filesystem.getFileStatus(path).getBlockSize(),
filename)
)
else:
printerr("<NO CHECKSUM for file " + filename + ">", 1)
return 0
except Exception as _:
printerr(_, 1)
except:
print(">>> " + get_jython_exception().split("\n")[0])
return 0
return 1
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
pass