forked from HariSekhon/DevOps-Python-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpig_udfs.jy
executable file
·110 lines (85 loc) · 3.27 KB
/
pig_udfs.jy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
#
# Author: Hari Sekhon
# Date: 2015-03-12 21:16:54 +0000 (Thu, 12 Mar 2015)
#
# https://github.com/harisekhon/devops-python-tools
#
# License: see accompanying Hari Sekhon LICENSE file
#
# vim:ts=4:sts=4:sw=4:et:filetype=python
"""
Jython utility functions for Pig - primarily written to help me do Pig => SolrCloud indexing
Written on Python 2.6 and tested on Pig 0.14 on Tez via Hortonworks HDP 2.2
call from pig:
REGISTER 'pig_udfs.jy' using jython as hari;
hari.md5('test')
For tonnes more functions that may be useful see the git submodule PyLib
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import hashlib
import os
import platform
import random
import socket
import string
import time
import uuid
__author__ = 'Hari Sekhon'
__version__ = '0.3'
# using pipe separator because it's common for hostname's host components to have dashes in them
# this makes it easier to distinguish the components of the returned chararrays
sep = '|'
alnum_charset = string.ascii_letters + string.digits
pid = str(os.getpid())
hostname = platform.node()
if not hostname:
hostname = platform.uname()[1]
if not hostname:
hostname = os.uname()[1]
if not hostname:
hostname = socket.getfqdn()
############
# shamelessly borrowed from the Pig UDF manual's Jython section :)
@outputSchemaFunction('squareSchema') # pylint: disable=undefined-variable
def square(num):
return (num) * (num)
@schemaFunction('squareSchema') # pylint: disable=undefined-variable
def squareSchema(input): # pylint: disable=invalid-name,redefined-builtin
return input
@outputSchema('percent:double') # pylint: disable=undefined-variable
def percent(num, total):
return num * 100 / total
############
@outputSchema('time_hostname:chararray') # pylint: disable=undefined-variable
def time_hostname():
# time precision here is a trade off between space and uniqueness, I find 4 decimal places isn't unique but 6 is
return '%.6f' % time.time() + sep + hostname
@outputSchema('time_hostname_pid:chararray') # pylint: disable=undefined-variable
def time_hostname_pid():
return time_hostname() + sep + pid
@outputSchema('random_alnum:chararray') # pylint: disable=undefined-variable
def random_alnum(size):
return ''.join(random.choice(alnum_charset) for x in range(size))
@outputSchema('random_uuid:chararray') # pylint: disable=undefined-variable
def random_uuid(size):
return time_hostname_pid() + random_alnum(size)
@outputSchema('md5:chararray') # pylint: disable=undefined-variable
def md5(my_string):
return hashlib.md5(my_string).hexdigest()
@outputSchema('md5_uuid:chararray') # pylint: disable=undefined-variable
def md5_uuid(my_string):
return time_hostname_pid() + sep + md5(my_string)
@outputSchema('sha1:chararray') # pylint: disable=undefined-variable
def sha1(my_string):
return hashlib.sha1(my_string).hexdigest()
@outputSchema('sha1_uuid:chararray') # pylint: disable=undefined-variable
def sha1_uuid(my_string):
return time_hostname_pid() + sep + sha1(my_string)
# uuid module doesn't seem to have uuid4() in Python 2.6, but does on Python 2.7
@outputSchema('uuid4:chararray') # pylint: disable=undefined-variable
def uuid4():
return uuid.uuid4().get_hex()