# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Application configuration file in HOCON format (Human-Optimized Config Object Notation).
# HOCON syntax is defined at http://github.com/typesafehub/config/blob/master/HOCON.md
# and also used by Akka (http://www.akka.io) and Play (http://www.playframework.org/).
# For more examples see http://doc.akka.io/docs/akka/2.1.2/general/configuration.html
# morphline.conf example file
# this is a comment
# Specify server locations in a SOLR_LOCATOR variable; used later in variable substitutions:
SOLR_LOCATOR : {
  # Name of solr collection
  collection : collection1

  # ZooKeeper ensemble
  zkHost : "127.0.0.1:9983"

  # The maximum number of documents to send to Solr per network batch (throughput knob)
  # batchSize : 100
}
# Specify an array of one or more morphlines, each of which defines an ETL
# transformation chain. A morphline consists of one or more (potentially
# nested) commands. A morphline is a way to consume records (e.g. Flume events,
# HDFS files or blocks), turn them into a stream of records, and pipe the stream
# of records through a set of easily configurable transformations on its way to
# Solr.
morphlines : [
  {
    # Name used to identify a morphline. E.g. used if there are multiple
    # morphlines in a morphline config file.
    id : morphline1

    # Import all morphline commands in these java packages and their subpackages.
    # Other commands that may be present on the classpath are not visible to this morphline.
    importCommands : ["org.kitesdk.**", "org.apache.solr.**"]

    commands : [
      {
        # Parse Avro container file and emit a record for each avro object.
        readAvroContainer {
          # Optionally, require the input record to match one of these MIME types:
          # supportedMimeTypes : [avro/binary]

          # Optionally, use a custom Avro schema in JSON format inline:
          # readerSchemaString : """<json can go here>"""

          # Optionally, use a custom Avro schema file in JSON format:
          # readerSchemaFile : /path/to/syslog.avsc
        }
      }

      {
        # Consume the output record of the previous command and pipe another
        # record downstream.
        #
        # extractAvroPaths is a command that uses zero or more avro path
        # expressions to extract values from an Avro object. Each expression
        # consists of a record output field name (on the left side of the
        # colon ':') as well as zero or more path steps (on the right hand
        # side), each path step separated by a '/' slash. Avro arrays are
        # traversed with the '[]' notation.
        #
        # The result of a path expression is a list of objects, each of which
        # is added to the given record output field.
        #
        # The path language supports all Avro concepts, including nested
        # structures, records, arrays, maps, unions, etc, as well as a flatten
        # option that collects the primitives in a subtree into a flat list.
        extractAvroPaths {
          flatten : false
          paths : {
            id : /id
            text : /text
            user_friends_count : /user_friends_count
            user_location : /user_location
            user_description : /user_description
            user_statuses_count : /user_statuses_count
            user_followers_count : /user_followers_count
            user_name : /user_name
            user_screen_name : /user_screen_name
            created_at : /created_at
            retweet_count : /retweet_count
            retweeted : /retweeted
            in_reply_to_user_id : /in_reply_to_user_id
            source : /source
            in_reply_to_status_id : /in_reply_to_status_id
            media_url_https : /media_url_https
            expanded_url : /expanded_url
          }
        }
      }

      # Consume the output record of the previous command and pipe another
      # record downstream.
      #
      # Convert timestamp field to native Solr timestamp format,
      # e.g. 2012-09-06T07:14:34Z to 2012-09-06T07:14:34.000Z
      {
        convertTimestamp {
          field : created_at
          inputFormats : ["yyyy-MM-dd'T'HH:mm:ss'Z'", "yyyy-MM-dd"]
          inputTimezone : UTC
          # outputFormat : "yyyy-MM-dd'T'HH:mm:ss.SSSZ"
          outputTimezone : America/Los_Angeles
        }
      }

      # Consume the output record of the previous command and pipe another
      # record downstream.
      #
      # Command that sanitizes record fields that are unknown to Solr
      # schema.xml by either deleting them (renameToPrefix is absent or a zero
      # length string), or by moving them to a field prefixed with the given
      # renameToPrefix (e.g. renameToPrefix = "ignored_" to use typical
      # dynamic Solr fields).
      #
      # Recall that Solr throws an exception on any attempt to load a document
      # that contains a field that isn't specified in schema.xml.
      {
        sanitizeUnknownSolrFields {
          # Location from which to fetch Solr schema
          solrLocator : ${SOLR_LOCATOR}
          # renameToPrefix : "ignored_"
        }
      }

      # Log the record at DEBUG level to SLF4J.
      { logDebug { format : "output record: {}", args : ["@{}"] } }

      # Load the record into a Solr server or MapReduce Reducer.
      {
        loadSolr {
          solrLocator : ${SOLR_LOCATOR}
        }
      }
    ]
  }
]