-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
157 lines (132 loc) · 5.14 KB
/
Makefile
File metadata and controls
157 lines (132 loc) · 5.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# ---- Job and local (standalone) settings --------------------------------
# Hadoop installation whose bin/ and sbin/ scripts the recipes invoke.
hadoop.root := /usr/local/hadoop-3.3.5
# Jar file name, and its path relative to this Makefile.
jar.name := mr-design-patterns-1.0.jar
jar.path := target/$(jar.name)
# Fully qualified class name passed to `hadoop jar`.
job.name := mr.design.patterns.inputoutput.generatingdata.RandomDataGeneration
# First and second job arguments for the `local` target.
# NOTE(review): local.input is also used as a source directory by
# upload-input-hdfs and upload-input-aws; "4" looks job-specific - confirm.
local.input := 4
local.output := output
# Optional extra job arguments; `?=` keeps any value that is already set.
local.parameter_1 ?= 100
local.parameter_2 ?= /Users/bob/Desktop/outputs/design-patterns-mapreduce/input/words_alpha.txt
local.parameter_3 ?=
# DistributedGrep = '<row [^>]*?/>'
# Local directory that receives logs downloaded from S3.
local.log := log
# Not referenced by any rule visible in this file.
max.value := 500
# ---- Pseudo-cluster execution -------------------------------------------
# HDFS user whose home directory (/user/<name>) is created by init-hdfs.
hdfs.user.name := forhadoop
# Input and output paths handed to the job and to `hdfs dfs`.
hdfs.input := input
hdfs.output := output
# ---- AWS EMR execution --------------------------------------------------
# EMR release label passed to `aws emr create-cluster`.
aws.emr.release := emr-6.0.0
# NOTE(review): aws.region and aws.subnet.id are not referenced by any rule
# visible in this file - confirm whether they are still needed.
aws.region := us-east-2
# S3 bucket holding the jar, input, output and logs.
aws.bucket.name := replicatedjoin-aamod
aws.subnet.id := subnet-065f896d
# Key prefixes inside the bucket.
aws.input := input
aws.output := output
aws.log.dir := log
# Number of CORE instances (one MASTER is added on top) and their type.
aws.num.nodes := 6
aws.instance.type := m4.large
# -----------------------------------------------------------
# Compiles code and builds jar (with dependencies) by running Maven.
# Declared phony so a file or directory named "jar" cannot mask the rule.
.PHONY: jar
jar:
	mvn clean package
# Removes the local output directory, plus any other path in the working
# directory whose name starts with the same prefix (note the trailing `*`).
# The guard aborts when local.output is empty, because the command would
# otherwise expand to `rm -rf *` and wipe the working directory.
.PHONY: clean-local-output
clean-local-output:
	$(if $(strip ${local.output}),,$(error local.output is empty - refusing to run rm -rf *))
	rm -rf ${local.output}*
# Removes the local log directory, plus any other path in the working
# directory whose name starts with the same prefix (note the trailing `*`).
# The guard aborts when local.log is empty, because the command would
# otherwise expand to `rm -rf *` and wipe the working directory.
.PHONY: clean-local-log
clean-local-log:
	$(if $(strip ${local.log}),,$(error local.log is empty - refusing to run rm -rf *))
	rm -rf ${local.log}*
# Runs standalone: rebuilds the jar, clears the previous local output, then
# launches the job with the local.* arguments.
# Make sure Hadoop is set up (in /etc/hadoop files) for standalone operation (not pseudo-cluster).
# https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html#Standalone_Operation
# Declared phony so a file or directory named "local" cannot mask the rule.
.PHONY: local
local: jar clean-local-output
	${hadoop.root}/bin/hadoop jar ${jar.path} ${job.name} ${local.input} ${local.output} ${local.parameter_1} ${local.parameter_2} ${local.parameter_3}
# Start HDFS using the start-dfs.sh script of the configured Hadoop install.
.PHONY: start-hdfs
start-hdfs:
	${hadoop.root}/sbin/start-dfs.sh
# Stop HDFS using the stop-dfs.sh script of the configured Hadoop install.
.PHONY: stop-hdfs
stop-hdfs:
	${hadoop.root}/sbin/stop-dfs.sh
# Start YARN. stop-yarn is a prerequisite, so YARN is always stopped
# before start-yarn.sh is invoked.
.PHONY: start-yarn
start-yarn: stop-yarn
	${hadoop.root}/sbin/start-yarn.sh
# Stop YARN using the stop-yarn.sh script of the configured Hadoop install.
.PHONY: stop-yarn
stop-yarn:
	${hadoop.root}/sbin/stop-yarn.sh
# Reformats & initializes HDFS: stops HDFS (prerequisite), deletes every
# path matching /tmp/hadoop*, then formats the namenode.
# NOTE(review): /tmp/hadoop* is presumably Hadoop's default data location -
# confirm against the active configuration before relying on this.
.PHONY: format-hdfs
format-hdfs: stop-hdfs
	rm -rf /tmp/hadoop*
	${hadoop.root}/bin/hdfs namenode -format
# Initializes user & input directories of HDFS.
# Starts HDFS (prerequisite), removes any existing /user tree, then
# recreates /user/<hdfs.user.name>/<hdfs.input>. A single `-mkdir -p`
# creates all three directory levels in one call.
.PHONY: init-hdfs
init-hdfs: start-hdfs
	${hadoop.root}/bin/hdfs dfs -rm -r -f /user
	${hadoop.root}/bin/hdfs dfs -mkdir -p /user/${hdfs.user.name}/${hdfs.input}
# Load data to HDFS: copies everything under the local.input directory into
# /user/<hdfs.user.name>/<hdfs.input>, after starting HDFS (prerequisite).
# NOTE(review): this treats local.input as a directory path - confirm the
# configured value is one before running.
.PHONY: upload-input-hdfs
upload-input-hdfs: start-hdfs
	${hadoop.root}/bin/hdfs dfs -put ${local.input}/* /user/${hdfs.user.name}/${hdfs.input}
# Removes the HDFS output directory, plus any other path whose name starts
# with the same prefix (note the trailing `*`).
# The guard aborts when hdfs.output is empty, so the command can never
# degrade to a bare `*` pattern.
.PHONY: clean-hdfs-output
clean-hdfs-output:
	$(if $(strip ${hdfs.output}),,$(error hdfs.output is empty - refusing to run hdfs dfs -rm -r on *))
	${hadoop.root}/bin/hdfs dfs -rm -r -f ${hdfs.output}*
# Download output from HDFS to local.
# The previous local output is cleared first (prerequisite). `mkdir -p` is
# used so the recipe does not fail if the directory already exists.
.PHONY: download-output-hdfs
download-output-hdfs: clean-local-output
	mkdir -p ${local.output}
	${hadoop.root}/bin/hdfs dfs -get ${hdfs.output}/* ${local.output}
# Runs pseudo-clustered (ALL). ONLY RUN THIS ONCE, THEN USE: make pseudoq
# Make sure Hadoop is set up (in /etc/hadoop files) for pseudo-clustered operation (not standalone).
# https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html#Pseudo-Distributed_Operation
# The prerequisite list relies on make processing it left to right (stop,
# format, init, upload, start), so do not run this target with -j.
# The download step is a sub-make because download-output-hdfs depends on
# clean-local-output, which has already run once as a prerequisite here.
# $(MAKE) is used instead of a literal `make` so flags such as -n are
# passed through to the sub-make.
.PHONY: pseudo
pseudo: jar stop-yarn format-hdfs init-hdfs upload-input-hdfs start-yarn clean-local-output
	${hadoop.root}/bin/hadoop jar ${jar.path} ${job.name} ${hdfs.input} ${hdfs.output}
	$(MAKE) download-output-hdfs
# Runs pseudo-clustered (quickie): rebuilds the jar, clears the local and
# HDFS output, runs the job, then downloads the result.
# $(MAKE) is used instead of a literal `make` so flags such as -n are
# passed through to the sub-make.
.PHONY: pseudoq
pseudoq: jar clean-local-output clean-hdfs-output
	${hadoop.root}/bin/hadoop jar ${jar.path} ${job.name} ${hdfs.input} ${hdfs.output}
	$(MAKE) download-output-hdfs
# Create S3 bucket named by aws.bucket.name.
.PHONY: make-bucket
make-bucket:
	aws s3 mb s3://${aws.bucket.name}
# Upload data to S3 input dir: syncs the local.input directory to
# s3://<bucket>/<aws.input>, after creating the bucket (prerequisite).
.PHONY: upload-input-aws
upload-input-aws: make-bucket
	aws s3 sync ${local.input} s3://${aws.bucket.name}/${aws.input}
# Delete S3 output dir: removes every object in the bucket whose key starts
# with aws.output. Everything is excluded first, then only that prefix is
# included again.
.PHONY: delete-output-aws
delete-output-aws:
	aws s3 rm s3://${aws.bucket.name}/ --recursive --exclude "*" --include "${aws.output}*"
# Upload application to S3 bucket: copies the built jar to the bucket root.
.PHONY: upload-app-aws
upload-app-aws:
	aws s3 cp ${jar.path} s3://${aws.bucket.name}
# Main EMR launch.
# Rebuilds the jar, uploads it, clears the previous S3 output, then creates
# an auto-terminating EMR cluster that runs the job as a single custom-JAR
# step. The step arguments are the job class, the S3 input and the S3 output.
# The "Args" array in --steps must be closed with `]` before "Type";
# otherwise the JSON is malformed.
# Declared phony so a file or directory named "aws" cannot mask the rule.
.PHONY: aws
aws: jar upload-app-aws delete-output-aws
	aws emr create-cluster \
		--name "Replicated Join MR" \
		--release-label ${aws.emr.release} \
		--instance-groups '[{"InstanceCount":${aws.num.nodes},"InstanceGroupType":"CORE","InstanceType":"${aws.instance.type}"},{"InstanceCount":1,"InstanceGroupType":"MASTER","InstanceType":"${aws.instance.type}"}]' \
		--applications Name=Hadoop \
		--steps '[{"Args":["${job.name}","s3://${aws.bucket.name}/${aws.input}","s3://${aws.bucket.name}/${aws.output}"],"Type":"CUSTOM_JAR","Jar":"s3://${aws.bucket.name}/${jar.name}","ActionOnFailure":"TERMINATE_CLUSTER","Name":"Custom JAR"}]' \
		--log-uri s3://${aws.bucket.name}/${aws.log.dir} \
		--use-default-roles \
		--enable-debugging \
		--auto-terminate
# Download output from S3: syncs the job output and the cluster logs into
# the local output and log directories.
# Both local directories are cleared first (prerequisites). `mkdir -p` is
# used so the recipe does not fail if a directory already exists.
.PHONY: download-output-aws
download-output-aws: clean-local-output clean-local-log
	mkdir -p ${local.output}
	aws s3 sync s3://${aws.bucket.name}/${aws.output} ${local.output}
	mkdir -p ${local.log}
	aws s3 sync s3://${aws.bucket.name}/${aws.log.dir} ${local.log}
# Change to standalone mode: copies the XML files from config/standalone
# over those in the Hadoop etc/hadoop directory.
.PHONY: switch-standalone
switch-standalone:
	cp config/standalone/*.xml ${hadoop.root}/etc/hadoop
# Change to pseudo-cluster mode: copies the XML files from config/pseudo
# over those in the Hadoop etc/hadoop directory.
.PHONY: switch-pseudo
switch-pseudo:
	cp config/pseudo/*.xml ${hadoop.root}/etc/hadoop
# Package for release.
# Stages src, config, input, pom.xml, Makefile and README.txt under
# build/deliv/MR-Demo, then archives that tree as MR-Demo.tar.gz and
# MR-Demo.zip in the project root. Old archives and the build directory are
# removed first.
# Declared phony so a file or directory named "distro" cannot mask the rule.
.PHONY: distro
distro:
	rm -f MR-Demo.tar.gz
	rm -f MR-Demo.zip
	rm -rf build
	mkdir -p build/deliv/MR-Demo
	cp -r src build/deliv/MR-Demo
	cp -r config build/deliv/MR-Demo
	cp -r input build/deliv/MR-Demo
	cp pom.xml build/deliv/MR-Demo
	cp Makefile build/deliv/MR-Demo
	cp README.txt build/deliv/MR-Demo
	tar -czf MR-Demo.tar.gz -C build/deliv MR-Demo
	cd build/deliv && zip -rq ../../MR-Demo.zip MR-Demo