-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathavro.py
More file actions
21 lines (14 loc) · 734 Bytes
/
avro.py
File metadata and controls
21 lines (14 loc) · 734 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
from pyspark import SparkContext
from pyspark.sql.context import SQLContext
# Filter an Avro dataset with Spark: read the input file, drop rows whose
# 'domain' column equals 'www.google.com', and write the rest back as Avro.
sc = SparkContext()
sqlContext = SQLContext(sc)
# Write Avro output using the snappy compression codec.
sqlContext.setConf('spark.sql.avro.compression.codec', 'snappy')

# Read an Avro file from HDFS; a local file shipped with the job
# (the --files option) can be read as well.
input_path = '/tmp/page.avro'
output_path = '/tmp/result.avro'

# Fix: the reader must be given the input path; the original called load()
# with no argument, leaving input_path unused.
df = sqlContext.read.format("com.databricks.spark.avro").load(input_path)
df = df.filter(df['domain'] != 'www.google.com')

# Remove any previous output (recursively) through the Hadoop FileSystem API
# before writing, so a leftover result from an earlier run does not get in
# the way of save().
hadoop_fs = sc._jvm.org.apache.hadoop.fs
fs = hadoop_fs.FileSystem.get(sc._jsc.hadoopConfiguration())
output_location = hadoop_fs.Path(output_path)
if fs.exists(output_location):
    fs.delete(output_location, True)

# Collapse to a single partition before writing the result.
df.repartition(1).write.format('com.databricks.spark.avro').save(output_path)