This page shows several examples of writing (compressing) and reading (decompressing) files with Spark, using different storage formats (JSON, Parquet, ORC) and different compression codecs.
Compress JSON Files
// Write the same small DataFrame as JSON three times: uncompressed, gzip and snappy.
// Intended for spark-shell, where `sc` and the implicits needed by `toDF()` are in scope.
val rdd = sc.parallelize(Array(1, 2, 3, 4, 5)) // define the RDD
val df = rdd.toDF()                            // convert the RDD to a DataFrame

// No "compression" option is set here, so the JSON writer's default applies.
df.write
  .mode("overwrite")
  .format("json")
  .save("hdfs:///formats/file_no_compression_json")

df.write
  .mode("overwrite")
  .format("json")
  .option("compression", "gzip")
  .save("hdfs:///formats/file_with_gzip_json")

df.write
  .mode("overwrite")
  .format("json")
  .option("compression", "snappy")
  .save("hdfs:///formats/file_with_snappy_json")
Compress Parquet Files
// Write the same small DataFrame as Parquet three times: uncompressed, gzip and snappy.
// Intended for spark-shell, where `sc` and the implicits needed by `toDF()` are in scope.
val rdd = sc.parallelize(Array(1, 2, 3, 4, 5)) // define the RDD
val df = rdd.toDF()                            // convert the RDD to a DataFrame

// Parquet output falls back to the session-wide codec
// (spark.sql.parquet.compression.codec) when no option is given, so
// "none" must be requested explicitly to get an uncompressed file.
df.write
  .mode("overwrite")
  .format("parquet")
  .option("compression", "none")
  .save("hdfs:///formats/file_no_compression_parquet")

df.write
  .mode("overwrite")
  .format("parquet")
  .option("compression", "gzip")
  .save("hdfs:///formats/file_with_gzip_parquet")

df.write
  .mode("overwrite")
  .format("parquet")
  .option("compression", "snappy")
  .save("hdfs:///formats/file_with_snappy_parquet")
Compress ORC Files
// Write the same small DataFrame as ORC three times: uncompressed, zlib and snappy.
// Intended for spark-shell, where `sc` and the implicits needed by `toDF()` are in scope.
val rdd = sc.parallelize(Array(1, 2, 3, 4, 5)) // define the RDD
val df = rdd.toDF()                            // convert the RDD to a DataFrame

// The codec is always passed explicitly, including "none" for the uncompressed copy.
df.write
  .mode("overwrite")
  .format("orc")
  .option("compression", "none")
  .save("hdfs:///formats/file_no_compression_orc")

df.write
  .mode("overwrite")
  .format("orc")
  .option("compression", "zlib")
  .save("hdfs:///formats/file_with_zlib_orc")

df.write
  .mode("overwrite")
  .format("orc")
  .option("compression", "snappy")
  .save("hdfs:///formats/file_with_snappy_orc")
Decompress (read) JSON files
// Read back each JSON output directory and pull its rows to the driver.
// No codec is passed on the read side; only the path differs between the three reads.
// Each result is a DataFrame, so each gets its own descriptive name
// instead of re-declaring a single `rdd` value.
val plainJsonDf = sqlContext.read.json("hdfs:///formats/file_no_compression_json")
plainJsonDf.collect()

val gzipJsonDf = sqlContext.read.json("hdfs:///formats/file_with_gzip_json")
gzipJsonDf.collect()

val snappyJsonDf = sqlContext.read.json("hdfs:///formats/file_with_snappy_json")
snappyJsonDf.collect()
Decompress (read) Parquet files
// Read back each Parquet output directory and pull its rows to the driver.
// No codec is passed on the read side; only the path differs between the three reads.
// Each result is a DataFrame, so each gets its own descriptive name
// instead of re-declaring a single `rdd` value.
val plainParquetDf = sqlContext.read.parquet("hdfs:///formats/file_no_compression_parquet")
plainParquetDf.collect()

val gzipParquetDf = sqlContext.read.parquet("hdfs:///formats/file_with_gzip_parquet")
gzipParquetDf.collect()

val snappyParquetDf = sqlContext.read.parquet("hdfs:///formats/file_with_snappy_parquet")
snappyParquetDf.collect()
Decompress (read) ORC files
// Read back each ORC output directory and pull its rows to the driver.
// No codec is passed on the read side; only the path differs between the three reads.
// Each result is a DataFrame, so each gets its own descriptive name
// instead of re-declaring a single `rdd` value.
val plainOrcDf = sqlContext.read.orc("hdfs:///formats/file_no_compression_orc")
plainOrcDf.collect()

val zlibOrcDf = sqlContext.read.orc("hdfs:///formats/file_with_zlib_orc")
zlibOrcDf.collect()

val snappyOrcDf = sqlContext.read.orc("hdfs:///formats/file_with_snappy_orc")
snappyOrcDf.collect()
0 Comments