Spark Job for Removing Empty XML tags

The Spark program below reads a set of XML files, parses them, removes the empty tags, and finally writes the output as a sequence file.

package com.spark

import org.apache.spark.{SparkConf, SparkContext}

// Finds empty tags, removes them, and then writes the result as sequence files
/**
 * Spark job that removes empty inline tags (`<sub/>`, `<sup/>`, `<i/>`, `<b/>`)
 * from a set of XML files and writes the cleaned documents to a sequence file
 * keyed by the original file name.
 *
 * Example:
 * {{{
 * bin/spark-submit --class com.spark.TagHandler --master local \
 *   tagHandler-assembly-1.0.jar /data/xml /data/output
 * }}}
 */
object TagHandler {

  /** Self-closing tags that carry no content and are stripped from every document. */
  private val EmptyTags: List[String] = List("<sub/>", "<sup/>", "<i/>", "<b/>")

  /**
   * Returns `content` with every occurrence of each empty tag removed.
   *
   * @param content the full text of one XML document
   * @return the document text without any of the tags in `EmptyTags`
   */
  def removeEmptyTags(content: String): String =
    EmptyTags.foldLeft(content)((text, tag) => text.replace(tag, ""))

  /**
   * Entry point.
   *
   * @param args `args(0)` is the source file or directory, `args(1)` is the
   *             destination directory for the sequence file output
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      println("Usage <Source File or Directory> <Destination File or Directory>")
      // bin/spark-submit --class com.spark.TagHandler --master local tagHandler-assembly-1.0.jar /data/xml /data/output
      // Stop here: continuing would fail on args(0) with an ArrayIndexOutOfBoundsException.
      sys.exit(1)
    }

    val inFile = args(0)
    val outFile = args(1)

    val conf = new SparkConf().setAppName("TagHandler")
    val sc = new SparkContext(conf)
    try {
      // Transform on the executors and save exactly once. Collecting to the driver
      // and saving per file would rewrite the same output path for every file,
      // which fails as soon as the directory already exists.
      sc.wholeTextFiles(inFile)
        .map { case (fileName, content) => (fileName, removeEmptyTags(content)) }
        .saveAsSequenceFile(outFile)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}

 

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s