Spark XML – How to replace hyphen symbols found in XML elements

Using regular-expression matching:


import org.apache.spark.{SparkConf, SparkContext}

/**
 * Spark job that renames XML elements by replacing every hyphen in a tag name
 * with an underscore, using a regular expression over the raw file text.
 *
 * Expects two arguments: the source path (file or directory) and the
 * destination directory. All rewritten documents are written to the
 * destination in a single save, one document per record.
 */
object TransFormXmlApp {
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      println("Usage Source File Destination File")
      // Stop here: falling through would fail on args(0) with an
      // ArrayIndexOutOfBoundsException instead of a clean usage error.
      sys.exit(1)
    }
    val inFile = args(0)
    val outFile = args(1)
    val conf = new SparkConf().setAppName("TransFormXmlApp")
    val sc = new SparkContext(conf)
    try {
      // Matches the opening "<" or "</" of a tag plus the full (ASCII) XML
      // element name, including digits, '_', '.' and ':' so that names such as
      // "item-2-id" are rewritten completely. Attributes and text are untouched.
      val htmlReg = "</?[A-Za-z_:][A-Za-z0-9_.:-]*".r

      // Rewrite on the executors and save exactly once. Calling saveAsTextFile
      // per input file would fail on the second file because the output
      // directory already exists.
      sc.wholeTextFiles(inFile)
        .map { case (_, content) =>
          htmlReg.replaceAllIn(content, m => m.matched.replace("-", "_"))
        }
        .saveAsTextFile(outFile)
    } finally {
      // Always release the Spark context, even if the job fails.
      sc.stop()
    }
  }
}

Using scala-xml API:


import org.apache.spark.{SparkConf, SparkContext}

import scala.xml._
import scala.xml.transform._

/**
 * Spark job that renames XML elements using the scala-xml transformation API:
 * every hyphen in an element label is replaced with an underscore.
 *
 * Expects two arguments: the source path (file or directory) and the
 * destination directory. All rewritten documents are written to the
 * destination in a single save.
 */
object TransFormXmlApp {

  /** Builds a rule that rewrites the label of every element with `rename`; other nodes pass through. */
  private def relabelRule(rename: String => String): RewriteRule = new RewriteRule {
    override def transform(n: Node): Seq[Node] = n match {
      // The `false` argument is minimizeEmpty: empty elements are written as <a></a>, not <a/>.
      case Elem(prefix, label, attribs, scope, children @ _*) =>
        Elem(prefix, rename(label), attribs, scope, false, children: _*)
      case other => other
    }
  }

  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      println("Usage Source File Destination File")
      // Stop here: falling through would fail on args(0) with an
      // ArrayIndexOutOfBoundsException instead of a clean usage error.
      sys.exit(1)
    }
    val inFile = args(0)
    val outFile = args(1)
    val conf = new SparkConf().setAppName("TransFormXmlApp")
    val sc = new SparkContext(conf)
    try {
      val wholeFiles = sc.wholeTextFiles(inFile)

      // Replace hyphen with underscore
      val hyphenAsUnderScoreRule = relabelRule(_.replace("-", "_"))
      // Remove hyphen symbol (alternative rule; pass it to RuleTransformer instead to drop hyphens)
      val removeHyphenRule = relabelRule(_.replace("-", ""))

      val transformer = new RuleTransformer(hyphenAsUnderScoreRule)

      // Parsing and rewriting stay on the driver (as in the original), so the
      // rule objects never have to be shipped to executors. Nodes are rendered
      // to strings here, which is what saveAsTextFile would write anyway.
      val updatedDocs = wholeFiles.collect().toSeq.flatMap { case (_, content) =>
        transformer.transform(XML.loadString(content)).map(_.toString)
      }

      // Save exactly once. Calling saveAsTextFile per input file would fail on
      // the second file because the output directory already exists.
      sc.parallelize(updatedDocs).saveAsTextFile(outFile)
    } finally {
      // Always release the Spark context, even if the job fails.
      sc.stop()
    }
  }
}

Advertisements

One thought on “Spark XML – How to replace hyphen symbols found in XML elements”

  1. Pingback: Spark SQL + XML – How to escape column names with hyphen symbol | Bala's Blog

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s