import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.sql.types._ 
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions._


/** Spark batch job that computes an average temperature per weather station
  * and joins it with the station metadata.
  *
  * Pipeline:
  *   1. Read daily measurements (station id, date, measurement type, value) from CSV.
  *   2. Pair each station/day's TMAX and TMIN rows and derive a daily average.
  *   3. Average the daily values per station.
  *   4. Parse the fixed-width station file (id, latitude, longitude, name).
  *   5. Join both results and write them out as CSV.
  */
object CDATA {

  // Locations used by the job; values are unchanged from the original inline literals.
  private val MeasurementsPath = "/home/lab/myprj/data/sample.csv"
  private val StationsPath = "/home/lab/myprj/data/stations.txt"
  private val OutputPath = "file:///home/lab/myprj/data/final.txt"

  /** Entry point. Runs the whole pipeline against the hard-coded paths above.
    *
    * @param args command-line arguments; currently not used
    */
  def main(args: Array[String]): Unit = {

    // NOTE(review): the master is fixed to local mode here, which takes precedence over
    // a master supplied at submit time - confirm that is intended before running on a cluster.
    val spark = SparkSession.builder().master("local[*]").appName("Weather Data").getOrCreate()

    // Everything that can fail runs inside try so the session is always stopped,
    // even when reading, parsing or writing throws.
    try {
      spark.sparkContext.setLogLevel("WARN")

      // Schema applied to the measurements CSV; dates are parsed with the yyyyMMdd pattern.
      val measurementSchema = StructType(Array(
        StructField("sid", StringType),
        StructField("mdate", DateType),
        StructField("mtype", StringType),
        StructField("mvalue", DoubleType)
      ))

      val measurements = spark.read
        .schema(measurementSchema)
        .option("dateFormat", "yyyyMMdd")
        .csv(MeasurementsPath)
      measurements.createOrReplaceTempView("tempTable")

      // Inner join of the TMAX and TMIN rows on (sid, mdate): only station/days that have
      // both readings produce a row. The sum is divided by 20, i.e. the mean of the two
      // readings divided by a further 10 - presumably because values are stored in tenths
      // of a degree; TODO confirm against the data set.
      // String literals use single quotes (standard SQL) so they can never be parsed as
      // identifiers, whatever the session's quoting configuration is.
      val dailyAvgTemp = spark.sql(
        """SELECT sid, mdate, (maxTemp + minTemp) / 20 AS avgTemp
          |FROM (SELECT sid, mdate, mvalue AS maxTemp FROM tempTable WHERE mtype = 'TMAX')
          |JOIN (SELECT sid, mdate, mvalue AS minTemp FROM tempTable WHERE mtype = 'TMIN')
          |USING (sid, mdate)""".stripMargin)
      dailyAvgTemp.createOrReplaceTempView("dailyAvgTempTable")

      // Average of the daily values over every date present for a station.
      val avgTempPerStation = spark.sql(
        """SELECT sid, AVG(avgTemp) AS yearAvgTemp
          |FROM dailyAvgTempTable
          |GROUP BY sid""".stripMargin)
      avgTempPerStation.createOrReplaceTempView("avgTempPerStationTable")

      val stationSchema = StructType(Array(
        StructField("sid", StringType),
        StructField("slat", DoubleType),
        StructField("slon", DoubleType),
        StructField("sname", StringType)
      ))

      // Fixed-width station records, sliced by character offset:
      //   [0,11) id, [12,20) latitude, [21,30) longitude, [41,71) name.
      // The name keeps whatever padding the slice contains. A line shorter than
      // 71 characters, or a non-numeric coordinate slice, makes the job fail.
      val stationRows = spark.sparkContext.textFile(StationsPath).map { line =>
        val sid = line.substring(0, 11)
        val slat = line.substring(12, 20).toDouble
        val slon = line.substring(21, 30).toDouble
        val sname = line.substring(41, 71)
        Row(sid, slat, slon, sname)
      }

      val stations = spark.createDataFrame(stationRows, stationSchema)
      stations.createOrReplaceTempView("stationsTable")

      // Explicit inner join on station id (same result as the former comma join + WHERE):
      // stations without a computed average, and averages without a station record,
      // are left out.
      val finalData = spark.sql(
        """SELECT s.sid, slat, slon, yearAvgTemp, sname
          |FROM avgTempPerStationTable a
          |JOIN stationsTable s ON s.sid = a.sid""".stripMargin)

      // Written with the writer's default save mode, so the call fails if the
      // output path already exists. Spark creates a directory of part files at this path.
      finalData.write.format("csv").save(OutputPath)
    } finally {
      spark.stop()
    }
  }
}

