import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.sql.types._ 
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions._



/**
 * Spark batch job that joins per-station temperature measurements with station
 * metadata and prints the result.
 *
 * Pipeline:
 *  1. Read the measurements CSV with columns (sid, mdate, mtype, mvalue).
 *  2. Split rows into TMAX and TMIN readings and inner-join them on (sid, mdate).
 *  3. Derive a per-day `avgTemp` and average it per station.
 *  4. Parse the fixed-width stations file into (sid, slat, slon, sname).
 *  5. Inner-join both on `sid` and show the result.
 */
object WDATA {

  /**
   * Program entry point.
   *
   * @param args command-line arguments; not read by this job (input paths are hard-coded)
   */
  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().master("local[*]").appName("Weather Data").getOrCreate()

    import spark.implicits._

    // Everything that can fail runs inside try/finally so the session is always
    // stopped, even when reading or parsing the input throws.
    try {
      spark.sparkContext.setLogLevel("WARN")

      val myschema = StructType(Array(
        StructField("sid", StringType),
        StructField("mdate", DateType),
        StructField("mtype", StringType),
        StructField("mvalue", DoubleType)
      ))

      val tempDF = spark.read
        .schema(myschema)
        .option("dateFormat", "yyyyMMdd")
        .csv("/home/lab/myprj/data/sample.csv")

      // One frame per measurement type, with the value column renamed so both
      // can sit side by side after the join.
      val dmax = tempDF.filter($"mtype" === "TMAX").drop("mtype").withColumnRenamed("mvalue", "maxTemp")
      val dmin = tempDF.filter($"mtype" === "TMIN").drop("mtype").withColumnRenamed("mvalue", "minTemp")

      // Inner join: only (station, day) pairs that have both a TMAX and a TMIN row survive.
      val maxAndMinTemp = dmax.join(dmin, Seq("sid", "mdate"))

      // Alias the expression directly instead of renaming Spark's auto-generated
      // column name, which would silently do nothing if the generated name differed.
      // NOTE(review): the divisor 20 is presumably 2 (mean of two readings) times 10
      // (values stored in tenths of a degree) -- confirm against the data set's units.
      val dailyAvgTemp = maxAndMinTemp.select(
        $"sid",
        $"mdate",
        (($"maxTemp" + $"minTemp") / 20).as("avgTemp")
      )

      val avgTempPerStation = dailyAvgTemp.groupBy($"sid").agg(avg($"avgTemp"))

      val stationschema = StructType(Array(
        StructField("sid", StringType),
        StructField("slat", DoubleType),
        StructField("slon", DoubleType),
        StructField("sname", StringType)
      ))

      // Fixed-width parsing by character offsets. A line shorter than 71 characters
      // makes substring throw, and a non-numeric latitude/longitude field makes
      // toDouble throw; either failure aborts the job when the RDD is evaluated.
      // NOTE(review): offsets presumably follow the stations file's published
      // column layout -- verify against that file's format description.
      val stationRDD = spark.sparkContext.textFile("/home/lab/myprj/data/stations.txt").map { line =>
        val sid = line.substring(0, 11)
        val slat = line.substring(12, 20).toDouble
        val slon = line.substring(21, 30).toDouble
        val sname = line.substring(41, 71)
        Row(sid, slat, slon, sname)
      }
      val stationsDF = spark.createDataFrame(stationRDD, stationschema)

      // Inner join: stations without temperature data (and vice versa) are dropped.
      val finalData = avgTempPerStation.join(stationsDF, "sid")
      finalData.show()
    } finally {
      spark.stop()
    }
  }
}
  

