Notes - What is notes.io?

Notes brand slogan

Notes - notes.io

/**
* Created by user on 29/12/14.
*/

import java.security.MessageDigest

import com.fasterxml.jackson.core.JsonParseException
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.{SparkConf, SparkContext}

import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.json4s.jackson.Serialization._

import org.apache.spark.SparkContext._
import scala.collection.immutable.Map

import org.apache.hadoop.io.{Text, LongWritable}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.broadcast.Broadcast
import ua_parser.{Client, Parser}
import org.apache.spark.rdd.RDD
import scala.util.matching.Regex
import Ordering.Implicits._
import Numeric.Implicits._
import scala.math._
import scala.collection.mutable.ListBuffer
import java.net.URL
import scala.util.control._
import scala.io.Source
import com.github.nscala_time.time.Imports._
import org.json4s.DefaultFormats



object Aggregate {
type Click = Map[String, String]

/**
 * Spark entry point: reads raw click records (one JSON object per line),
 * pads records that lack required keys, expands them into flat string maps,
 * and enriches every click with a `finalReferer` and a `visitorId`.
 *
 * The visitor id is chosen per tracking id:
 *  - pattern 5 (a member id seen with fewer than five tracking ids): the member id;
 *  - pattern 7 (an ip/user-agent pair seen with two to four tracking ids): the ip address;
 *  - otherwise the tracking id itself.
 * When a tracking id matches both patterns, the lower pattern number (5) wins.
 *
 * @param args command-line arguments (not used)
 */
def main(args: Array[String]): Unit = {

  val conf = new SparkConf().setAppName("Enricher").setMaster("local").set("spark.executor.memory", "5g")

  val sc = new SparkContext(conf)
  try {
    val UnconvertedClicksLocation = "Test_files_smh/10-2014-smh-view-DCDS*"

    // Key to append -> substring whose absence in the raw line marks the key as missing.
    // "referer" and "url" are searched for with their surrounding quotes.
    val requiredKeys = Seq(
      "memberId" -> "memberId",
      "userAgent" -> "userAgent",
      "referer" -> "\"referer\"",
      "url" -> "\"url\""
    )

    // Reading as a text file
    val read_text = sc.textFile(UnconvertedClicksLocation, 10)

    // Append every missing key with the placeholder value "0" so that later lookups succeed.
    // Presence is checked against the original line; the final '}' is dropped and re-added.
    val missing_col = read_text.map { line =>
      requiredKeys.foldLeft(line) { case (json, (key, needle)) =>
        if (line.indexOf(needle) == -1) json.dropRight(1) + ",\"" + key + "\":\"0\"}"
        else json
      }
    }

    val clicks: RDD[Click] = missing_col
      .filter(l => !l.contains("00000000-0000-0000-0000-000000000000"))
      .map(expandRecord(_))
      // expandRecord yields an empty map for records it cannot parse; drop those
      // instead of failing on the key lookups below.
      .filter(_.nonEmpty)

    // Base sets for the pattern calculations
    val memberid_trackingid = clicks
      .map(c => (c("memberId"), c("trackingId")))
      .filter { case (memberId, _) => memberId != "0" }
      .distinct
    val ip_ua_disttracking = clicks
      .map(c => (c("ipAddress"), c("userAgent"), c("trackingId")))
      .distinct
      .map { case (ip, ua, trackingId) => ((ip, ua), trackingId) }

    /* 5. A member who's logged in with multiple devices */
    /* same member id, 1-4 tracking ids */
    val pattern_5 = memberid_trackingid
      .map { case (memberId, _) => (memberId, 1) }
      .reduceByKey(_ + _)
      .filter { case (_, count) => count < 5 }
      .join(memberid_trackingid)
      .map { case (_, (_, trackingId)) => (trackingId, 5) }
      .distinct

    /* 7. A user clears his/her cookies or uses private browsing while reading */
    /* 2-4 tracking ids, same UAS, same IP */
    val ip_ua_count = ip_ua_disttracking
      .map { case (ipUa, _) => (ipUa, 1) }
      .reduceByKey(_ + _)
      .filter { case (_, count) => count >= 2 && count <= 4 }
    val pattern_7 = ip_ua_count
      .join(ip_ua_disttracking)
      .map { case (_, (_, trackingId)) => (trackingId, 7) }
      .distinct

    /* Apply the hierarchical structure to the identified tracking ids: when a
       tracking id matches several patterns keep the lowest pattern number. */
    val pattern_tids = pattern_5 ++ pattern_7
    val pattern_tids_final = pattern_tids.reduceByKey((a, b) => scala.math.min(a, b))

    // Enrich the original data with the mapped referer and the final visitor id.
    val enriched_clicks = clicks
      .map(c => c.updated("finalReferer", mapReferer(c("referer"), c("url"))))
      .map(c => (c("trackingId"), c))
      .leftOuterJoin(pattern_tids_final)
      .map { case (trackingId, (click, pattern)) =>
        val visitorId = pattern match {
          case Some(5) => click("memberId")
          case Some(7) => click("ipAddress")
          case _ => trackingId
        }
        click.updated("visitorId", visitorId)
      }

    enriched_clicks.take(5).foreach(println)

    // NOTE(review): the original ended with
    //   metrics.saveAsTextFile("s3n://ffx-analytics/mu-sigma-data/session_metrics", classOf[GzipCodec])
    // but no `metrics` value is defined anywhere in this object, so the statement
    // could not compile. Restore it once the session-metrics RDD is available here.
  } finally {
    sc.stop()
  }
}


/** Hosts of our own sites; an empty referer on one of these URLs counts as direct traffic. */
private val OwnSiteHosts: Seq[Regex] = Seq(
  """www\.smh\.com\.au""".r,
  """www\.theage\.com\.au""".r
)

/**
 * Ordered (pattern, label) table used to classify a referer; the first pattern
 * found anywhere in the referer wins. Patterns are compiled once rather than on
 * every call.
 */
private val RefererSources: Seq[(Regex, String)] = Seq(
  """www\.google\.""".r -> "GOOGLE",
  """\bbing\.""".r -> "BING",
  """\byahoo\.""".r -> "YAHOO",
  """\bdrive\.""".r -> "DRIVE",
  """www\.smh\.com\.au""".r -> "SMH",
  """www\.theage\.com\.au""".r -> "THEAGE",
  """traffic\.outbrain\.com""".r -> "OUTBRAIN",
  """reddit\.com""".r -> "REDDIT",
  """twitter\.com""".r -> "TWITTER",
  """linkedin\.com""".r -> "LINKEDIN",
  """facebook\.com""".r -> "FACEBOOK"
)

/**
 * Maps a raw referer (and the visited url) to a traffic-source label.
 *
 * The previous bing/yahoo/drive patterns began with a bare `*`, which is not a
 * valid regular expression and made the call throw for every referer that
 * reached them; they are now ordinary "contains" patterns with literal dots.
 *
 * @param referer the referer recorded for the click; "direct" or "" mean no referer
 * @param url     the url that was visited
 * @return "DIRECT", one of the source labels in `RefererSources`, or "referral"
 *         when nothing matches
 */
def mapReferer(referer:String, url:String):String = {
  def mentions(pattern: Regex, text: String): Boolean = pattern.findFirstIn(text).isDefined

  if (referer == "direct") "DIRECT"
  else if (referer == "" && OwnSiteHosts.exists(mentions(_, url))) "DIRECT"
  else RefererSources
    .collectFirst { case (pattern, label) if mentions(pattern, referer) => label }
    .getOrElse("referral")
}



/**
 * Parses one JSON record into a map of raw json4s values, repairing the record
 * and retrying when the parser rejects it.
 *
 * On a `JsonParseException` the character just before the reported column is
 * inspected (presumably the character following an invalid backslash — confirm
 * against real data):
 *  - `'`  : the preceding character is removed;
 *  - `v`  : the two-character sequence is replaced by the JSON escape text `\u0007`;
 *  - `a`  : the two-character sequence is replaced by the JSON escape text `\u0013`;
 *  - else : the preceding character is removed.
 * The repaired record is then parsed again.
 *
 * @param jsonRecord a single JSON object as text
 * @return the parsed top-level fields, or `None` when the record cannot be
 *         parsed or the reported error position lies outside the record
 */
def parseWithErrors(jsonRecord:String):Option[Map[String,JValue]] = {

  implicit val formats: Formats = DefaultFormats

  try {
    Some(parse(jsonRecord).extract[Map[String,JValue]])
  } catch {
    case e: JsonParseException =>
      val index = e.getLocation.getColumnNr - 2

      // An exception thrown here would escape the catch block entirely, so an
      // out-of-range position is treated as "cannot repair".
      if (index < 0 || index >= jsonRecord.length) None
      else {
        val repaired = jsonRecord.charAt(index) match {
          case '\'' => jsonRecord.patch(index - 1, "", 1)
          case 'v' => jsonRecord.patch(index - 1, "\\u0007", 2)
          case 'a' => jsonRecord.patch(index - 1, "\\u0013", 2)
          case _ => jsonRecord.patch(index - 1, "", 1)
        }
        parseWithErrors(repaired)
      }
    // Any other non-fatal failure (e.g. extraction errors) means the record is unusable.
    case NonFatal(_) => None
  }
}

/**
 * Expands one JSON click record into a flat map of strings.
 *
 *  - Every top-level field except "extras" is converted to text: JSON strings
 *    and integers become their plain text, any other value becomes its compact
 *    JSON rendering. (Values are converted by hand to circumvent
 *    https://github.com/json4s/json4s/issues/124.)
 *  - "trackingId" is replaced by the lower-case hex MD5 of its UTF-8 bytes.
 *  - "extras", when it is an array of objects carrying "name" and "value"
 *    fields, is flattened into name -> value entries which are merged last, so
 *    they override top-level fields of the same name. Array elements without
 *    both fields are skipped.
 *
 * @param jsonRecord a single JSON object as text
 * @return the flattened record, or an empty map when the record cannot be
 *         parsed or has no "trackingId"
 */
def expandRecord(jsonRecord:String):Map[String,String] = {

  // Text form of a single JSON value.
  def asText(value: JValue): String = value match {
    case JString(s) => s
    case JInt(i) => i.toString
    case other => compact(render(other))
  }

  // Turns [{"name": n, "value": v}, ...] into Map(n -> v, ...); later pairs win.
  def flattenExtras(value: JValue): Map[String, String] = value match {
    case JArray(items) =>
      items.foldLeft(Map.empty[String, String]) {
        case (acc, pair: JObject) =>
          val fields = pair.obj.toMap
          (fields.get("name"), fields.get("value")) match {
            case (Some(n), Some(v)) => acc + (asText(n) -> asText(v))
            case _ => acc
          }
        case (acc, _) => acc
      }
    case _ => Map.empty[String, String]
  }

  val expanded = for {
    fields <- parseWithErrors(jsonRecord)
    rawId <- fields.get("trackingId")
  } yield {
    val md5 = MessageDigest.getInstance("MD5")
    // Fixed charset so the hash does not depend on the platform default encoding.
    val hashedId = md5.digest(asText(rawId).getBytes("UTF-8")).map("%02x".format(_)).mkString
    val base = (fields - "extras")
      .map { case (key, value) => key -> asText(value) }
      .updated("trackingId", hashedId)
    val extras = fields.get("extras").map(flattenExtras).getOrElse(Map.empty[String, String])
    base ++ extras
  }

  expanded.getOrElse(Map.empty[String, String])
}

/**
 * Renders a click as JSON text using json4s serialization with the default formats.
 *
 * @param click the flat click record
 * @return the JSON representation of `click`
 */
def serialise(click:Map[String,String]):String =
  write(click)(DefaultFormats)
}
     
 
what is notes.io
 

Notes.io is a web-based application for taking notes. You can take notes and share them with other people. If you like taking long notes, notes.io is designed for you. To date, over 8,000,000,000 notes have been created, and the number keeps growing...

With notes.io;

  • * You can take a note from anywhere and any device with internet connection.
  • * You can share the notes in social platforms (YouTube, Facebook, Twitter, instagram etc.).
  • * You can quickly share your contents without website, blog and e-mail.
  • * You don't need to create an account to share a note. If you wish, you can share quick, easy, shortened note links via SMS, websites, e-mail, or messaging services (WhatsApp, iMessage, Telegram, Signal).
  • * Notes.io has fabulous infrastructure design for a short link and allows you to share the note as an easy and understandable link.

Fast: Notes.io is built for speed and performance. You can take notes quickly and browse your archive.

Easy: Notes.io doesn’t require installation. Just write and share note!

Short: A Notes.io URL is just 8 characters long. You’ll get a shortened link to your note when you want to share it. (Ex: notes.io/q )

Free: Notes.io has been running for 12 years and has been free since the day it started.


You can create your first note right away and start sharing it with whomever you wish. If you want to contact us, you can use the following communication channels:


Email: [email protected]

Twitter: http://twitter.com/notesio

Instagram: http://instagram.com/notes.io

Facebook: http://facebook.com/notesio



Regards;
Notes.io Team

     
 
Shortened Note Link
 
 
Loading Image
 
     
 
Long File
 
 

Notes larger than 18KB cannot be shortened.

To get below 18KB, please trim your notes, or sign in.