Notes![what is notes.io? What is notes.io?](/theme/images/whatisnotesio.png)
![]() ![]() Notes - notes.io |
/**
 * Created by user on 29/12/14.
 */
import java.security.MessageDigest
import com.fasterxml.jackson.core.JsonParseException
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.{SparkConf, SparkContext}
import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.json4s.jackson.Serialization._
import org.apache.spark.SparkContext._
import scala.collection.immutable.Map
import org.apache.hadoop.io.{Text, LongWritable}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.broadcast.Broadcast
import ua_parser.{Client, Parser}
import org.apache.spark.rdd.RDD
import scala.util.matching.Regex
import Ordering.Implicits._
import Numeric.Implicits._
import scala.math._
import scala.collection.mutable.ListBuffer
import java.net.URL
import scala.util.control._
import scala.io.Source
import com.github.nscala_time.time.Imports._
import org.json4s.DefaultFormats
object Aggregate {
type Click = Map[String, String]
/**
 * Entry point of the enrichment job.
 *
 * Reads the raw click log as text, appends placeholder values for fields that
 * are missing from a record, expands every record into a flat [[Click]], and
 * then assigns each click a `visitorId`:
 *
 *  - pattern 5: the tracking ID belongs to a member seen with fewer than five
 *    tracking IDs, so the member ID is used as the visitor ID;
 *  - pattern 7: the tracking ID is one of 2-4 tracking IDs sharing the same
 *    IP address and user agent, so the IP address is used as the visitor ID;
 *  - otherwise the tracking ID itself is the visitor ID.
 *
 * When a tracking ID matches both patterns the lower pattern number (5) wins.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  val conf = new SparkConf().setAppName("Enricher").setMaster("local").set("spark.executor.memory", "5g")
  val sc = new SparkContext(conf)
  val unconvertedClicksLocation = "Test_files_smh/10-2014-smh-view-DCDS*"

  // Read as plain text and make sure every record carries the fields that are
  // read unconditionally further down. Blank lines are dropped because there is
  // no closing brace to append a placeholder field to.
  val completedLines = sc
    .textFile(unconvertedClicksLocation, 10)
    .filter(_.trim.nonEmpty)
    .map(addMissingFields)

  // expandRecord yields an empty map for records it cannot parse; those are
  // skipped here rather than failing later on a missing key.
  val clicks: RDD[Click] = completedLines
    .filter(line => !line.contains("00000000-0000-0000-0000-000000000000"))
    .map(expandRecord)
    .filter(_.nonEmpty)

  // Base sets for the pattern calculations.
  val memberTrackingIds = clicks
    .map(c => (c("memberId"), c("trackingId")))
    .filter { case (memberId, _) => memberId != "0" }
    .distinct()
  val trackingIdsByIpAndAgent = clicks
    .map(c => ((c("ipAddress"), c("userAgent")), c("trackingId")))
    .distinct()

  // 5. A member who is logged in on multiple devices:
  //    same member ID, 1-4 distinct tracking IDs.
  val pattern5 = memberTrackingIds
    .map { case (memberId, _) => (memberId, 1) }
    .reduceByKey(_ + _)
    .filter { case (_, trackingIdCount) => trackingIdCount < 5 }
    .join(memberTrackingIds)
    .map { case (_, (_, trackingId)) => (trackingId, 5) }
    .distinct()

  // 7. A user who clears cookies or uses private browsing while reading:
  //    2-4 distinct tracking IDs behind the same IP address and user agent.
  val pattern7 = trackingIdsByIpAndAgent
    .map { case (ipAndAgent, _) => (ipAndAgent, 1) }
    .reduceByKey(_ + _)
    .filter { case (_, trackingIdCount) => trackingIdCount >= 2 && trackingIdCount <= 4 }
    .join(trackingIdsByIpAndAgent)
    .map { case (_, (_, trackingId)) => (trackingId, 7) }
    .distinct()

  // One pattern per tracking ID; the smallest pattern number takes precedence.
  val patternByTrackingId = (pattern5 ++ pattern7).reduceByKey((a, b) => math.min(a, b))

  // Enrich the original data with the normalised referer and the final visitor ID.
  // NOTE(review): a click whose tracking ID is flagged with pattern 5 but which
  // itself carries the placeholder memberId "0" ends up with visitorId "0" —
  // confirm that this is intended.
  val enrichedClicks = clicks
    .map(click => click.updated("finalReferer", mapReferer(click("referer"), click("url"))))
    .map(click => (click("trackingId"), click))
    .leftOuterJoin(patternByTrackingId)
    .map { case (trackingId, (click, pattern)) =>
      val visitorId = pattern match {
        case Some(5) => click("memberId")
        case Some(7) => click("ipAddress")
        case _       => trackingId
      }
      click.updated("visitorId", visitorId)
    }

  enrichedClicks.take(5).foreach(println)

  // NOTE(review): this line used to save an RDD named `metrics` that is not
  // defined anywhere in this file. The enriched clicks are written as JSON
  // instead so the job compiles and produces output; confirm the intended
  // dataset and output location.
  enrichedClicks
    .map(serialise)
    .saveAsTextFile("s3n://ffx-analytics/mu-sigma-data/session_metrics", classOf[GzipCodec])
}

/**
 * Fields that get a placeholder value when absent, as (probe, field name).
 * The probe is the text searched for in the raw line to decide whether the
 * field is already present.
 */
private val DefaultedFields: Seq[(String, String)] = Seq(
  "memberId" -> "memberId",
  "userAgent" -> "userAgent",
  "\"referer\"" -> "referer",
  "\"url\"" -> "url"
)

/**
 * Appends `"field":"0"` for every defaulted field whose probe does not occur
 * in the raw line. The last character of the record (expected to be the
 * closing brace) is replaced by the new field followed by a closing brace.
 */
private def addMissingFields(line: String): String =
  DefaultedFields.foldLeft(line) { case (json, (probe, field)) =>
    if (line.contains(probe)) json
    else json.substring(0, json.length - 1) + ",\"" + field + "\":\"0\"}"
  }
/**
 * Referer substrings and the traffic source they identify, in priority order
 * (the first entry found in the referer wins).
 */
private val RefererSources: Seq[(String, String)] = Seq(
  "www.google." -> "GOOGLE",
  ".bing." -> "BING",
  ".yahoo." -> "YAHOO",
  ".drive." -> "DRIVE",
  "www.smh.com.au" -> "SMH",
  "www.theage.com.au" -> "THEAGE",
  "traffic.outbrain.com" -> "OUTBRAIN",
  "reddit.com" -> "REDDIT",
  "twitter.com" -> "TWITTER",
  "linkedin.com" -> "LINKEDIN",
  "facebook.com" -> "FACEBOOK"
)

/** Sites for which an empty referer counts as direct traffic. */
private val OwnSites: Seq[String] = Seq("www.smh.com.au", "www.theage.com.au")

/**
 * Classifies the traffic source of a click.
 *
 * Matching uses literal substrings rather than regular expressions: several of
 * the previous patterns began with a bare `*`, which is not a valid regular
 * expression and threw as soon as that branch was reached, and unescaped dots
 * matched any character.
 *
 * @param referer the referer recorded for the click
 * @param url     the URL that was viewed
 * @return "DIRECT" when the referer is "direct", or is empty and the URL is on
 *         one of the own sites; the label of the first matching entry of
 *         `RefererSources`; otherwise "referral"
 */
def mapReferer(referer: String, url: String): String = {
  val isDirect =
    referer == "direct" || (referer.isEmpty && OwnSites.exists(site => url.contains(site)))

  if (isDirect) "DIRECT"
  else
    RefererSources
      .collectFirst { case (needle, label) if referer.contains(needle) => label }
      .getOrElse("referral")
}
/**
 * Parses one JSON record into a map of its top-level fields, repairing the
 * record and retrying when the JSON parser rejects it.
 *
 * On a `JsonParseException` the character two columns before the reported
 * error column is inspected:
 *
 *  - `v`: the two characters starting one position earlier are replaced by the
 *    JSON escape for a vertical tab (U+000B);
 *  - `a`: likewise, replaced by the JSON escape for a bell (U+0007);
 *  - anything else (including a single quote): the character one position
 *    earlier is removed.
 *
 * NOTE(review): this presumably targets C-style escapes such as `\v`, `\a`
 * and `\'`, which JSON does not define — confirm against real input.
 *
 * @param jsonRecord a single JSON document
 * @return the parsed fields, or None when the record cannot be parsed or
 *         repaired
 */
def parseWithErrors(jsonRecord: String): Option[Map[String, JValue]] = {
  implicit val formats: Formats = DefaultFormats

  // Generous safety net so that a repair sequence which fails to converge
  // cannot keep a task busy forever.
  val maxRepairs = 8 * jsonRecord.length + 8

  // Produces the repaired record, or None when the reported error position
  // does not fall inside the record.
  def repair(record: String, error: JsonParseException): Option[String] = {
    val index = error.getLocation.getColumnNr - 2
    if (index < 0 || index >= record.length) None
    else {
      val repaired: String = record.charAt(index) match {
        // The replacement is JSON escape *text*; a raw control character is
        // itself illegal inside a JSON string.
        case 'v' => record.patch(index - 1, "\\u000B", 2)
        case 'a' => record.patch(index - 1, "\\u0007", 2)
        case _   => record.patch(index - 1, "", 1)
      }
      Some(repaired)
    }
  }

  @scala.annotation.tailrec
  def attempt(record: String, repairsLeft: Int): Option[Map[String, JValue]] = {
    // Left carries the repaired record to retry with (if any); Right the result.
    val outcome: Either[Option[String], Map[String, JValue]] =
      try Right(parse(record).extract[Map[String, JValue]])
      catch {
        case e: JsonParseException => Left(repair(record, e))
        case NonFatal(_)           => Left(None)
      }

    outcome match {
      case Right(fields)                           => Some(fields)
      case Left(Some(repaired)) if repairsLeft > 0 => attempt(repaired, repairsLeft - 1)
      case Left(_)                                 => None
    }
  }

  attempt(jsonRecord, maxRepairs)
}
/**
 * Expands one raw JSON record into a flat string-to-string map.
 *
 *  - String values are kept as they are, integers are converted with
 *    `toString`, and every other value is stored as its compact JSON text so
 *    that the result really contains only strings.
 *  - `trackingId` is replaced by the lower-case hex MD5 digest of its UTF-8
 *    bytes.
 *  - `extras`, when it is an array of objects with `name` and `value` keys, is
 *    removed and its pairs are merged into the top level; for duplicate names
 *    the last pair wins, and extras override top-level fields of the same name.
 *    Entries without a string `name` or without a `value` are ignored.
 *
 * Walking the parsed values by hand works around
 * https://github.com/json4s/json4s/issues/124.
 *
 * @param jsonRecord a single JSON document
 * @return the flattened record, or an empty map when the record cannot be
 *         parsed or has no `trackingId`
 */
def expandRecord(jsonRecord: String): Map[String, String] = {
  // String form of a single JSON value.
  def asText(value: JValue): String = value match {
    case JString(s) => s
    case JInt(i)    => i.toString
    case other      => compact(render(other))
  }

  // One {"name": ..., "value": ...} entry of the extras array as a key/value pair.
  def extraPair(entry: JValue): Option[(String, String)] = entry match {
    case JObject(fields) =>
      val byKey = fields.toMap
      for {
        name  <- byKey.get("name").collect { case JString(n) => n }
        value <- byKey.get("value")
      } yield name -> asText(value)
    case _ => None
  }

  // All usable pairs of the extras array; later entries override earlier ones.
  def extrasOf(value: JValue): Map[String, String] = value match {
    case JArray(entries) => entries.flatMap(entry => extraPair(entry).toList).toMap
    case _               => Map.empty[String, String]
  }

  val expanded = for {
    fields        <- parseWithErrors(jsonRecord)
    rawTrackingId <- fields.get("trackingId")
  } yield {
    // An explicit charset keeps the digest independent of the platform default.
    val trackingIdBytes = asText(rawTrackingId).getBytes(java.nio.charset.StandardCharsets.UTF_8)
    val hashedTrackingId =
      MessageDigest.getInstance("MD5").digest(trackingIdBytes).map("%02x".format(_)).mkString

    val plainFields = (fields - "extras").map { case (key, value) => key -> asText(value) }
    val extras = fields.get("extras").map(extrasOf).getOrElse(Map.empty[String, String])

    plainFields.updated("trackingId", hashedTrackingId) ++ extras
  }

  expanded.getOrElse(Map.empty[String, String])
}
/**
 * Renders a click as a JSON string using json4s' default formats.
 *
 * @param click the flattened click record
 * @return the JSON text produced by json4s `write`
 */
def serialise(click: Map[String, String]): String =
  write(click)(DefaultFormats)
}
![]() |
Notes is a web-based application for taking notes online. You can take notes and share them with other people. If you like taking long notes, notes.io is designed for you. To date, over 8,000,000,000 notes have been created, and the number keeps growing.
With notes.io;
- * You can take a note from anywhere and any device with internet connection.
- * You can share the notes in social platforms (YouTube, Facebook, Twitter, instagram etc.).
- * You can quickly share your contents without website, blog and e-mail.
- * You don't need to create any Account to share a note. As you wish you can use quick, easy and best shortened notes with sms, websites, e-mail, or messaging services (WhatsApp, iMessage, Telegram, Signal).
- * Notes.io has fabulous infrastructure design for a short link and allows you to share the note as an easy and understandable link.
Fast: Notes.io is built for speed and performance. You can take notes quickly and browse your archive.
Easy: Notes.io doesn’t require installation. Just write and share note!
Short: A Notes.io URL is just 8 characters long. You’ll get a shortened link to your note when you want to share it. (Ex: notes.io/q )
Free: Notes.io has been running for 14 years and has been free since the day it started.
You immediately create your first note and start sharing with the ones you wish. If you want to contact us, you can use the following communication channels;
Email: [email protected]
Twitter: http://twitter.com/notesio
Instagram: http://instagram.com/notes.io
Facebook: http://facebook.com/notesio
Regards;
Notes.io Team