# Title:  Peer Group Analysis Demo
# Author: David Weston, Birkbeck College, University of London
# Date:   January 2013
#
# This software is provided "as is" and licenced under GPL2, see
# http://www.r-project.org/Licenses/GPL-2
#
# If you use this software we ask that you include the following citation:
#   Weston DJ, Hand DJ, Adams NM, Whitrow C and Juszczak P. Plastic card fraud
#   detection using peer group analysis. Advances in Data Analysis and
#   Classification, 2(1), (2008), 45-62

# PeerGroupAnalysis.R and SyntheticData.R must be sourced before the rest of
# this script runs. Edit the two paths below so they point at your copies.
source('C:/PeerGroup/PeerGroupAnalysis.R')
source('C:/PeerGroup/SyntheticData.R')

# Size of the synthetic data set.
m <- 100  # number of time series
n <- 20   # length of each time series
p <- 5    # dimension of each observation (number of features)

# x is a 3-dimensional array of size m x n x p holding m time-aligned
# multivariate time series; each series has length n and each observation has
# dimension p. The series follow one of two peer groups: the first half belong
# to one peer group, the remainder to the other.
x <- GenerateSyntheticData(m, n, p, display.graphs = FALSE)

# From time t.start onwards the first time series is pushed away from its peer
# group by adding the output of pD.Random.Walk to every remaining observation.
t.start <- 11
x[1, t.start:n, ] <- x[1, t.start:n, ] +
  pD.Random.Walk(matrix(0, nrow = 1, ncol = p), n - t.start + 1, 0.5)

# Display the first dimension only, showing the first time series and all
# other members of its peer group.
DisplayPeerTimeSeries(x, m, n)

# Split the data in time: the first build.time observations of every series
# are used to build the peer groups, the rest are monitored.
build.time <- 4
training.data <- x[, seq_len(build.time), , drop = FALSE]
test.data <- x[, (build.time + 1):n, , drop = FALSE]

##### Peer Group Analysis #####

# Build Peer Groups.
# A user-supplied function for measuring time series similarity can optionally
# be passed as a third argument, e.g.
#   peer.groups <- BuildPeerGroups(training.data, peer.group.size, EucSim)
# which uses function EucSim to measure time series similarity.
# Largest peer group size for the synthetic data: (m / 2) - 1.
# Derived from m rather than hard-coded so it stays correct if m is changed
# (49 for m = 100). Assumes m is even -- TODO confirm how the generator splits
# an odd number of series.
peer.group.size <- (m / 2) - 1

# Build the peer groups from the training portion of the data.
peer.groups <- BuildPeerGroups(training.data, peer.group.size)

# Monitor time series using peer groups.
# Can optionally use your own function for measuring separation from the peer
# group, e.g.
#   separation <- PeerMonitor(test.data, peer.groups, PeerMahalDist)
# which uses function PeerMahalDist to measure separation from the peer group.
separation <- PeerMonitor(test.data, peer.groups)

# Simple outlier detector for Mahalanobis distances.
# Change this to better suit your needs.
# NOTE(review): dchisq() is the chi-squared *density*, which is small in the
# upper tail but also close to 0 for separations near 0 when df > 2. If an
# upper-tail probability is what is wanted, use
#   pchisq(separation, df = p, lower.tail = FALSE)
# instead -- confirm against what PeerMonitor returns before changing.
calibrated.separation <- dchisq(separation, df = p)
outlier.threshold <- 0.0001  # Set your own threshold
outliers <- calibrated.separation < outlier.threshold

# Show frequency table of outliers. Each factor level is a time series ID.
# print() is explicit so the table also appears when this script is run via
# source(), which does not auto-print top-level values by default.
print(table(factor(which(outliers, arr.ind = TRUE)[, 1])))
# At best, we should see this:
#  1
# 10
# (Which means only time series 1 has outliers, and they have occurred at each
# of the 10 time points.)

# Optional: repeat the monitoring but ignore a peer group member at time t if
# it is an outlier at time t.
active.in.peer.groups <- !outliers

# Append this information to the data, i.e. we have introduced a new feature.
# This feature is called "use.as.peer" to distinguish it from the others, see
# function 'ActivePeerMahalDist' for more details.
# Build a copy of the test data with one extra slot along the feature
# dimension. The first two extents are taken from test.data itself so the
# array always matches the data copied into it below.
new.test.data <- array(0, dim = c(dim(test.data)[1:2], p + 1))

# Reuse the dimnames of test.data, adding the name "use.as.peer" for the new
# feature so it can be addressed by name.
# NOTE(review): this relies on test.data carrying dimnames with a component
# named "observation feature" -- confirm against GenerateSyntheticData.
names.array <- dimnames(test.data)
names.array$`observation feature` <-
  c(names.array$`observation feature`, "use.as.peer")
dimnames(new.test.data) <- names.array

# Append the data: the original p features first, then the activity flag.
# The logical flag is stored in a numeric array, so it is coerced to 0/1.
new.test.data[, , seq_len(p)] <- test.data
new.test.data[, , "use.as.peer"] <- active.in.peer.groups

# Note the use of another separation function, 'ActivePeerMahalDist'.
separation <- PeerMonitor(new.test.data, peer.groups, ActivePeerMahalDist)

# Outlier detector: chi-squared density of the separation compared against
# outlier.threshold, with df = p (the flag feature is not counted).
calibrated.separation <- dchisq(separation, df = p)
outliers <- calibrated.separation < outlier.threshold

# Show summary of outliers. print() is explicit so the table also appears
# when this script is run via source().
print(table(factor(which(outliers, arr.ind = TRUE)[, 1])))