/*  SEQUENTIAL */


/*  This program computes a variety of basic sequential analysis statistics.
    The data are assumed to be a series of integer codes with values ranging
    from "1" to what ever value the user specifies in the "ncodes" 
    computation at the start of the program. */


options nocenter nodate nonumber  linesize=90;  title;

proc iml;

/* Enter the number of possible code values here. */
ncodes = 6;

/* Enter labels for the codes (5 characters maximum), if desired, here. */
labels={Code1 Code2 Code3 Code4   Code5  Code6  Code7 
        Code8 Code9 Code10 Code11 Code12 Code13 Code14 Code15};

/* Enter the lag number for the analyses here. */
lag = 1;

/* Can adjacent events be coded the same? 
   Enter "0" if adjacent events can NEVER be the same;
   Enter "1" if adjacent events can ALWAYS be the same;
   Enter "2" if some adjacent events can, and others cannot, be the same; then
                   scroll down and enter the appropriate "ONEZERO" matrix for your data. */
adjacent = 1;

/* Enter "1" for one-tailed tests; enter "2" for two-tailed tests. */
tailed = 2;

/* Do you want to run permutation tests of significance?  "1"=yes, "0"=no;
   Warning: these computations are time-consuming */
permtest = 0;

/*  Enter the number of desired permutations per block here. */
nperms = 10;

/* Enter the number of desired blocks of permutations here. */
nblocks = 3;

/* Enter "95" for 95% confidence intervals;
  enter  "99" for 99% confidence intervals */
confid = 95;

/* Set 'seed' to a non-negative integer; 1953125 is good. */
seed = 1953125;

/* Enter/read the data into a matrix with the name "data",
   as in the following example */


data = {
3,5,3,4,4,6,3,4,4,1,6,3,6,6,1,6,2,4,3,4,3,4,2,6,6,3,6,3,6,3,4,3,5,3,
4,2,5,3,4,2,6,3,4,2,5,3,6,2,6,2,6,3,3,6,3,5,3,6,6,3,3,6,2,6,2,6,3,3,
6,3,6,3,3,6,6,2,6,2,6,2,6,3,3,5,3,6,2,6,6,3,3,5,3,5,3,3,5,3,3,6,6,2,
6,2,6,2,6,6,2,3,6,2,6,2,6,1,6,2,6,2,6,1};


/* transitional frequency matrix */
freqs = j(ncodes,ncodes,0);
do c = 1 to nrow(data);
if ( c + lag <= nrow(data) )   then
freqs[(data[c,1]),(data[(c+lag),1])] = 
freqs[(data[c,1]),(data[(c+lag),1])]  + 1;
end;

/* Warning message for specification error. */
if ( adjacent=0 & any(diag(freqs)) = 1)   then do;
print,"WARNING";
print"You have indicated that consecutive or adjacent codes may not repeat,";
print"(adjacent = 0), yet repeating codes have been found in the data. See the";
print"main diagonal of the frequency matrix.  This specification error will ";
print"result in faulty computations for LRX2, z-values, and adjusted residuals.";
end;

/* row sums module */
start rsum(matname);
rsums =j(nrow(matname),1);
do rows = 1 to nrow(matname);
dumr = matname[rows,];
rsums[rows,1]=sum(dumr);
end;
return(rsums);
finish;

/* collumn sums module */
start csum(matname);
csums =j(1,ncol(matname));
do cols = 1 to ncol(matname);
dumc = matname[,cols];
csums[1,cols]=sum(dumc);
end;
return(csums);
finish;

/* initializing. */
lrx2t    = {0};
rowtots = rsum(freqs);
coltots = csum(freqs);
ntrans   = sum(rowtots);
prows    = rowtots / ntrans;
pcols    = coltots / ntrans;
p        = j(ncodes,ncodes,-9999);
et       = j(ncodes,ncodes,-9999);
expfreq  = j(ncodes,ncodes,-9999);
zadjres  = j(ncodes,ncodes,-9999);
pzadjres = j(ncodes,ncodes,1);
yulesq   = j(ncodes,ncodes,-9999);
var      = j(ncodes,ncodes,-9999);
min      = j(ncodes,ncodes,-9999);
kappa    = j(ncodes,ncodes,-9999);
zkappa   = j(ncodes,ncodes,-9999);
pzkappa  = j(ncodes,ncodes,1);
signs    = j(ncodes,ncodes,0);
n = ntrans + 1;
nr  = rowtots;
nr[data[n,1]] = nr[data[n,1]] + 1;
pnr  = nr / n;


do i = 1 to ncodes;
do j = 1 to ncodes;

/* Note: more refined computations for when adjacent codes cannot
   repeat appear below, after the above 2 loops are completed. */
if (adjacent = 0 & (ntrans - rowtots[i,1]) > 0 )   then;
pcols[1,j] = coltots[1,j] / (ntrans - rowtots[i,1] );
if (adjacent = 0 & (ntrans - rowtots[j,1]) > 0 )   then;
expfreq[i,j] = ( rowtots[i,1] * coltots[1,j]) / (ntrans - rowtots[j,1]);
if (adjacent = 0 & (n - nr[i,1]) > 0)   then;
et[i,j]  = (nr[i,1] * nr[j,1]) / (n - nr[i,1]) ;

if (adjacent = 1)   then do;
et[i,j]  =    (nr[i,1] * nr[j,1]) / n;
expfreq[i,j] = ( rowtots[i,1] * coltots[1,j]) / ntrans ;
end;

/* transitional probabilities */
if (rowtots[i,1] > 0)   then;
p[i,j] = freqs[i,j] / rowtots[i,1] ;

/* tablewise LRX2 */
if ( freqs[i,j] > 0 & expfreq[i,j] > 0)   then;
lrx2t = lrx2t + 2 * (freqs[i,j] * log(freqs[i,j]/expfreq[i,j]));

/* adjusted residuals (z values) & sig levels */
if ( (expfreq[i,j]*(1-pcols[1,j])*(1-prows[i,1])) > 0)   then do;
zadjres[i,j]=(freqs[i,j]-expfreq[i,j])
  /sqrt( expfreq[i,j]*(1-pcols[1,j]) * (1-prows[i,1]) );
pzadjres[i,j] = (1 - probnorm(abs(zadjres[i,j])) ) * tailed;
end;

/* Yule's Q */
a = freqs[i,j];
b = rowtots[i,1] - freqs[i,j];
c = coltots[1,j] - freqs[i,j];
d = ntrans - rowtots[i,1] - coltots[1,j] + freqs[i,j];
if ( (a*d + b*c) > 0 )   then;
yulesq[i,j] = ( a*d - b*c ) / (a*d + b*c);

/* kappas, z values & sig levels */
var[i,j]=(nr[i,1]*nr[j,1]*(n-nr[j,1])*(n-nr[i,1]))/(n##2*(n-1));
if (var[i,j] > 0)   then do;
zkappa[i,j]  = ( freqs[i,j] - et[i,j] ) / sqrt(var[i,j]);
if ( nr[i,1] <= nr[j,1] )    then  min[i,j] = nr[i,1];
else                               min[i,j] = nr[j,1];
kappa[i,j] = ( freqs[i,j] - et[i,j] ) / ( min[i,j] - et[i,j] );
if ( kappa[i,j] < 0 )    then do;
kappa[i,j] = ( freqs[i,j] - et[i,j] ) /  et[i,j];
pzkappa[i,j] = (1 - probnorm(abs(zkappa[i,j]))) * tailed;
end;
end;

/* signs */
if ( freqs[i,j] > expfreq[i,j] )    then  signs[i,j] =  1;
if ( freqs[i,j] < expfreq[i,j] )    then  signs[i,j] = -1;

end;
end;



if (adjacent = 0 | adjacent = 2) then do;

/* maximum likelihood estimation of the expected cell frequencies using
   iterative proportional fitting (Wickens, 1989, pp. 107-112). */

rsumsf = rsum(freqs);
csumsf = csum(freqs);

onezero = j(ncodes,ncodes,1);
do i = 1 to ncodes;
do j = 1 to ncodes;
if (i = j) then  onezero[i,j] = 0;
end;
end;

/* the 6 previous commands create a matrix of ones and zeros that
   is used in estimating the expected cell frequencies.  A "1" indicates
   that the expected frequency for a given cell is to be estimated, 
   whereas a "0" indicates that the expected frequency for the cell
   should NOT be estimated, typically because it is a structural zero
   (codes that cannot follow one another).  By default, the matrix that
   is created by the above commands has zeros along the main diagonal,
   and ones everywhere else, which will be appropriate for most data 
   sets.  However, if your data happen to involve structural zeros
   that occur in cells other than the cells along the main diagonal,
   then you must create a ONEZERO matrix with ones and zeros that is
   appropriate for your data before running any of the commands 
   below.  Enter your ONEZERO matrix now. */


expfreq = onezero;

rdiffs = rsumsf - rsum(expfreq);
cdiffs = csumsf - csum(expfreq);

do ipfloop=1 to 100 until ( max(rdiffs) < .0001 & max(cdiffs) < .0001);

/* adjusting by row. */
xr = j(ncodes,1,0);
rsumse = rsum(expfreq);
do r = 1 to ncodes;
if (rsumse[r,1] > 0) then xr[r,1] = rsumsf[r,1] / rsumse[r,1];
end;
do i = 1 to ncodes;
do j = 1 to ncodes;
if ( onezero[i,j] = 1 ) then expfreq[i,j] = expfreq[i,j] * xr[i,1];
end;
end;

/* adjusting by collumn. */
xc = j(1,ncodes,0);
csumse = csum(expfreq);
do c = 1 to ncodes;
if (csumse[1,c] > 0)  then xc[1,c] = csumsf[1,c] / csumse[1,c];
end;
do i = 1 to ncodes;
do j = 1 to ncodes;
if ( onezero[i,j] = 1 ) then expfreq[i,j] = expfreq[i,j] * xc[1,j];
end;
end;

rdiffs = rsumsf - rsum(expfreq);
cdiffs = csumsf - csum(expfreq);
end;

print, "Maximum likelihood estimation of the expected cell";
print "frequencies using iterative proportional fitting";
if ( max(rdiffs) < .0001 & max(cdiffs) < .0001) then;
print "converged after the following number of interations:", ipfloop;
else 
print "did NOT converge after the following number of interations:",ipfloop;


/* tablewise LRX2 */
lrx2t    = {0};
do i = 1 to ncodes;
do j = 1 to ncodes;
if ( freqs[i,j] > 0 &  expfreq[i,j] > 0) then;
lrx2t = lrx2t + 2 * (freqs[i,j] * log(freqs[i,j]/expfreq[i,j]));
end;
end;

/* adjusted residuals for matrices with structural zeros (Christensen, 1997, p. 357) */

/* constructing the design matrix */
x = j(ncodes**2,1,1);
y = j(ncodes**2,ncodes-1,0);
z = j(ncodes**2,ncodes-1,0);
do i = 1 to ncodes - 1;
do j = 1 to ncodes;
y[i*ncodes+j,i] = 1;
z[ (((j-1)*ncodes)+(i+1)),i] = 1;
end;
end;
des1 = ( x || y || z );

/* pruning values corresponding to cells with structural zeros */
onezero2 = shape(onezero,ncodes**2,1);
dm1  = shape(expfreq,ncodes**2,1);
dm2  = j(1,1,-9999);
des2 = j(1,ncol(des1),-9999);
do pp = 1 to ncodes**2;
if (onezero2[pp,1] = 1) then do;
dm2  = ( dm2  // dm1 [pp,1] );
des2 = ( des2 // des1[pp,] );
end;
end;
dm2  = dm2[2:nrow(dm2),1];
des2 = des2[2:nrow(des2),];

dm2 = diag(dm2);
if ( det(t(des2) * dm2 * des2) ^= 0) then do;
a = des2 * (inv(t(des2) * dm2 * des2)) * t(des2) * dm2;
acounter = 1;
do i = 1 to ncodes;
do j = 1 to ncodes;
if ( onezero[i,j] ^= 0) then do;
zadjres[i,j] = ( freqs[i,j] - expfreq[i,j] ) /
                 sqrt( expfreq[i,j] * (1 - a[acounter,acounter]) );
acounter = acounter + 1;
end;
end;
end;
end;
if ( det(t(des2) * dm2 * des2) = 0) then do;
reset spaces = 2;
print, "A nonsingular matrix has been identified, which means";
print "that proper adjusted residuals cannot be computed for";
print "this data, probably because there are no values for";
print "one or more codes.  Try recoding using sequential";
print "integers, and redo the analyses.  The adjusted";
print "residuals that are printed below are based on";
print "equation 5 from Bakeman & Quera (1995, p. 274), and";
print "are close approximations to the proper values.";
print "The procedures recommended by Bakeman & Quera";
print "(1995, p. 276), Haberman (1979), and Christensen";
print "(1997) cannot be conducted with nonsingular matrices.";
end;

do i = 1 to ncodes;
do j = 1 to ncodes;
if ( onezero[i,j] = 0) then do;
zadjres [i,j] = 0;
yulesq  [i,j] = 0;
kappa   [i,j] = 0;
zkappa  [i,j] = 0;
pzadjres[i,j] = 1;
pzkappa [i,j] = 1;
end;
end;
end;

end;


print,, "Requested 'tail' (1 or 2) for Significance Tests =", tailed;
b = labels[1,1:ncodes];
bb = ( b || {Totals});
transf = ( (freqs || rowtots) // (coltots || ntrans) ); 
print , "Cell Frequencies, Row & Collumn Totals, & N" ,
         transf [rowname=bb colname=bb format=7.0];
if (adjacent = 0 | adjacent = 2) then do;
print, "The processed ONEZERO matrix appears below.";
print, "In the ONEZERO matrix, a 0 indicates a structural";
print, "zero, and a 1 indicates that an expected cell";
print, "frequency will be estimated.";
print, "ONEZERO matrix:", onezero[rowname=b colname=b format=5.4];
end;
print , "Expected Values/Frequencies", expfreq[rowname=b colname=b format=5.4];
print , "Transitional Probabilities" , p[rowname=b colname=b format=5.4];
if (adjacent = 1)    then  df = (ncodes - 1)##2;
else                       df = (ncodes - 1)##2 - (ncodes**2 - sum(onezero));
plrx2t = 1 -  probchi(abs(lrx2t),df);
lrx =  (lrx2t || df || plrx2t);
print, "Tablewise Likelihood Ratio (Chi-Square) Test",
        lrx[colname={LRX2 df sig} format=12.4];
print, "Adjusted Residuals", zadjres[rowname=b colname=b format=6.3];
print, "Significance Levels for the Adjusted Residuals",
        pzadjres[rowname=b colname=b format=5.4];
print, "Yule's Q Values", yulesq[rowname=b colname=b format=6.3];
print, "Unidirectional Kappas", kappa[rowname=b colname=b format=5.4];
print, "z values for the Unidirectional Kappas",
        zkappa[rowname=b colname=b format=6.3];
print, "Significance Levels for the Unidirectional Kappas",
        pzkappa[rowname=b colname=b format=5.4];




/* Permutation tests of significance */
if (permtest = 1) then do;

obs2    = shape(freqs,1,nrow(freqs)*ncol(freqs));
obs22   = shape((expfreq-(freqs-expfreq)),1,nrow(freqs)*ncol(freqs));
signs2  = shape(signs,1,nrow(freqs)*ncol(freqs));
sigs    = j(nblocks,nrow(freqs)*ncol(freqs),1);

do block = 1 to nblocks;
print, "Currently computing for block :", block;

results = j(nperms,nrow(freqs)*ncol(freqs),-9999);

do perm = 1 to nperms;

/* permuting the sequences; alogrithm from Castellan 1992. */

/* when adjacent codes may be the same */
datap = data;
if (adjacent = 1)  then do;
do i = 1 to (nrow(datap) -1);
kay = int( (nrow(datap) - i + 1) * uniform(seed) + 1 )  + i - 1;
d = datap[i,1];
datap[i,1] = datap[kay,1];
datap[kay,1] = d;
end;
end;

/* when adjacent codes may NOT be the same */
if (adjacent = 0) then do;
datap = ( 0 // data // 0 );
do i = 2 to (nrow(datap) - 2);
limit = 10000;
do j = 1 to limit;
kay = int( ((nrow(datap)-1) - i + 1) * uniform(seed) + 1 )  + i - 1;
if ( (datap[i-1,1]   ^= datap[kay,1]) & (datap[i+1,1]   ^= datap[kay,1])
   & (datap[kay-1,1] ^= datap[i,1])   & (datap[kay+1,1] ^= datap[i,1]) ) then;
j = limit;
end;
d = datap[i,1];
datap[i,1] = datap[kay,1];
datap[kay,1] = d;
end;
datap = datap[2:(nrow(datap)-1),];
end;

/* transitional frequency matrix for permuted data. */
freqsp = j(ncodes,ncodes,0);
do c = 1 to nrow(datap);
if ( c + lag <= nrow(datap) )    then
freqsp[(datap[c,1]),(datap[(c+lag),1])] =
 freqsp[(datap[c,1]),(datap[(c+lag),1])]  + 1;
end;

results[perm,] =  shape(freqsp,1,nrow(freqs)*ncol(freqs));
end;


/* sig levels for the current block of permutations */

/* one-tailed */
if (tailed = 1) then do;
do j = 1 to ncol(results);
counter = 0;
do i = 1 to nrow(results);
if   ( results[i,j] >= obs2[1,j]  & signs2[1,j] > 0 )    then;
counter = counter + 1;
if   ( results[i,j] <= obs2[1,j]  & signs2[1,j] < 0 )    then;
counter = counter + 1;
end;
if (signs2[1,j] ^= 0) then;
sigs[block,j] = counter / nperms;
end;
end;

/* two-tailed */
if (tailed = 2) then do;
do j = 1 to ncol(results);
counter = 0;
do i = 1 to nrow(results);
if ( signs2[1,j] > 0 &  ((results[i,j] >= obs2 [1,j]) |
                         (results[i,j] <= obs22[1,j]))) then;
counter = counter + 1;
if ( signs2[1,j] < 0 &  ((results[i,j] <= obs2 [1,j]) |
                         (results[i,j] >= obs22[1,j]))) then;
counter = counter + 1;
end;
if (signs2[1,j] ^= 0) then;
sigs[block,j] = counter /  nperms;
end;
end;

end;


/* mean significance levels and confidence intervals */
if  (confid = 95 & tailed = 1)  then
z = 1.645;
if  (confid = 95 & tailed = 2)  then
z = 1.96;
if  (confid = 99 & tailed = 1)  then
z = 2.326;
if  (confid = 99 & tailed = 2)  then
z = 2.576;

csums = j(1,ncol(sigs));
do t = 1 to ncol(sigs);
dumc = sigs[,t];
csums[1,t]=sum(dumc);
end;
meansigs = csums / nblocks;
if (nblocks > 1) then do;
semeans  = j(1, ncol(sigs), -9999);
do a = 1 to ncol(sigs);
semeans[1,a]=(sqrt(ssq(sigs[,a]-meansigs[1,a])/(nblocks-1)))/(sqrt(nblocks));
end;

confidhi = meansigs + z # semeans;
confidlo = meansigs - z # semeans;
end;

print, "Permutation Tests of Significance: ";
print, "Number of permutations per block: ", nperms;
print, "Number of blocks of permutations: ", nblocks;
meansigs = shape(meansigs,ncodes,ncodes);
print, "Mean Significance Levels", meansigs[rowname=b colname=b format=7.5];
if (nblocks > 1) then do;
print, "Percentage for the Confidence Intervals:", confid;
confidhi = shape(confidhi,ncodes,ncodes);
print, "High Ends of the Confidence Intervals",
        confidhi[rowname=b colname=b format=7.5];
confidlo = shape(confidlo,ncodes,ncodes);
print, "Low Ends of the Confidence Intervals",
        confidlo[rowname=b colname=b format=7.5];
end;

end;

quit;