Data Management

Below is a comparison of the commands used for common data management tasks in R, SAS, SPSS and Stata. The variables gender and workshop are categorical factors and q1 to q4, pretest and posttest are considered continuous and normally distributed.

The practice data set is shown here. The programs and the data they use are also available for download here. Examples that are missing for Stata reflect differences between the two books. We will get around to those when we write the second edition of R for Stata Users.

Aggregating or Summarizing or Collapsing Data Sets

R

setwd("c:/myRfolder")
load(file = "mydata.RData")
attach(mydata)

# The aggregate Function.

# Means by gender.
myAgg1 <- aggregate(q1, by = data.frame(gender), mean, na.rm = TRUE)
myAgg1

# Now by workshop and gender.
myAgg2 <- aggregate(q1, by = data.frame(workshop, gender), mean, na.rm=TRUE)
myAgg2
mode(myAgg2)
class(myAgg2)

# Aggregation with tapply.

myAgg2 <- tapply(q1, data.frame(workshop, gender), mean, na.rm = TRUE)
myAgg2
class(myAgg2)
mode(myAgg2)

myAgg2 <- tapply(q1, data.frame(workshop, gender), range, na.rm = TRUE)
myAgg2
mode(myAgg1)
class(myAgg2)
myAgg2[[1]]

# Example multi-level transformation.

mydata$Zq1 <- (q1 - mean(q1) ) / sd(q1)
mydata

mySubset <- mydata[ q1 < mean(q1), ]
mySubset

# Rename x to be mean.q1.
library("plyr")
myAgg3 <- rename(myAgg2,
c(x = "mean.q1") )
myAgg3

# Now merge means back with mydata.
mydata2 <- merge(mydata, myAgg3,
by=c("workshop", "gender") )
mydata2

# Tables of Counts
table(workshop)
table(gender, workshop)
myCounts <- table(gender, workshop)
mode(myCounts)
class(myCounts)

# Counts in Summary or
# Aggregate style.
myCountsDF <- as.data.frame(myCounts)
myCountsDF
class(myCountsDF)

SAS

* Get means of q1 for each gender;
PROC SUMMARY DATA=myLib.mydata MEAN NWAY;
CLASS GENDER;
VAR q1;
OUTPUT OUT=myLib.myAgg;
RUN;
PROC PRINT; RUN;
DATA myLib.myAgg;
SET myLib.myAgg;
WHERE _STAT_='MEAN'’MEAN’;
KEEP gender q1;
RUN;
PROC PRINT; RUN;

*Get means of q1 by workshop and gender;
PROC SUMMARY DATA=myLib.mydata MEAN NWAY;
CLASS WORKSHOP GENDER;
VAR Q1;
OUTPUT OUT=myLib.myAgg;RUN;
PROC PRINT; RUN;

*Strip out just the mean and matching variables;
DATA myLib.myAgg;
SET myLib.myAgg;
WHERE _STAT_='MEAN';
KEEP workshop gender q1;
RENAME q1=meanQ1;
RUN;
PROC PRINT; RUN;

*Now merge aggregated data back into mydata;
PROC SORT DATA=myLib.mydata;
BY workshop gender; RUN:
PROC SORT DATA=myLib.myAgg;
BY workshop gender; RUN:
DATA myLib.mydata2;
MERGE myLib.mydata myLib.myAgg;
BY workshop gender;
PROC PRINT; RUN;

SPSS

GET FILE='mydata.sav'.

AGGREGATE
/OUTFILE='myAgg.sav'
/BREAK=gender
/q1_mean = MEAN(q1).
GET FILE='myAgg.sav'.
LIST.

* Get mean of q1 by workshop
and gender.
GET FILE='mydata.sav'.
AGGREGATE
/OUTFILE='myAgg.sav'.
/BREAK=workshop gender
/q1_mean = MEAN(q1).
GET FILE='myAgg.sav'.
LIST.

* Merge aggregated data
back into mydata.
* This step can be saved
* by using
* MODE=ADDVARIABLES
* in the previous step.
GET FILE='mydata.sav'.
SORT CASES BY  workshop (A) gender (A) .
MATCH FILES /FILE=*
/TABLE='\myAgg.sav'
/BY workshop gender.
SAVE OUTFILE='mydata.sav'.

Stata

* Filename: Collapse.do

use c:\myRfolder\mydata
* Get means of q1 for each gender
tabstat q1, by(gender)

* Get means of q1 by workshop and gender;
tabulate outwork gender, summarize(q1) means

* Strip out just the mean and matching variables;
collapse (mean) workshop gender, by(q1)

* Merge aggregated data back into mydata.
* mydata assumed to be sorted by q1
sort q1
merge q1 using mydata

Applying Functions

The other packages have functions that go across the values of an observation, and procedures or commands that go down through observations. R has only commands, and which way they go can be changed by how they are “applied.”

R

load(file = "mydata.RData")
mydata
attach(mydata)

# Mean of the q variables
mean(mydata[3:6], na.rm = TRUE)

# Create mymatrix.
mymatrix <- as.matrix( mydata[ ,3:6] )
mymatrix

# Get mean of whole matrix.
mean(mymatrix, na.rm = TRUE)

# Get mean of matrix columns.
apply(mymatrix, 2, mean, na.rm = TRUE)

# Get mean of matrix rows.
apply(mymatrix, 1, mean, na.rm = TRUE)
rowMeans(mymatrix, na.rm = TRUE)

# Add row means to mydata.
mydata$meanQ <- apply(mymatrix, 1, mean, na.rm = TRUE)
mydata$meanQ <- rowMeans(mymatrix, na.rm = TRUE)
mydata <- transform(mydata,
meanQ = rowMeans(mymatrix, na.rm = TRUE)
)
mydata

# Means of data frames & their vectors.
lapply(mydata[ ,3:6], mean, na.rm = TRUE)
sapply(mydata[ ,3:6], mean, na.rm = TRUE)

mean(
sapply(mydata[ ,3:6], mean, na.rm = TRUE)
)

# Length of data frames & their vectors.
length(mydata[ ,"q3"] )
nrow(mydata)
is.na( mydata[ ,"q3"] )
!is.na( mydata[ ,"q3"] )
sum( !is.na( mydata[ ,"q3"] ) )

# Like the SAS/SPSS n from stat procedures.
library("prettyR")
sapply(mydata, valid.n)

apply(myMatrix, 1, valid.n)
mydata$myQn <- apply(myMatrix, 1, valid.n)
mydata

SAS

LIBNAME myLib 'C:\myRfolder';

 DATA myLib.mydata;
 SET  myLib.mydata;
 myMean = MEAN(OF q1-q4);
 myN = N(OF q1-q4);
 RUN;

 PROC MEANS;
   VAR q1-q4 myMean myN;
 RUN;

SPSS

GET FILE='mydata.sav'.

* Functions work for each observation (row).

COMPUTE myMean = Mean(q1 TO q4).
COMPUTE mySum  = Sum(q1 TO q4).
COMPUTE myN = mySum / myMean.

Stata

* Filename: FunctionsCommands.do

use c:\myRfolder\mydata, clear
preserve

* Some statistical functions
egen mymean = rowmean(q1-q4)
egen mysum = rowtotal(q1-q4)
gen myn = mysum/mymean
restore

* A statistical command
summary

By or Split-File Processing

R

setwd("c:/myRfolder")
load(file = "mydata.RData")
attach(mydata)

# Get means of q variables for all observations.
mean( mydata[ c("q1", "q2",
"q3", "q4") ] ,
na.rm = TRUE)

# Now get means by gender.
myBYout <- by( mydata[
c("q1", "q2", "q3", "q4") ] ,
mydata["gender"],
mean,na.rm = TRUE)
myBYout
mode(myBYout)
class(myBYout)
myBYdata <- as.data.frame( (as.table(myBYout) ) )
myBYdata

# Get range by workshop and gender
myVars <- c("q1", "q2", "q3", "q4")
myBys  <- mydata[
c("workshop", "gender") ]
myBYout <- by( mydata[myVars],
myBys, range, na.rm = TRUE )

# Converting output to data frame.
# A data frame the long way.
myBYdata <- data.frame(
rbind(myBYout[[1]], myBYout[[2]],
myBYout[[3]], myBYout[[4]])
)

# A data frame using do.call.
myBYdata <- data.frame( do.call( rbind, myBYout) )
myBYdata

SAS

PROC MEANS DATA=myLib.mydata;
RUN;

PROC SORT DATA=myLib.mydata;
BY gender;
RUN;
PROC MEANS DATA=myLib.mydata;
BY gender;
RUN;

PROC SORT DATA=myLib.mydata;
BY workshop gender;
RUN;
PROC MEANS DATA=myLib.mydata;
BY workshop gender;
RUN;

SPSS

GET FILE='mydata.sav'.

DESCRIPTIVES
VARIABLES=q1 q2 q3 q4
/STATISTICS=MEAN STDDEV MIN MAX.

SORT CASES BY gender .
SPLIT FILE
SEPARATE BY gender .
DESCRIPTIVES
VARIABLES=q1 q2 q3 q4
/STATISTICS=MEAN STDDEV MIN MAX.

SORT CASES BY workshop gender.
SPLIT FILE
SEPARATE BY workshop gender.
DESCRIPTIVES
VARIABLES=q1 q2 q3 q4
/STATISTICS=MEAN STDDEV MIN MAX.

Stata

use c:\myRfolder\mydata, clear
sort gender
by gender: sum gender
tabstat q1 q2 q3 q4, stat(mean sd min max)
tabstat q1 q2 q3 q4, stat(mean sd min max) by(gender)

Character String Manipulations

These examples uses a different practice data set named “giants.txt”:

born       died
R.A. Fisher    1890-02-17 1962-07-29
Carl Pearson   1857-03-27 1936-04-27
Gertrude Cox   1900-01-13 1978-10-17
John Tukey     1915-06-16 2000-07-26
William Gosset 1876-06-13 1937-10-16

R

# Filename: CharacterStrings.R

 gender <-c("m", "f", "m", NA, "m", "f", "m", "f")
 gender

 options(width = 58)
 letters
 LETTERS

 library("stringr")
 myVars <- str_c("Var", LETTERS[1:6])
 myVars

 setwd("c:/myRfolder")
 giants <- read.fwf(
    file  = "giants.txt",
    width = c(15, 11, 11),
    col.names  = c("name",      "born",      "died"),
    colClasses = c("character", "character", "POSIXct")
 )
 giants

 str_length( giants$name )

 giants[ giants$name == "R.A. Fisher", ]
 giants[ giants$name == "R.A. Fisher    ", ]

 giants$name <- str_trim(giants$name)
 attach(giants)
 str_length(name)

 toupper(name)
 tolower(name)
 library("ggplot2")
 firstUpper( tolower(name) )

 str_sub(name, 1, 5)

 myNamesMatrix <- str_split_fixed(name, " ", 2)
 myNamesMatrix

 myFirst <- myNamesMatrix[ ,1]
 myFirst
 myLast  <- myNamesMatrix[ ,2]
 myLast

 myFirst <- str_replace_all(myFirst, "R.A.", "Ronald A.")

 myLastFirst <- str_c( myLast, ", ", myFirst)
 myLastFirst

 myObs <- myLast == "Tukey"
 myObs
 myObs <- which(myLast == "Tukey")
 myObs
 giants[ myObs, ]

 myObs <- str_detect(myLast, "key")
 myObs

 myTable <- c("Box", "Bayes", "Fisher", "Tukey")
 myObs <- myLast %in% myTable
 myObs
 name[ myObs ]

 myObs <- str_detect( myLast, "Box|Bayes|Fish|key" )
 myObs
 name[ myObs ]

 myAthruM <- str_detect(myLastFirst, "^[A-M]")
 myAthruM
 name[ myAthruM ]
 name[!myAthruM ]

SAS

* Filename: CharacterStrings.sas

LIBNAME MYlIB 'C:\myRfolder';

DATA myLib.giants;
INFILE '\myRfolder\giants.txt'
MISSOVER DSD LRECL=32767;
INPUT name $char14. @16 born mmddyy10. @27 died yymmdd10.;
FORMAT born mmddyy10. died yymmdd10.;
myVarLength=length(name);
born=strip(born); *Not needed;

PROC PRINT;
RUN;

DATA  myLib.giants;
SET myLib.giants;
myLower= lowcase(name);
myUpper= upcase(name);
myProper=propcase(name);
PROC PRINT;   RUN;

DATA  myLib.giants;
SET myLib.giants;
myFirst5=substr(name, 1, 5);
* split names using substr;
myBlank=find(name, " ");
myFirst=strip( substr(name, 1, myBlank) );
myLast =strip( substr(name, myBlank)    );
PUT "Using substr... " myFirst= myLast=;
* splip names using scan;
myFirst=scan(name,1," ");
myLast =scan(name,2," ");
myFirst=tranwrd(myFirst,"R.A.","Ronald A.");
LENGTH myLastFirst $ 17;
myLastFirst= strip(myLast) || ", " || strip(myFirst);
*or: CALL CATX(", ", myLastFirst, myLast, myFirst);
PROC PRINT; VAR name myFirst myLast myLastFirst;
RUN;
DATA  tukey;
SET myLib.giants;
WHERE myLast="Tukey";
PROC PRINT;
VAR name;
RUN;
DATA  tukey;
SET myLib.giants;
WHERE FIND(myLast, "key");
PROC PRINT;
VAR name;
RUN;
DATA  mySubset;
SET myLib.giants;
WHERE myLast IN ("Box","Bayes","Fisher","Tukey");
RUN;
PROC PRINT;
VAR name;
RUN;
DATA  FishOrKey;
SET myLib.giants;
IF FIND(myLast, "Box")   |
FIND(myLast, "Bayes") |
FIND(myLast, "Fish")  |
FIND(myLast, "key") ;
RUN;
PROC PRINT;
VAR name;
RUN;
DATA AthruM;
SET myLib.giants;
firstLetter=substr(myLast, 1, 1);
IF "A" <= firstLetter <= "M";
RUN;
PROC PRINT;
VAR name;
RUN;

SPSS

* Filename: CharacterStrings.sps

CD 'C:\myRfolder'.

DATA LIST FILE='giants.txt' RECORDS=1
/1 name 1-14 (A) born 16-26 (ADATE) died 27-37 (SDATE).

STRING myFirst5 (A5)/ myLower myUpper myLastFirst (A17)
myFirst myLast (A9).
COMPUTE myLength1=LENGTH(name).
COMPUTE name=RTRIM(name).
COMPUTE myLength2=LENGTH(name).
COMPUTE myLower=LOWER(name).
COMPUTE myUpper=UPCASE(name).
LIST name myLower myUpper myLength1 myLength2.

COMPUTE myFirst5=SUBSTR(name, 1, 5).
COMPUTE myBlank= INDEX(name, " ").
COMPUTE myFirst=SUBSTR(name, 1, myBlank-1).
COMPUTE myFirst=REPLACE(myFirst, "R.A.", "Ronald A.").
COMPUTE myLast=SUBSTR(name, myBlank+1).
COMPUTE myLastFirst=CONCAT( RTRIM(myLast),
", ", RTRIM(myFirst) ).
LIST name myFirst myLast myLastFirst.

TEMPORARY.
SELECT IF (myLast EQ "Tukey").
LIST name.

TEMPORARY.
SELECT IF (CHAR.RINDEX(myLast,"Fish") GE 1
OR CHAR.RINDEX(myLast,"key") GE 1).
LIST name.

TEMPORARY.
SELECT IF (
myLast EQ "Box" OR
myLast EQ "Bayes" OR
myLast EQ "Fisher" OR
myLast EQ "Tukey").
LIST name.

TEMPORARY.
SELECT IF (
name EQ "Box" OR
name EQ "Bayes" OR
CHAR.RINDEX(myLast,"Fish") GE 1 OR
CHAR.RINDEX(myLast,"key") GE 1).
LIST name.

TEMPORARY.
SELECT IF(SUBSTR(myLast, 1, 1) LE "M").
LIST name.

Date and Time Manipulations

These examples uses a different practice data set named “giants.txt”:

born       died
R.A. Fisher    1890-02-17 1962-07-29
Carl Pearson   1857-03-27 1936-04-27
Gertrude Cox   1900-01-13 1978-10-17
John Tukey     1915-06-16 2000-07-26
William Gosset 1876-06-13 1937-10-16

R

setwd("c:/myRfolder")
giants <- read.fwf(
file        = "giants.txt",
width       = c(15,11,11),
col.names   = c("name",      "born",      "died"),
colClasses  = c("character", "character", "POSIXct"),
row.names   = "name",
strip.white = TRUE,
)
giants
class(giants$born)  # A character vector.

library("lubridate")
giants$born <- mdy(giants$born)
giants

class(giants$born)
class(giants$died)

giants  # They display in yyyy-mm-dd by default.
attach(giants)

unclass( born )
as.numeric( born )
as.POSIXct(
c(-2520460800,-3558556800,-2207952000,
-1721347200, -2952201600),
origin="1960-01-01", tz="UTC" )

#---Calculating Durations---

age <- difftime(died, born, units="secs")
age
age <- difftime(died, born)
age # now we have age in days

mode( age )
class( age )  # it's a difftime object

as.period(age)

age/365.2425  # age in years
giants$age <- round(
as.numeric( age/365.2425 ), 2 )
giants

now()  # Current date-time.
difftime( now(), died ) / 365.2425

# Again, using subtraction.
age <- died - born  # age in days
age
age / 365.2425   # Age in years, mislabeled.
mode(age)
class( age )     # it's an interval object.
names( age )
sapply( age, unclass )
# Not a helpful age to store in our data frame!

#---Adding Durations to Date-Times---

age <- as.duration(
c(2286057600,2495664000,2485382400,
2685916800,1935705600)
)
class(age)
born+age
died

#---Accessing Date-Time Elements---

year(born)
month(born)
month(born, label = TRUE)
day(born)   # day of month
wday(born)  # day of week
wday(born, label = TRUE, abbr = FALSE)
yday(born)

#---Creating Date-Times from Elements---

myYear  <- year(died)
myMonth <- month(died)
myDay   <- day(died)

myDateString <- paste(myYear, myMonth, myDay, sep="/")
myDateString
died2 <- ymd(myDateString)
died2

#---Logical Comparisons with Date-Times---

giants[ born > mdy("1/1/1900") , ]

#---SAS Picture Format Example---

myDateText <- format(born, "%B %d, %Y is day %j of %Y")
myDateText

myDateText <- format(born,
"was born on the %jth day of %Y")
for (i in 1:5) cat(rownames(giants)[i],
myDateText[i],"\n")

# Two-Digit Years

my1969 <- mdy("08/31/69")
my1969
my1968 <- mdy("08/31/68")
my1968
my1968 <- my1968 - as.duration(100 * 365.2425 * 24 * 60 * 60)
my1968

as.POSIXlt("08/31/68", format="%m/%d/%y")
as.POSIXlt("08/31/69", format="%m/%d/%y")

as.POSIXlt("08/31/69", format="%m/%d/%y")

SAS

LIBNAME MyLib 'C:\myRfolder';

DATA myLib.giants;
INFILE '\myRfolder\giants.txt'
MISSOVER DSD LRECL=32767;
INPUT name $char14. @16 born mmddyy10. @27 died mmddyy10.;

PROC PRINT;
RUN;

PROC PRINT;
FORMAT died born mmddyy10.;
RUN;

* Caculating Durations.;
DATA myLib.giants;
SET  myLib.giants;
age  = (died-born)/365.2425;
longAgo = ( today()-died )/365.2425;
RUN;

PROC PRINT;
FORMAT died born mmddyy10. age longAgo 5.2 ;
RUN;

* Adding Durations to Date-Times.
DATA myLib.giants;
SET myLib.giants;
died=born+age;
RUN;
PROC PRINT;
FORMAT died born mmddyy10. age 5.2;
RUN;

* Accessing Date-Time Elements;
DATA myLib.giants;
SET myLib.giants;
myYear=YEAR(born);
myMonth=MONTH(born);
myDay  =DAY(born);
PROC PRINT;
FORMAT died born mmddyy10. age 5.2;
RUN;

* Creating Date-Time Variables from Elements;
DATA myLib.giants;
set myLib.giants;
born=MDY(myMonth, myDay, myYear);
PROC PRINT;
FORMAT died born mmddyy10. age 5.2;
RUN;

* Logical Comparisons with Date-Times;
DATA Born1900s;
set myLib.giants;
if born > "01jan1900"d ;
PROC PRINT;
FORMAT died born mmddyy10. age 5.2;
RUN;

* Formatting Date-Time Output;
PROC FORMAT;
PICTURE myFormatI
LOW - HIGH = '%B %d, %Y is day %j of %Y'
(DATATYPE=DATE);
RUN;
PROC PRINT DATA=myLib.giants;
VAR born;
FORMAT born myFormatI40.;
RUN;

PROC FORMAT;
PICTURE myFormatII
LOW - HIGH = ' was born on the %jth day of %Y'
(DATATYPE=DATE);
RUN;
DATA _NULL_;
SET myLib.giants;
PUT name $char14. born myFormatII34.;
run;

SPSS

CD 'C:\myRfolder'.

DATA LIST FILE='giants.txt' RECORDS=1
/1 name 1-14 (A) born 16-26 (ADATE) died 27-37 (SDATE).

* Calculating Durations.
COMPUTE age=died-born.
LIST.
COMPUTE age=(died-born) / (365.2425*24*60*60).
LIST.
COMPUTE longAgo=($TIME-died) /(365.2425*24*60*60) .
LIST.

* Adding Durations to Date-Times.
COMPUTE died=born+age.
LIST.

* Accessing Date-Time Elements.
COMPUTE myYear=XDATE.YEAR(born).
COMPUTE myMonth=XDATE.MONTH(born).
COMPUTE myDay=XDATE.MDAY(born).
LIST name born myYear myMonth myDay.

* Creating Date-Time Variables from Elements.
COMPUTE born=DATE.MDY(myMonth, myDay, myYear).
LIST name born.

* Logical Comparisons with Date-Times.
TEMPORARY.
SELECT IF born GE date.mdy(1,1,1900).
LIST name born.

* Formatting Date-Time Output.
PRINT /born (adate) ' is the' myDay (F3.0)
'th day of ' myYear (F4.0).
EXECUTE.

PRINT /name 'was born on the' myDay (F3.0)
'th day of ' myYear (F4.0).
EXECUTE.

Duplicate Observations: Finding and Deleting

R

load("mydata.RData")
mydata

# Create some duplicates.
myDuplicates <- rbind(mydata, mydata[1:2, ])
myDuplicates

# Get rid of duplicates without seeing them.
myNoDuplicates <- unique(myDuplicates)
myNoDuplicates

# This checks for location of duplicates
# before getting rid of them.

myDuplicates <- rbind(mydata, mydata[1:2, ])
myDuplicates

myDuplicates$DupRecs <- duplicated(myDuplicates)
myDuplicates

# Print a report of just the duplicate records.
attach(myDuplicates)
myDuplicates[DupRecs, ]

# Remove duplicates and Duplicated variable.
myNoDuplicates <-
myDuplicates[!DupRecs, -7 ]
myNoDuplicates

# Locate records with duplicate keys.
myKeys <- c("workshop", "gender")
mydata$DupKeys <-
duplicated(mydata[ ,myKeys])
mydata

SAS

DATA mycopy; SET myLib.mydata;
Data lastTwo;
SET myLib.mydata;
IF ID GE 7;
RUN;

DATA Duplicates;
SET mycopy lastTwo;
PROC PRINT; RUN;

PROC SORT NODUPREC DATA=Duplicates;
BY id workshop gender q1-q4;
RUN;

PROC PRINT;
RUN;

PROC SORT NODUPKEY EQUALS DATA=mycopy;
BY workshop gender;
RUN;

PROC PRINT DATA=mycopy;
RUN;

SPSS


GET FILE='mydata.sav'.
* Identify Duplicate Cases.
SORT CASES BY workshop(A) gender(A)
q2(A) q1(A) q3(A) q4(A) .
MATCH FILES /FILE = *
/BY workshop gender q2 q1 q3 q4
/FIRST = PrimaryFirst
/LAST = PrimaryLast.
DO IF (PrimaryFirst).
+ COMPUTE MatchSequence = 1 - PrimaryLast.
ELSE.
+ COMPUTE MatchSequence = MatchSequence + 1.
END IF.
LEAVE MatchSequence.
FORMAT MatchSequence (f7).
COMPUTE InDupGrp = MatchSequence > 0.
SORT CASES InDupGrp(D).
MATCH FILES /FILE = *
/DROP = PrimaryFirst InDupGrp MatchSequence.
VARIABLE LABELS
PrimaryLast 'Indicator of each last matching case as Primary'.
VALUE LABELS PrimaryLast
0 'Duplicate ‘Duplicate Case' Case’
1 'Primary ‘Primary Case'Case’.
VARIABLE LEVEL PrimaryLast (ORDINAL).
FREQUENCIES VARIABLES = PrimaryLast.

Stata

use c:\myRfolder\mydata, clear
duplicates drop id workshop gender q1-q4, force
list

First / Last Observations per Group, Finding

R

setwd("c:/myRfolder")
load(file = "mydata.RData")
mydata$id <- row.names(mydata)
mydata

myBys <-
data.frame(mydata$workshop, mydata$gender)
mylastList <- by(mydata, myBys, tail, n = 1)
mylastList

#Back into a data frame:
mylastDF <- do.call(rbind, mylastList)
mylastDF

# Another way to create the data frame:
mylastDF <- rbind(mylastList[[1]],
mylastList[[2]],
mylastList[[3]],
mylastList[[4]])
mylastDF

# Generating just an indicator variable
mylastDF$lastGender <- rep(1, nrow(mylastDF) )
mylastDF

mylastDF2 <- mylastDF[ c("id", "lastGender") ]
mydata2 <- merge(mydata, mylastDF2, by = "id", all = TRUE )
mydata2

mydata2$lastGender[
is.na(mydata2$lastGender) ] <- 0
mydata2

SAS

PROC SORT DATA=sasuser.mydata;
BY workshop gender;
RUN;

DATA sasuser.mylast;
SET  sasuser.mydata;
BY   workshop gender;
IF   last.gender;
RUN;

PROC PRINT; RUN;

SPSS


* Match files method.
GET FILE='mydata.sav'.
SORT CASES BY workshop gender.
MATCH FILES FILE=* /By workshop gender /LAST=lastgender.
SELECT IF lastgender.
LIST.
SAVE OUTFILE='mylast.sav'.

* Aggregation method.
SORT CASES BY workshop gender.
AGGREGATE /OUTFILE='C:\mylast.sav'
/BREAK workshop gender
/q1 = LAST(q1)
/q2 = LAST(q2)
/q3 = LAST(q3)
/q4 = LAST(q4).

* Using LIST here would display original file.
GET FILE='mylast.sav'.
DATASET NAME DataSet5 WINDOW=FRONT.
LIST.

Stata


use c:\myRfolder\mydata, clear
sort workshop gender
by workshop gender: keep if _n==_N
list

Indicator or Dummy Variables

R will create these automatically if you use a factor as a regression predictor.

R

load("mydata100.RData")
attach(mydata100)

r     <- as.numeric(workshop == "R"    )
sas   <- as.numeric(workshop == "SAS"  )
spss  <- as.numeric(workshop == "SPSS" )
stata <- as.numeric(workshop == "Stata")

lm(posttest ~ pretest + sas + spss + stata)

# Same result without them:
lm(posttest ~ pretest + workshop)

SAS

DATA  temp;
SET myLib.mydata100;
r     = workshop = 1;
sas   = workshop = 2;
spss  = workshop = 3;
stata = workshop = 4;
RUN;

PROC REG;
MODEL posttest = pretest sas spss stata;
RUN;

SPSS

GET FILE='mydata100.sav'.
DATASET NAME DataSet2 WINDOW=FRONT.

COMPUTE r     = workshop EQ 1.
COMPUTE sas   = workshop EQ 2.
COMPUTE spss  = workshop EQ 3.
COMPUTE stata = workshop EQ 4.
EXECUTE.

REGRESSION
/DEPENDENT posttest
/METHOD=ENTER pretest sas spss stata.
EXECUTE.

Joining or Merging Data Sets

R

# Read data keeping ID as a variable.
mydata <- read.table("mydata.csv",
header = TRUE, sep = ",",
na.strings = " ")

# Create a data frame keeping
# the left two q variables.
myleft <- mydata[ c("id",
"workshop", "gender", "q1",
"q2") ]

# Create a data frame keeping the right two q variables.
myright <- mydata[ c("id",
"workshop", "q3", "q4") ]

# Merge the two data frames
# by ID.
both <- merge(myleft, myright,
by = "id")

# Merge the two data frames
# by ID allowing different
# ID names in each data set.
both <- merge(myleft, myright,
by.x = "id", by.y = "id" )

# Merge data frames
# by both ID and workshop.
both <- merge(myleft, myright,
by = c("id","workshop"))

#Merge dataframes by both ID
# and workshop, while allowing
# them to have different names.
both <- merge(myleft, myright,
by.x=c("id", "workshop"),
by.y=c("id", "workshop") )

SAS

DATA  myLib.myleft;
SET mylib.mydata;
KEEP id workshop gender
q1 q2;
PROC SORT; BY id workshop;

DATA  myLib.myright;
SET myLib.mydata;
KEEP id workshop q3 q4;
PROC SORT; BY id workshop;

DATA  myLib.both;
MERGE myLib.myleft
myLib.myright;
BY id workshop;
RUN;

SPSS

GET FILE='mydata.sav'.
DELETE VARIABLES q3 to q4.
SAVE OUTFILE='myleft.sav'.

GET FILE='mydata.sav'.
DELETE VARIABLES gender,
q1 to q2.
SAVE OUTFILE='myright.sav'.

GET FILE='myleft.sav'.
MATCH FILES /FILE=*
/FILE='myright.sav'
/BY id.

<strong>Stata</strong>

1
* Filename: Merge.do

use c:\myRfolder\mydata, clear
drop q3 q4
save c:\myRfolder\myleft

use c:\myRfolder\mydata, clear
drop workshop-q2
save c:\myRfolder\myright

use c:\myRfolder\myleft, clear
sort id

merge id using c:\myRfolder\myright
save c:\myRfolder\both, replace

Keeping or Dropping Variables

R

# Keep using variable selection.
myleft <- mydata[ ,1:4]
myleft

# Drop using NULL.
myleft <- mydata
myleft$q3 <- myleft$q4 <- NULL

SAS

* Keeping vars;
DATA myleft;
SET mydata;
KEEP id workshop gender
q1 q2;

*Equivalently dropping;
DATA myleft; SET mydata;
DROP q3 q4;

SPSS

GET FILE='mydata.sav'.
DELETE VARIABLES q3 to q4.

Stata

use c:\myRfolder\mydata, clear
keep id workshop gender q1 q2
*or equivalently;
* drop q3 q4
save c:\myRfolder\myleft, replace

Missing Values

These examples use a different version of mydata which includes periods, 9’s and 99’s as missing value codes. Note that the periods in R are not read automatically as missing values! Here is mydataNA:

workshop gender q1 q2 q3 q4
1        1      f  1  1  5  1
2        2      f  2  1  4 99
3        .      f  9  2  4  3
4        2      .  3  9 99  3
5        1      m  4  5  2  4
6        .      m  9  9  5  5
7        1      .  5  3 99  4
8        2      m  4  5  5 99


R

setwd("c:/myRfolder")

mydataNA <- read.table("mydataNA.txt")
mydataNA

# Read it so that ".", 9, 99 are missing.
mydataNA <- read.table("mydataNA.txt",
na.strings = c(".", "9", "99") )
mydataNA

# Convert 9 and 99 manually
mydataNA <- read.table("mydataNA.txt",
na.strings=".")
mydataNA[mydataNA == 9 | mydataNA == 99] <- NA
mydataNA
# Substitute the mean for missing values.
mydataNA$q1[is.na(mydataNA$q1)] <-
mean(mydataNA$q1, na.rm = TRUE)
mydataNA

# Eliminate observations with any NAs.
myNoMissing <- na.omit(mydataNA)
myNoMissing

# Test to see if each case is complete.
complete.cases(mydataNA)

# Use that result to select complete cases.
myNoMissing <- mydataNA[ complete.cases(mydataNA), ]
myNoMissing

# Use that result to select incomplete cases.
myIncomplete <- mydataNA[ !complete.cases(mydataNA), ]
myIncomplete

# When "99" Has Meaning...
mydataNA <- read.table("mydataNA.txt", na.strings = ".")
mydataNA
attach(mydataNA)

# Assign missing values for q variables.
mydataNA$q1[q1 ==  9] <- NA
mydataNA$q2[q2 ==  9] <- NA
mydataNA$q3[q3 == 99] <- NA
mydataNA$q4[q4 == 99] <- NA
mydataNA
detach(mydataNA)

# Read file again, this time use functions.
mydataNA <- read.table("mydataNA.txt", na.strings = ".")
mydataNA
attach(mydataNA)

#Create a functions that replaces 9, 99 with NAs.
my9isNA   <- function(x) { x[x ==  9] <- NA; x}
my99isNA  <- function(x) { x[x == 99] <- NA; x}

# Now apply our functions to the data frame using lapply.
mydataNA[3:4] <- lapply( mydataNA[3:4], my9isNA )
mydataNA[5:6] <- lapply( mydataNA[5:6], my99isNA )
mydataNA

SAS

DATA myLib.mydata;
SET  myLib.mydata;

*Convert 9 to missing, one at a time.
IF q1=9 THEN q1=.;
IF q2=9 THEN q2=.;
IF q3=99 THEN q2=.;
IF q4=99 THEN q4=.;

* Same thing but is quicker for lots of vars;
ARRAY q9 q1-q2;
DO OVER q9;
IF q9=9 THEN q=.;
END;

ARRAY q99 q3-q4;
DO OVER q99;
IF q=99 THEN q99=.;
END;

SPSS

CD 'C:\myRfolder'.
GET FILE=('mydata.sav').

MISSING q1 TO q2 (9) q3 TO q4 (99).

SAVE OUTFILE='mydata.sav'.

Stata

* Filename: MissingValues.do

use c:\myRfolder\mydata, clear
preserve
replace q1=. if q1==9
replace q2=. if q2==9
replace q3=. if q3==99
replace q4=. if q4==99

* Same thing but is quicker for lots of vars
restore
forvalues i = 1/2 {
replace q`i'=. if q`i'==9
}
forvalues i = 3/4 {
replace q`i'=. if q`i'==90
}
restore

Ranks, Calculating

R

# Applies to vars 3 through 6,
# the vars q1 to q4.

myvars <- c("q1","q2","q3","q4")
myRanks <-
sapply(mydata[myvars], rank)

SAS

PROC RANK DATA=mylib.mydata
OUT=myRanks;
VAR Q1-Q4;
  PROC PRINT;

SPSS


RANK VARIABLES=q1 q2 q3 q4 (A)
/RANK
/PRINT=YES
/TIES=MEAN.

Recoding Variables

R

load(file = "myWorkspace.RData")
mydata
attach(mydata)

library("car")
mydata$qr1 <- recode(q1, "1=2; 5=4")
mydata$qr2 <- recode(q2, "1=2; 5=4")
mydata$qr3 <- recode(q3, "1=2; 5=4")
mydata$qr4 <- recode(q4, "1=2; 5=4")
detach(mydata)

# This approach works on thousands.

load(file = "mydata.RData")
attach(mydata)

# Generate two sets of var names
# so we can save original vars.
# QR vars will be the recoded ones.
myQnames <-
paste( "q",  1:4, sep = "")
myQRnames <-
paste( "qr", 1:4, sep = "")

# Extract the q variables to a separate data frame.
myQRvars <- mydata[ ,myQnames]

# Rename all of the variables.
names(myQRvars) <- myQRnames
myQRvars

# Create a function to apply the labels to lots of variables.
myRecoder <- function(x) { recode(x,"1=2; 5=4") }

# Here's how to use the function
# on one variable.
myRecoder(myQRvars$qr1)

#Apply it to all of the variables.
myQRvars <- sapply( myQRvars, myRecoder)
myQRvars

# Save it back to mydata
# if you want.
mydata <- cbind(mydata,myQRvars)

SAS

DATA myLib.mydata;
INFILE "mydata.csv"
delimiter = ","
MISSOVER DSD
LRECL=32767
firstobs=2;

INPUT id workshop gender $ q1 q2 q3 q4;

PROC PRINT; RUN;

PROC FORMAT;
VALUE Agreement
1="Disagree"
2="Disagree"
3="Neutral"
4="Agree"
5="Agree"; run;

DATA myLib.mydata;
SET myLib.mydata;
ARRAY q q1-q4;
ARRAY qr qr1-qr4;
*r for recoded;
DO i=1 to 4;
qr{i}=q{i};
if q{i}=1 then
qr{i}=2;
else if
q{i}=5 then qr{i}=4;
END;
FORMAT q1-q4 qr1-qr4
Agreement.;
RUN;

* This will use the recoded formats automatically;

PROC FREQ; TABLES q1-q4; RUN;

* This will ignore the formats;
* Note high/low values are 1/5;
PROC UNIVARIATE;
VAR q1-q4; RUN;

* This will use the 1-3 codings, not a good idea!;
* High/Low values are now 2/4;

PROC UNIVARIATE;
VAR qr1-qr4;
RUN;

SPSS

GET FILE='mydata.sav'.

RECODE q1 to q4 (1=2) (5=4).

SAVE OUTFILE='myleft.sav'.

Stata

* Filename: Recode.do

use c:\myRfolder\mydata, clear
recode q1-q4 (1=2) (5=4)

Renaming Variables

R

library("plyr")
# Use old="new" format.

myChanges <-
c(q1 = "x1", q2 = "x2",
q3 = "x3", q4 = "x4")
mydata <- rename(mydata, myChanges)

# The standard R approach is
# more complex when renaming just
# a few variables:

names(mydata) <- c("workshop", "gender", "x1", "x2", "x3", "x4")

# See books for many more examples.

SAS

DATA myLib.mydata;
RENAME q1-q4=x1-x4;
*or;
RENAME q1=x1  q2=x2
q3=x3  q4=x4;

SPSS


GET FILE='mydata.sav'.
RENAME VARIABLES (q1=x1)(q2=x2)(q3=x3)(q4=x4).
* Or...
RENAME VARIABLES (q1 q2 q3 q4 = x1 x2 x3 x4).
* Or...
RENAME VARIABLES (q1 TO q4 = x1 TO x4).

Stata

* Filename: Rename.do

use c:\myRfolder\mydata, clear
preserve
rename q1 x1
rename q2 x2
rename q3 x3
rename q4 x4

* or
restore
forvalues i = 1/4 {
rename q`i' x`i'
}

Reshaping Variables to Observations and Back

R

# Filename: Reshape.R

setwd("c:/myRfolder")
load("mydata.RData")

library("reshape2")
myChanges <- c(
q1 = "time1",
q2 = "time2",
q3 = "time3",
q4 = "time4")
mydata <- rename(mydata, myChanges)
mydata$subject <- factor(1:8)
mydata

# Reshaping from wide to long
library("reshape2")  # Just a reminder
mylong <- melt(mydata)
mylong

# Again, specifying arguments
mylong <- melt(mydata,
id.vars      = c("subject", "workshop", "gender"),
measure.vars = c("time1", "time2", "time3", "time4"),
value.name   = "variable")
mylong

# Reshaping from long to wide
mywide <- dcast(mylong,
subject + workshop + gender ~ variable)
mywide

# ---Two Time Variables---

load("mydata.RData")
mydata$subject <- factor(1:8)
library("reshape2")
myChanges <- c(
q1 = "M1_L1",
q2 = "M1_L2",
q3 = "M2_L1",
q4 = "M2_L2")
mydata <- rename(mydata, myChanges)
mydata

library("reshape2")  # Just a reminder
mylong2 <- melt(mydata)
mylong2

# Same thing with arguments specified
mylong2 <- melt(mydata,
id.vars      = c("subject", "workshop", "gender"),
measure.vars = c("M1_L1", "M1_L2", "M2_L1", "M2_L2"),
value.name   = "value")
mylong2

mylong2$method <- rep( c("M1", "M2"), each=16, times=1)
mylong2$level  <- rep( c("L1", "L2"), each=8,  times=2)
mylong2
mylong2$variable <- NULL

# Reshape to wide
mywide2 <- dcast(mylong2,
gender + workshop + subject ~ method + level)
mywide2
mywide2[ order(mywide2$subject), ]

# Aggregation via reshape

dcast(mylong2, gender + workshop ~ method + level,
mean, na.rm = TRUE)
dcast(mylong2, gender + workshop ~ method,
mean, na.rm = TRUE)
dcast(mylong2, workshop ~ .,
mean, na.rm = TRUE)

SAS

* Filename: Reshape.sas ;

LIBNAME myLib 'C:\myRfolder';

* Wide to long;
PROC TRANSPOSE DATA=mylib.mydata
OUT=myLib.mylong;
VAR q1-q4;
BY id workshop gender;
PROC PRINT;
RUN;
DATA mylib.mylong;
SET mylib.mylong( rename=(COL1=value) );
time=INPUT( SUBSTR( _NAME_, 2) , 1.);
DROP _NAME_;
RUN;
PROC PRINT;
RUN;

* Long to wide;
PROC TRANSPOSE DATA=mylib.mylong
OUT=myLib.mywide PREFIX=q;
BY id workshop gender;
ID time;
VAR value;
RUN;
DATA mylib.mywide;
SET mylib.mywide(DROP=_NAME_);
RUN;
PROC PRINT;
RUN;

SPSS


* Filename: Reshape.sps .

CD 'C:\myRfolder'.
GET FILE='mydata.sav'.
* Wide to long.
VARSTOCASES  /MAKE Y FROM q1 q2 q3 q4
/INDEX = Question(4)
/KEEP =  id workshop gender
/NULL = KEEP.
LIST.
SAVE OUTFILE='mywide.sav'.
* Long to wide.
GET FILE='mywide.sav'.
CASESTOVARS
/ID = id workshop gender
/INDEX = Question
/GROUPBY = VARIABLE.
LIST.
SAVE OUTFILE='mylong.sav'.

Stata

* Filename: Reshape.do

use c:\myRfolder\mydata, clear

reshape long q, i(subject) j(item)
list

reshape wide q, i(subject) j(item)
list

Sorting Data Sets

R

setwd("c:/myRfolder")
load(file = "mydata.RData")
mydata

# Show first four observations in order.
mydata[ c(1, 2, 3, 4), ]

# Show them in reverse order.
mydata[ c(4, 3, 2, 1), ]

# Create order variable for workshop.
myW <- order( mydata$workshop )
myW
mydata[ myW, ]

# Create order variable for gender then workshop.
myGW <- order( mydata$gender, mydata$workshop )
myGW
mydata[ myGW, ]

# Create order variable for
# descending (-) workshop then gender
myWdG <- order( -mydata$workshop, mydata$gender )
myWdG

# Print data in WG order.
mydata[ myWdG, ]

# Save data in WdG order.
mydataSorted <- mydata[ myWdG,  ]
mydataSorted

SAS

LIBNAME myLib '\myRfolder';

PROC SORT  DATA = myLib.mydata;
BY workshop;
RUN;
PROC PRINT DATA = myLib.mydata;
RUN;

PROC SORT  DATA = myLib.mydata;
BY gender workshop;
RUN;
PROC PRINT DATA = myLib.mydata;
RUN;

PROC SORT  DATA = myLib.mydata;
BY descending gender workshop;
RUN;
PROC PRINT DATA = myLib.mydata;
RUN;

SPSS

CD '\myRfolder'.
GET FILE = 'mydata.sav'.

SORT CASES BY workshop (A).
LIST.

SORT CASES BY gender (A) workshop (A).
LIST.

SORT CASES BY  gender (D) workshop (A).
LIST.

Stata

use c:\myRfolder\mydata, clear

* sort workshop ascending order
sort workshop
list

* sort gender and workshop, both ascending order
sort gender workshop
list

* sort workshop descending and gender in ascending order.
gsort - workshop + gender
list

Stacking or Concatenating or Adding Data Sets

R

load(file = "mydata.RData")
mydata
attach(mydata)

# Create female data frame.
females <-
mydata[which(gender == "f"),]

# Create male data frame.
males <-
mydata[which(gender == "m"),]

# Bind their rows together
# with the rbind function.
both <- rbind(females, males)

# Drop q2 to see what happens.
males$q2 <- NULL
males

# See that row bind will
# not work now.
both <- rbind(females, males)

# Use plyr rbind.fill instead.
library("plyr")
both <-
rbind.fill(females, males)

# Add a q2 variable to males.
males <- data.frame(males, q2 = NA)

# Now rbind can handle it.
both <-
rbind(females, males)

SAS

DATA males;
SET mydata;
WHERE gender='m';
RUN;

DATA females;
SET mydata;
WHERE gender='f';
RUN;

DATA both;
SET males females;
RUN;
PROC PRINT; RUN;

SPSS

GET FILE='mydata.sav'.
SELECT IF(gender = "f").
LIST.
SAVE OUTFILE='females.sav'.
EXECUTE .

GET FILE='mydata.sav'.
SELECT IF(gender = "m").
LIST.
SAVE OUTFILE='males.sav'.
EXECUTE .

GET FILE='females.sav'.
ADD FILES /FILE=*
/FILE='males.sav'.
LIST.
EXECUTE.

Stata

use c:\myRfolder\mydata, clear
preserve

keep if gender=="m"
save c:\myRfolder\mymale
restore

keep if gender=="f"
save c:\myRfolder\myfemale
use c:\myRfolder\mymale, clear

append using c:\myRfolder\myfemale
save c:\myRfolder\both

Transformations

R

load(file = "mydata.RData")
mydata

# Transformation in the middle
# of another function.
summary( log(mydata$q4) )

# Using dollar notation.
mydata$meanQ <-
(mydata$q1 + mydata$q2 +
mydata$q3 + mydata$q4) / 4
mydata

# Creating meanQ using transform.
mydata <- transform(mydata,
meanQ=(q1 + q2 + q3 + q4) / 4 )
mydata

SAS

LIBNAME myLib 'C:\myRfolder';

DATA myLib.mydataTransformed;
SET  myLib.mydata;
totalq = (q1 + q2 + q3 + q4);
logtot = log10(totalq);
mean1 = (q1 + q2 + q3 + q4) / 4;
mean2 = mean(of q1-q4);

PROC PRINT; RUN;

SPSS

GET FILE='mydata.sav'.

COMPUTE Totalq=q1+q2+q3+q4.
COMPUTE Logtot=lg10(totalq).
COMPUTE Mean1=(q1+q2+q3+q4)/4.
COMPUTE Mean2=MEAN(q1 TO q4).

SAVE OUTFILE='mydataTransformed.sav'.
LIST.

Stata

* Filename: Transform.do

use c:\myRfolder\mydata, clear
gen totalq = q1+q2+q3+q4
gen logtotalq = log10(totalq)
gen mean1 = totalq/4
egen mean2 = mean(q1-q4)
save c:\myRfolder\mydataT

Transformations, Conditional

R

setwd("c:/myRfolder")
load(file = "mydata.RData")
mydata
attach(mydata)

mydata$q4Sagree <- ifelse(q4 == 5, 1, 0)
mydata$q4Sagree

mydata$q4Sagree <- as.numeric(q4 == 5 )
mydata$q4Sagree

mydata$q4agree <- ifelse(q4 >= 4, 1, 0)
mydata$q4agree

mydata$ws1agree <- ifelse(workshop == 1 & q4 >=4 , 1, 0)
mydata$ws1agree

mydata$score <- ifelse(gender == "f",
(2 * q1) + q2,
(3 * q1) + q2
)
mydata

# ---Cutting posttest---

detach(mydata)
load("mydata100.RData")
attach(mydata100)
head(mydata100)

# An inefficient approach:
postGroup <- posttest
postGroup <- ifelse(posttest< 60              , 1, postGroup)
postGroup <- ifelse(posttest>=60 & posttest<70, 2, postGroup)
postGroup <- ifelse(posttest>=70 & posttest<80, 3, postGroup)
postGroup <- ifelse(posttest>=80 & posttest<90, 4, postGroup)
postGroup <- ifelse(posttest>=90              , 5, postGroup)

table(postGroup)

# An efficient approach:
postGroup <-
ifelse(posttest <  60                 , 1,
ifelse(posttest >= 60 &  posttest < 70, 2,
ifelse(posttest >= 70 &  posttest < 80, 3,
ifelse(posttest >= 80 &  posttest < 90, 4,
ifelse(posttest >= 90                 , 5, posttest)
))))
table(postGroup)

# Logical approach:
postGroup <- 1+
(posttest >= 60)+
(posttest >= 70)+
(posttest >= 80)+
(posttest >= 90)
table(postGroup)

# ---Cutting Functions---

# Hmisc cut2 function
library("Hmisc")
postGroup <- cut2(posttest, c(60, 70, 80, 90) )
table(postGroup)

postGroup <- cut2(posttest, g = 5)
table(postGroup)

postGroup <- cut2(posttest, m = 25)
table(postGroup)

SAS

LIBNAME myLib 'C:\myRfolder';

DATA myLib.mydataTransformed;
SET myLib.mydata;
IF q4  = 5 then x1 = 1; else x1 = 0;
IF q4 >= 4 then x2 = 1; else x2 = 0;
IF workshop = 1 & q4 >= 5 then  x3 = 1;
ELSE x3 = 0;
IF gender = "f" then scoreA = 2 * q1 + q2;
Else scoreA = 3 * q1 + q2;
IF workshop = 1 and q4 >= 5
THEN scoreB = 2 * q1 + q2;
ELSE scoreB = 3 * q1 + q2;
RUN;
PROC PRINT; RUN;

DATA myLib.mydataTransformed;
SET myLib.mydata100;

IF      (posttest <  60)                 THEN postGroup = 1;
ELSE IF (posttest >= 60 & posttest < 70) THEN postGroup = 2;
ELSE IF (posttest >= 70 & posttest < 80) THEN postGroup = 3;
ELSE IF (posttest >= 80 & posttest < 90) THEN postGroup = 4;
ELSE IF (posttest >= 90)                 THEN postGroup = 5;
RUN;

PROC FREQ; TABLES postGroup; RUN;

PROC RANK OUT=myLib.mydataTransformed GROUPS=5;
VAR posttest;
RUN;

PROC FREQ; TABLES posttest; RUN;

SPSS

CD 'C:\myRfolder'.
GET FILE = 'mydata.sav'.

DO IF (Q4 eq 5).
+ COMPUTE X1 = 1.
ELSE.
+ COMPUTE X1 = 0.
END IF.

DO IF (Q4 GE 4).
+ COMPUTE X2 = 1.
ELSE.
+ COMPUTE X2 = 0.
END IF.

DO IF (workshop EQ 1 AND q4 GE 4).
+ COMPUTE X3 = 1.
ELSE.
+ COMPUTE X3 = 0.
END IF.

DO IF (gender = 'f').
+ COMPUTE scoreA = 2 * q1 + q2.
ELSE.
+ COMPUTE scoreA = 3 * q1 + q2.
END IF.

DO IF (workshop EQ 1 AND q4 GE 5).
+ COMPUTE scoreB = 2 * q1 + q2.
ELSE.
+ COMPUTE scoreB = 3 * q1 + q2.
END IF.
EXECUTE.

GET FILE='mydata100.sav'.
DATASET NAME DataSet2 WINDOW=FRONT.

DO IF (posttest LT  60).
+ COMPUTE  postGroup = 1.
ELSE IF (posttest GE 60  AND posttest LT 70).
+ COMPUTE postGroup = 2.
ELSE IF (posttest GE 70 AND posttest LT 80).
+ COMPUTE postGroup = 3.
ELSE IF (posttest GE 80 AND posttest LT 90).
+ COMPUTE postGroup = 4.
ELSE IF (posttest GE 90).
+ COMPUTE postGroup = 5.
END IF.
EXECUTE.

DATASET ACTIVATE DataSet2.
FREQUENCIES VARIABLES=postGroup
/ORDER=ANALYSIS.

DATASET ACTIVATE DataSet1.
RANK VARIABLES=posttest (A)
/NTILES(5)
/PRINT=YES
/TIES=MEAN.

FREQUENCIES VARIABLES=Nposttes
/ORDER=ANALYSIS.

Stata

* Filename: TransformIF.do

use c:\myRfolder\mydata, clear
preserve
if q4 == 5 {
gen x1=1
}
else {
gen x1=0
}
if q4>=4 {
gen x2=1
}
else {
gen x2=0
}
if workshop == 1 & q4>=5 {
gen x3=1
}
else {
gen x3=0
}
if gender=="f" {
gen scoreA = 2*q1+q2
}
else {
gen scoreA = 3*q1+q2
}
if workshop==1 & q4>5 {
gen scoreB = 2*q1+q2
}
else {
gen scoreB = 3*q1+q2
}
restore

Transformations, Multiple Conditional

R

attach(mydata)
mydata

# Using the ifelse approach.
mydata$score1 <-
ifelse( gender == "f",
(2 * q1) + q2,  #1st for females.
(20* q1) + q2   #1st for males.
)
mydata$score2 <-
ifelse( gender == "f",
(3  * q1) + q2,  #2nd for females.
(30 * q1) + q2   #2nd for males.
)
mydata

# Using the subscript approach.
load(file = "mydata.RData")

# Create names in data frame.
detach(mydata)
mydata <-
data.frame(mydata,
score1 = NA,
score2 = NA)

attach(mydata)

# Find which are males and females.
gals <- which( gender == "f" )
guys <- which( gender == "m" )

# Now assign scores.
mydata[gals, "score1"] <-
2 * q1[gals] + q2[gals]
mydata[gals, "score2"] <-
3 * q1[gals] + q2[gals]
mydata[guys, "score1"] <-
20 * q1[guys] + q2[guys]
mydata[guys, "score2"] <-
30 * q1[guys] + q2[guys]

SAS

DATA myLib.mydata;
SET  myLib.mydata;

IF gender="f" THEN DO;
score1 = (2 * q1) + q2;
score2 = (3 * q1) + q2;
END;

ELSE IF gender="m" THEN DO;
score1 = (20 * q1) + q2;
score2 = (30 * q1) + q2;
END;

SPSS

GET FILE = 'mydata.sav'.

DO IF (gender EQ 'm').
+  COMPUTE score1 = (2*q1) + q2.
+  COMPUTE score2 = (3*q1) + q2.
ELSE IF (gender EQ 'f').
+  COMPUTE  score1 = (20*q1) + q2.
+  COMPUTE  score2 = (30*q1) + q2.
END IF.

Stata

* Filename: TransformIF2.do

use c:\myRfolder\mydata, clear
preserve
if gender=="m" {
gen score1 = (20*q1)+q2
gen score2 = (30*q1)+q2
}
else if gender=="f" {
gen score1 = (2*q1) + q2
gen score2 = (3*q1) + q2
}
restore

Transposing or Flipping Data Sets

R

# Filename: Transpose.R

setwd("c:/myRfolder")
load("mydata.RData")
mydata

myQs <- c("q1", "q2", "q3", "q4")
myQdf <- mydata[ ,myQs]
myQdf
myFlipped <- t(myQdf)
myFlipped
class(myFlipped)  # coerced into a matrix!
myFixed <- as.data.frame( t(myFlipped) )
myFixed

# Again, but with all the data
options(width = 60)
myFlipped <- t(mydata)
myFlipped

myFixed <- t(myFlipped)
myFixed
myFixed <- data.frame(myFixed)
str(myFixed)

myQs <- c("q1", "q2", "q3", "q4")
myFixed[ ,myQs] <-
lapply(myFixed[ ,myQs], as.numeric)
str(myFixed)

SAS

* Filename: Transpose.sas ;

LIBNAME myLib 'C:\myRfolder';

PROC TRANSPOSE DATA=myLib.mydata OUT=mycopy;
PROC PRINT; RUN;

PROC TRANSPOSE DATA=mycopy OUT=myFixed;
PROC PRINT; RUN;

SPSS

* Filename: Transpose.sps.

CD 'C:\myRfolder'.
GET FILE='mydata.sav'.
DATASET NAME DataSet1 WINDOW=FRONT.

FLIP VARIABLES=id workshop gender q1 q2 q3 q4.

FLIP VARIABLES=var001 var002 var003 var004
var005 var006 var007 var008
/NEWNAMES=CASE_LBL.

Z-Scores, Calculating

R

# Applying Z Transformation.

myZs <- apply(mymatrix, 2, scale)
myZs

SAS

* Get Z Scores;
 PROC STANDARD DATA=mylib.mydata
MEAN=0 STD=1 out=myZs;
RUN;
 PROC PRINT;
RUN;

SPSS

DESCRIPTIVES VARIABLES=q1 q2 q3 q4
/SAVE
/STATISTICS=MEAN STDDEV MIN MAX.

3 Responses to Data Management

  1. Mark says:

    outstanding examples. very helpful. I wish Stata’s codes were not missed in several examples.

  2. Anders Alexandersson says:

    I agree with Mark.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s