Below is a comparison of the commands used for common data management tasks in R, SAS, SPSS and Stata. The variables gender and workshop are categorical factors and q1 to q4, pretest and posttest are considered continuous and normally distributed.
The practice data set is shown here. The programs and the data they use are also available for download here. Examples that are missing for Stata reflect the differences between the two books. We will get around to those when we write the second edition of R for Stata Users.
Aggregating or Summarizing or Collapsing Data Sets
R
setwd("c:/myRfolder") load(file = "mydata.RData") attach(mydata) # The aggregate Function. # Means by gender. myAgg1 <- aggregate(q1, by = data.frame(gender), mean, na.rm = TRUE) myAgg1 # Now by workshop and gender. myAgg2 <- aggregate(q1, by = data.frame(workshop, gender), mean, na.rm=TRUE) myAgg2 mode(myAgg2) class(myAgg2) # Aggregation with tapply. myAgg2 <- tapply(q1, data.frame(workshop, gender), mean, na.rm = TRUE) myAgg2 class(myAgg2) mode(myAgg2) myAgg2 <- tapply(q1, data.frame(workshop, gender), range, na.rm = TRUE) myAgg2 mode(myAgg1) class(myAgg2) myAgg2[[1]] # Example multi-level transformation. mydata$Zq1 <- (q1 - mean(q1) ) / sd(q1) mydata mySubset <- mydata[ q1 < mean(q1), ] mySubset # Rename x to be mean.q1. library("plyr") myAgg3 <- rename(myAgg2, c(x = "mean.q1") ) myAgg3 # Now merge means back with mydata. mydata2 <- merge(mydata, myAgg3, by=c("workshop", "gender") ) mydata2 # Tables of Counts table(workshop) table(gender, workshop) myCounts <- table(gender, workshop) mode(myCounts) class(myCounts) # Counts in Summary or # Aggregate style. myCountsDF <- as.data.frame(myCounts) myCountsDF class(myCountsDF)
SAS
* Get means of q1 for each gender; PROC SUMMARY DATA=myLib.mydata MEAN NWAY; CLASS GENDER; VAR q1; OUTPUT OUT=myLib.myAgg; RUN; PROC PRINT; RUN; DATA myLib.myAgg; SET myLib.myAgg; WHERE _STAT_='MEAN'’MEAN’; KEEP gender q1; RUN; PROC PRINT; RUN; *Get means of q1 by workshop and gender; PROC SUMMARY DATA=myLib.mydata MEAN NWAY; CLASS WORKSHOP GENDER; VAR Q1; OUTPUT OUT=myLib.myAgg;RUN; PROC PRINT; RUN; *Strip out just the mean and matching variables; DATA myLib.myAgg; SET myLib.myAgg; WHERE _STAT_='MEAN'; KEEP workshop gender q1; RENAME q1=meanQ1; RUN; PROC PRINT; RUN; *Now merge aggregated data back into mydata; PROC SORT DATA=myLib.mydata; BY workshop gender; RUN: PROC SORT DATA=myLib.myAgg; BY workshop gender; RUN: DATA myLib.mydata2; MERGE myLib.mydata myLib.myAgg; BY workshop gender; PROC PRINT; RUN;
SPSS
GET FILE='mydata.sav'. AGGREGATE /OUTFILE='myAgg.sav' /BREAK=gender /q1_mean = MEAN(q1). GET FILE='myAgg.sav'. LIST. * Get mean of q1 by workshop and gender. GET FILE='mydata.sav'. AGGREGATE /OUTFILE='myAgg.sav'. /BREAK=workshop gender /q1_mean = MEAN(q1). GET FILE='myAgg.sav'. LIST. * Merge aggregated data back into mydata. * This step can be saved * by using * MODE=ADDVARIABLES * in the previous step. GET FILE='mydata.sav'. SORT CASES BY workshop (A) gender (A) . MATCH FILES /FILE=* /TABLE='myAgg.sav' /BY workshop gender. SAVE OUTFILE='mydata.sav'.
Stata
* Filename: Collapse.do use c:myRfoldermydata * Get means of q1 for each gender tabstat q1, by(gender) * Get means of q1 by workshop and gender; tabulate outwork gender, summarize(q1) means * Strip out just the mean and matching variables; collapse (mean) workshop gender, by(q1) * Merge aggregated data back into mydata. * mydata assumed to be sorted by q1 sort q1 merge q1 using mydata
Applying Functions
The other packages have functions that go across the values of an observation, and procedures or commands that go down through observations. R has only commands, and which way they go can be changed by how they are “applied.”
R
load(file = "mydata.RData") mydata attach(mydata) # Mean of the q variables mean(mydata[3:6], na.rm = TRUE) # Create mymatrix. mymatrix <- as.matrix( mydata[ ,3:6] ) mymatrix # Get mean of whole matrix. mean(mymatrix, na.rm = TRUE) # Get mean of matrix columns. apply(mymatrix, 2, mean, na.rm = TRUE) # Get mean of matrix rows. apply(mymatrix, 1, mean, na.rm = TRUE) rowMeans(mymatrix, na.rm = TRUE) # Add row means to mydata. mydata$meanQ <- apply(mymatrix, 1, mean, na.rm = TRUE) mydata$meanQ <- rowMeans(mymatrix, na.rm = TRUE) mydata <- transform(mydata, meanQ = rowMeans(mymatrix, na.rm = TRUE) ) mydata # Means of data frames and their vectors. lapply(mydata[ ,3:6], mean, na.rm = TRUE) sapply(mydata[ ,3:6], mean, na.rm = TRUE) mean( sapply(mydata[ ,3:6], mean, na.rm = TRUE) ) # Length of data frames and their vectors. length(mydata[ ,"q3"] ) nrow(mydata) is.na( mydata[ ,"q3"] ) !is.na( mydata[ ,"q3"] ) sum( !is.na( mydata[ ,"q3"] ) ) # Like the SAS/SPSS n from stat procedures. library("prettyR") sapply(mydata, valid.n) apply(myMatrix, 1, valid.n) mydata$myQn <- apply(myMatrix, 1, valid.n) mydata
SAS
LIBNAME myLib 'C:myRfolder'; DATA myLib.mydata; SET myLib.mydata; myMean = MEAN(OF q1-q4); myN = N(OF q1-q4); RUN; PROC MEANS; VAR q1-q4 myMean myN; RUN;
SPSS
GET FILE='mydata.sav'. * Functions work for each observation (row). COMPUTE myMean = Mean(q1 TO q4). COMPUTE mySum = Sum(q1 TO q4). COMPUTE myN = mySum / myMean.
Stata
* Filename: FunctionsCommands.do use c:myRfoldermydata, clear preserve * Some statistical functions egen mymean = rowmean(q1-q4) egen mysum = rowtotal(q1-q4) gen myn = mysum/mymean restore * A statistical command summary
By or Split-File Processing
R
setwd("c:/myRfolder") load(file = "mydata.RData") attach(mydata) # Get means of q variables for all observations. mean( mydata[ c("q1", "q2", "q3", "q4") ] , na.rm = TRUE) # Now get means by gender. myBYout <- by( mydata[ c("q1", "q2", "q3", "q4") ] , mydata["gender"], mean,na.rm = TRUE) myBYout mode(myBYout) class(myBYout) myBYdata <- as.data.frame( (as.table(myBYout) ) ) myBYdata # Get range by workshop and gender myVars <- c("q1", "q2", "q3", "q4") myBys <- mydata[ c("workshop", "gender") ] myBYout <- by( mydata[myVars], myBys, range, na.rm = TRUE ) # Converting output to data frame. # A data frame the long way. myBYdata <- data.frame( rbind(myBYout[[1]], myBYout[[2]], myBYout[[3]], myBYout[[4]]) ) # A data frame using do.call. myBYdata <- data.frame( do.call( rbind, myBYout) ) myBYdata
SAS
PROC MEANS DATA=myLib.mydata; RUN; PROC SORT DATA=myLib.mydata; BY gender; RUN; PROC MEANS DATA=myLib.mydata; BY gender; RUN; PROC SORT DATA=myLib.mydata; BY workshop gender; RUN; PROC MEANS DATA=myLib.mydata; BY workshop gender; RUN;
SPSS
GET FILE='mydata.sav'. DESCRIPTIVES VARIABLES=q1 q2 q3 q4 /STATISTICS=MEAN STDDEV MIN MAX. SORT CASES BY gender . SPLIT FILE SEPARATE BY gender . DESCRIPTIVES VARIABLES=q1 q2 q3 q4 /STATISTICS=MEAN STDDEV MIN MAX. SORT CASES BY workshop gender. SPLIT FILE SEPARATE BY workshop gender. DESCRIPTIVES VARIABLES=q1 q2 q3 q4 /STATISTICS=MEAN STDDEV MIN MAX.
Stata
use c:myRfoldermydata, clear sort gender by gender: sum gender tabstat q1 q2 q3 q4, stat(mean sd min max) tabstat q1 q2 q3 q4, stat(mean sd min max) by(gender)
Character String Manipulations
These examples use a different practice data set named “giants.txt”:
R.A. Fisher 1890-02-17 1962-07-29 Carl Pearson 1857-03-27 1936-04-27 Gertrude Cox 1900-01-13 1978-10-17 John Tukey 1915-06-16 2000-07-26 William Gosset 1876-06-13 1937-10-16
R
# Filename: CharacterStrings.R gender <-c("m", "f", "m", NA, "m", "f", "m", "f") gender options(width = 58) letters LETTERS library("stringr") myVars <- str_c("Var", LETTERS[1:6]) myVars setwd("c:/myRfolder") giants <-read.fwf( file = "giants.txt", width = c(15, 11, 11), col.names = c("name", "born", "died"), colClasses = c("character", "character", "POSIXct") ) giants str_length( giants$name ) giants[ giants$name == "R.A. Fisher", ] giants[ giants$name == "R.A. Fisher ", ] giants$name <- str_trim(giants$name) attach(giants) str_length(name) toupper(name) tolower(name) library("ggplot2") firstUpper( tolower(name) ) str_sub(name, 1, 5) myNamesMatrix <- str_split_fixed(name, " ", 2) myNamesMatrix myFirst <- myNamesMatrix[ ,1] myFirst myLast <- myNamesMatrix[ ,2] myLast myFirst <- str_replace_all(myFirst, "R.A.", "Ronald A.") myLastFirst <- str_c( myLast, ", ", myFirst) myLastFirst myObs <- myLast == "Tukey" myObs myObs <- which(myLast == "Tukey") myObs giants[ myObs, ] myObs <- str_detect(myLast, "key") myObs myTable <- c("Box", "Bayes", "Fisher", "Tukey") myObs <- myLast %in% myTable myObs name[ myObs ] myObs <- str_detect( myLast, "Box|Bayes|Fish|key" ) myObs name[ myObs ] myAthruM <- str_detect(myLastFirst, "^[A-M]") myAthruM name[ myAthruM ] name[!myAthruM ]
SAS
* Filename: CharacterStrings.sas LIBNAME MYlIB 'C:myRfolder'; DATA myLib.giants; INFILE 'myRfolder/giants.txt' MISSOVER DSD LRECL=32767; INPUT name $char14. @16 born mmddyy10. @27 died yymmdd10.; FORMAT born mmddyy10. died yymmdd10.; myVarLength=length(name); born=strip(born); * Not needed in SAS; PROC PRINT; RUN; DATA myLib.giants; SET myLib.giants; myLower= lowcase(name); myUpper= upcase(name); myProper=propcase(name); PROC PRINT; RUN; DATA myLib.giants; SET myLib.giants; myFirst5=substr(name, 1, 5); * split names using substr; myBlank=find(name, " "); myFirst=strip( substr(name, 1, myBlank) ); myLast =strip( substr(name, myBlank) ); PUT "Using substr... " myFirst= myLast=; * splip names using scan; myFirst=scan(name,1," "); myLast =scan(name,2," "); myFirst=tranwrd(myFirst,"R.A.","Ronald A."); LENGTH myLastFirst $ 17; myLastFirst= strip(myLast) || ", " || strip(myFirst); *or: CALL CATX(", ", myLastFirst, myLast, myFirst); PROC PRINT; VAR name myFirst myLast myLastFirst; RUN; DATA tukey; SET myLib.giants; WHERE myLast="Tukey"; PROC PRINT; VAR name; RUN; DATA tukey; SET myLib.giants; WHERE FIND(myLast, "key"); PROC PRINT; VAR name; RUN; DATA mySubset; SET myLib.giants; WHERE myLast IN ("Box","Bayes","Fisher","Tukey"); RUN; PROC PRINT; VAR name; RUN; DATA FishOrKey; SET myLib.giants; IF FIND(myLast, "Box") | FIND(myLast, "Bayes") | FIND(myLast, "Fish") | FIND(myLast, "key") ; RUN; PROC PRINT; VAR name; RUN; DATA AthruM; SET myLib.giants; firstLetter=substr(myLast, 1, 1); IF "A" <= firstLetter <= "M"; RUN; PROC PRINT; VAR name; RUN;
SPSS
* Filename: CharacterStrings.sps CD 'C:myRfolder'. DATA LIST FILE='giants.txt' RECORDS=1 /1 name 1-14 (A) born 16-26 (ADATE) died 27-37 (SDATE). STRING myFirst5 (A5)/ myLower myUpper myLastFirst (A17) myFirst myLast (A9). COMPUTE myLength1=LENGTH(name). COMPUTE name=RTRIM(name). COMPUTE myLength2=LENGTH(name). COMPUTE myLower=LOWER(name). COMPUTE myUpper=UPCASE(name). LIST name myLower myUpper myLength1 myLength2. COMPUTE myFirst5=SUBSTR(name, 1, 5). COMPUTE myBlank= INDEX(name, " "). COMPUTE myFirst=SUBSTR(name, 1, myBlank-1). COMPUTE myFirst=REPLACE(myFirst, "R.A.", "Ronald A."). COMPUTE myLast=SUBSTR(name, myBlank+1). COMPUTE myLastFirst=CONCAT( RTRIM(myLast), ", ", RTRIM(myFirst) ). LIST name myFirst myLast myLastFirst. TEMPORARY. SELECT IF (myLast EQ "Tukey"). LIST name. TEMPORARY. SELECT IF (CHAR.RINDEX(myLast,"Fish") GE 1 OR CHAR.RINDEX(myLast,"key") GE 1). LIST name. TEMPORARY. SELECT IF ( myLast EQ "Box" OR myLast EQ "Bayes" OR myLast EQ "Fisher" OR myLast EQ "Tukey"). LIST name. TEMPORARY. SELECT IF ( name EQ "Box" OR name EQ "Bayes" OR CHAR.RINDEX(myLast,"Fish") GE 1 OR CHAR.RINDEX(myLast,"key") GE 1). LIST name. TEMPORARY. SELECT IF(SUBSTR(myLast, 1, 1) LE "M"). LIST name.
Date and Time Manipulations
These examples use a different dataset named “giants.txt”:
R.A. Fisher 1890-02-17 1962-07-29 Carl Pearson 1857-03-27 1936-04-27 Gertrude Cox 1900-01-13 1978-10-17 John Tukey 1915-06-16 2000-07-26 William Gosset 1876-06-13 1937-10-16
R
setwd("c:/myRfolder") giants <- read.fwf( file = "giants.txt", width = c(15,11,11), col.names = c("name", "born", "died"), colClasses = c("character", "character", "POSIXct"), row.names = "name", strip.white = TRUE, ) giants class(giants$born) # A character vector. library("lubridate") giants$born <- mdy(giants$born) giants class(giants$born) class(giants$died) giants # They display in yyyy-mm-dd by default. attach(giants) unclass( born ) as.numeric( born ) as.POSIXct( c(-2520460800,-3558556800,-2207952000, -1721347200, -2952201600), origin="1960-01-01", tz="UTC" ) #---Calculating Durations--- age <- difftime(died, born, units="secs") age age <- difftime(died, born) age # now we have age in days mode( age ) class( age ) # It's a difftime object as.period(age) age/365.2425 # Age in years. giants$age <- round( as.numeric( age/365.2425 ), 2 ) giants now() # Current date-time. difftime( now(), died ) / 365.2425 # Again, using subtraction. age <- died - born # Age in days. age age / 365.2425 # Age in years, mislabeled. mode(age) class( age ) # it's an interval object. names( age ) sapply( age, unclass ) # Not a helpful age to store in our data frame! #---Adding Durations to Date-Times--- age <- as.duration( c(2286057600,2495664000,2485382400, 2685916800,1935705600) ) class(age) born + age died #---Accessing Date-Time Elements--- year(born) month(born) month(born, label = TRUE) day(born) # Day of month. wday(born) # Day of week. wday(born, label = TRUE, abbr = FALSE) yday(born) #---Creating Date-Times from Elements--- myYear <- year(died) myMonth <- month(died) myDay <- day(died) myDateString <- paste(myYear, myMonth, myDay, sep="/") myDateString died2 <- ymd(myDateString) died2 #---Logical Comparisons with Date-Times--- giants[ born > mdy("1/1/1900") , ] #---SAS Picture Format Example--- myDateText <- format(born, "%B %d, %Y is day %j of %Y") myDateText myDateText <- format(born, "was born on the %jth day of %Y") for (i in 1:5) cat(rownames(giants)[i], myDateText[i],"n") # Two-Digit Years my1969 <- mdy("08/31/69") my1969 my1968 <- mdy("08/31/68") my1968 my1968 <- my1968 - as.duration(100 * 365.2425 * 24 * 60 * 60) my1968 as.POSIXlt("08/31/68", format="%m/%d/%y") as.POSIXlt("08/31/69", format="%m/%d/%y") as.POSIXlt("08/31/69", format="%m/%d/%y")
SAS
LIBNAME MyLib 'C:myRfolder'; DATA myLib.giants; INFILE 'myRfoldergiants.txt' MISSOVER DSD LRECL=32767; INPUT name $char14. @16 born mmddyy10. @27 died mmddyy10.; PROC PRINT; RUN; PROC PRINT; FORMAT died born mmddyy10.; RUN; * Caculating Durations.; DATA myLib.giants; SET myLib.giants; age = (died-born) / 365.2425; longAgo = ( today() - died ) / 365.2425; RUN; PROC PRINT; FORMAT died born mmddyy10. age longAgo 5.2 ; RUN; * Adding Durations to Date-Times. DATA myLib.giants; SET myLib.giants; died=born+age; RUN; PROC PRINT; FORMAT died born mmddyy10. age 5.2; RUN; * Accessing Date-Time Elements; DATA myLib.giants; SET myLib.giants; myYear=YEAR(born); myMonth=MONTH(born); myDay =DAY(born); PROC PRINT; FORMAT died born mmddyy10. age 5.2; RUN; * Creating Date-Time Variables from Elements; DATA myLib.giants; set myLib.giants; born=MDY(myMonth, myDay, myYear); PROC PRINT; FORMAT died born mmddyy10. age 5.2; RUN; * Logical Comparisons with Date-Times; DATA Born1900s; set myLib.giants; if born > "01jan1900"d ; PROC PRINT; FORMAT died born mmddyy10. age 5.2; RUN; * Formatting Date-Time Output; PROC FORMAT; PICTURE myFormatI LOW - HIGH = '%B %d, %Y is day %j of %Y' (DATATYPE=DATE); RUN; PROC PRINT DATA=myLib.giants; VAR born; FORMAT born myFormatI40.; RUN; PROC FORMAT; PICTURE myFormatII LOW - HIGH = ' was born on the %jth day of %Y' (DATATYPE=DATE); RUN; DATA _NULL_; SET myLib.giants; PUT name $char14. born myFormatII34.; run;
SPSS
CD 'C:myRfolder'. DATA LIST FILE='giants.txt' RECORDS=1 /1 name 1-14 (A) born 16-26 (ADATE) died 27-37 (SDATE). * Calculating Durations. COMPUTE age=died-born. LIST. COMPUTE age=(died-born) / (365.2425*24*60*60). LIST. COMPUTE longAgo=($TIME-died) /(365.2425*24*60*60) . LIST. * Adding Durations to Date-Times. COMPUTE died=born+age. LIST. * Accessing Date-Time Elements. COMPUTE myYear=XDATE.YEAR(born). COMPUTE myMonth=XDATE.MONTH(born). COMPUTE myDay=XDATE.MDAY(born). LIST name born myYear myMonth myDay. * Creating Date-Time Variables from Elements. COMPUTE born=DATE.MDY(myMonth, myDay, myYear). LIST name born. * Logical Comparisons with Date-Times. TEMPORARY. SELECT IF born GE date.mdy(1,1,1900). LIST name born. * Formatting Date-Time Output. PRINT /born (adate) ' is the' myDay (F3.0) 'th day of ' myYear (F4.0). EXECUTE. PRINT /name 'was born on the' myDay (F3.0) 'th day of ' myYear (F4.0). EXECUTE.
Duplicate Observations: Finding and Deleting
R
load("mydata.RData") mydata # Create some duplicates. myDuplicates <- rbind(mydata, mydata[1:2, ]) myDuplicates # Get rid of duplicates without seeing them. myNoDuplicates <- unique(myDuplicates) myNoDuplicates # This checks for location of duplicates # before getting rid of them. myDuplicates <- rbind(mydata, mydata[1:2, ]) myDuplicates myDuplicates$DupRecs <- duplicated(myDuplicates) myDuplicates # Print a report of just the duplicate records. attach(myDuplicates) myDuplicates[DupRecs, ] # Remove duplicates and Duplicated variable. myNoDuplicates <- myDuplicates[!DupRecs, -7 ] myNoDuplicates # Locate records with duplicate keys. myKeys <- c("workshop", "gender") mydata$DupKeys <- duplicated(mydata[ ,myKeys]) mydata
SAS
DATA mycopy; SET myLib.mydata; Data lastTwo; SET myLib.mydata; IF ID GE 7; RUN; DATA Duplicates; SET mycopy lastTwo; PROC PRINT; RUN; PROC SORT NODUPREC DATA=Duplicates; BY id workshop gender q1-q4; RUN; PROC PRINT; RUN; PROC SORT NODUPKEY EQUALS DATA=mycopy; BY workshop gender; RUN; PROC PRINT DATA=mycopy; RUN;
SPSS
GET FILE='mydata.sav'. * Identify Duplicate Cases. SORT CASES BY workshop(A) gender(A) q2(A) q1(A) q3(A) q4(A) . MATCH FILES /FILE = * /BY workshop gender q2 q1 q3 q4 /FIRST = PrimaryFirst /LAST = PrimaryLast. DO IF (PrimaryFirst). + COMPUTE MatchSequence = 1 - PrimaryLast. ELSE. + COMPUTE MatchSequence = MatchSequence + 1. END IF. LEAVE MatchSequence. FORMAT MatchSequence (f7). COMPUTE InDupGrp = MatchSequence > 0. SORT CASES InDupGrp(D). MATCH FILES /FILE = * /DROP = PrimaryFirst InDupGrp MatchSequence. VARIABLE LABELS PrimaryLast 'Indicator of each last matching case as Primary'. VALUE LABELS PrimaryLast 0 'Duplicate ‘Duplicate Case' Case’ 1 'Primary ‘Primary Case'Case’. VARIABLE LEVEL PrimaryLast (ORDINAL). FREQUENCIES VARIABLES = PrimaryLast.
Stata
use c:myRfoldermydata, clear duplicates drop id workshop gender q1-q4, force list
First / Last Observations per Group, Finding
R
setwd("c:/myRfolder") load(file = "mydata.RData") mydata$id <- row.names(mydata) mydata myBys <- data.frame(mydata$workshop, mydata$gender) mylastList <- by(mydata, myBys, tail, n = 1) mylastList # Back into a data frame: mylastDF <- do.call(rbind, mylastList) mylastDF # Another way to create the data frame: mylastDF <- rbind(mylastList[[1]], mylastList[[2]], mylastList[[3]], mylastList[[4]]) mylastDF # Generating just an indicator variable mylastDF$lastGender <- rep(1, nrow(mylastDF) ) mylastDF mylastDF2 <- mylastDF[ c("id", "lastGender") ] mydata2 <- merge(mydata, mylastDF2, by = "id", all = TRUE ) mydata2 mydata2$lastGender[ is.na(mydata2$lastGender) ] <- 0 mydata2
SAS
PROC SORT DATA=sasuser.mydata; BY workshop gender; RUN; DATA sasuser.mylast; SET sasuser.mydata; BY workshop gender; IF last.gender; RUN; PROC PRINT; RUN;
SPSS
* Match files method. GET FILE='mydata.sav'. SORT CASES BY workshop gender. MATCH FILES FILE=* /By workshop gender /LAST=lastgender. SELECT IF lastgender. LIST. SAVE OUTFILE='mylast.sav'. * Aggregation method. SORT CASES BY workshop gender. AGGREGATE /OUTFILE='C:mylast.sav' /BREAK workshop gender /q1 = LAST(q1) /q2 = LAST(q2) /q3 = LAST(q3) /q4 = LAST(q4). * Using LIST here would display original file. GET FILE='mylast.sav'. DATASET NAME DataSet5 WINDOW=FRONT. LIST.
Stata
use c:myRfoldermydata, clear sort workshop gender by workshop gender: keep if _n==_N list
Indicator or Dummy Variables
R will create these automatically if you use a factor as a regression predictor.
R
load("mydata100.RData") attach(mydata100) r <- as.numeric(workshop == "R") sas <- as.numeric(workshop == "SAS" ) spss <- as.numeric(workshop == "SPSS" ) stata <- as.numeric(workshop == "Stata") lm(posttest ~ pretest + sas + spss + stata) # Same result without them: lm(posttest ~ pretest + workshop)
SAS
DATA temp; SET myLib.mydata100; r = workshop = 1; sas = workshop = 2; spss = workshop = 3; stata = workshop = 4; RUN; PROC REG; MODEL posttest = pretest sas spss stata; RUN;
SPSS
GET FILE='mydata100.sav'. DATASET NAME DataSet2 WINDOW=FRONT. COMPUTE r = workshop EQ 1. COMPUTE sas = workshop EQ 2. COMPUTE spss = workshop EQ 3. COMPUTE stata = workshop EQ 4. EXECUTE. REGRESSION /DEPENDENT posttest /METHOD=ENTER pretest sas spss stata. EXECUTE.
Joining or Merging Data Sets
R
# Read data keeping ID as a variable. mydata <- read.table("mydata.csv", header = TRUE, sep = ",", na.strings = " ") # Create a data frame keeping # the left two q variables. myleft <- mydata[ c("id", "workshop", "gender", "q1", "q2") ] # Create a data frame keeping the right two q variables. myright <- mydata[ c("id", "workshop", "q3", "q4") ] # Merge the two data frames # by ID. both <- merge(myleft, myright, by = "id") # Merge the two data frames # by ID allowing different # ID names in each data set. both <- merge(myleft, myright, by.x = "id", by.y = "id" ) # Merge data frames # by both ID and workshop. both <- merge(myleft, myright, by = c("id","workshop")) #Merge dataframes by both ID # and workshop, while allowing # them to have different names. both <- merge(myleft, myright, by.x=c("id", "workshop"), by.y=c("id", "workshop") )
SAS
DATA myLib.myleft; SET mylib.mydata; KEEP id workshop gender q1 q2; PROC SORT; BY id workshop; DATA myLib.myright; SET myLib.mydata; KEEP id workshop q3 q4; PROC SORT; BY id workshop; DATA myLib.both; MERGE myLib.myleft myLib.myright; BY id workshop; RUN;
SPSS
GET FILE='mydata.sav'. DELETE VARIABLES q3 to q4. SAVE OUTFILE='myleft.sav'. GET FILE='mydata.sav'. DELETE VARIABLES gender, q1 to q2. SAVE OUTFILE='myright.sav'. GET FILE='myleft.sav'. MATCH FILES /FILE=* /FILE='myright.sav' /BY id.
Stata
Filename: Merge.do use c:myRfoldermydata, clear drop q3 q4 save c:myRfoldermyleft use c:myRfoldermydata, clear drop workshop-q2 save c:myRfoldermyright use c:myRfoldermyleft, clear sort id merge id using c:myRfoldermyright save c:myRfolderboth, replace
Keeping or Dropping Variables
R
# Keep using variable selection. myleft <- mydata[ ,1:4] myleft # Drop using NULL. myleft <- mydata myleft$q3 <- myleft$q4 <- NULL
SAS
* Keeping vars; DATA myleft; SET mydata; KEEP id workshop gender q1 q2; *Equivalently dropping; DATA myleft; SET mydata; DROP q3 q4;
SPSS
GET FILE='mydata.sav'. DELETE VARIABLES q3 to q4.
Stata
use c:myRfoldermydata, clear keep id workshop gender q1 q2 * or equivalently; * drop q3 q4 save c:myRfoldermyleft, replace
Missing Values
These examples use a different version of mydata which includes periods, 9s and 99s as missing value codes. Note that periods in R are not read automatically as missing values! Here is mydataNA:
workshop gender q1 q2 q3 q4
1 1 f 1 1 5 1
2 2 f 2 1 4 99
3 . f 9 2 4 3
4 2 . 3 9 99 3
5 1 m 4 5 2 4
6 . m 9 9 5 5
7 1 . 5 3 99 4
8 2 m 4 5 5 99
R
setwd("c:/myRfolder") mydataNA <- read.table("mydataNA.txt") mydataNA # Read it so that ".", 9, 99 are missing. mydataNA <- read.table("mydataNA.txt", na.strings = c(".", "9", "99") ) mydataNA # Convert 9 and 99 manually mydataNA <- read.table("mydataNA.txt", na.strings=".") mydataNA[mydataNA == 9 | mydataNA == 99] <- NA mydataNA # Substitute the mean for missing values. mydataNA$q1[is.na(mydataNA$q1)] <- mean(mydataNA$q1, na.rm = TRUE) mydataNA # Eliminate observations with any NAs. myNoMissing <- na.omit(mydataNA) myNoMissing # Test to see if each case is complete. complete.cases(mydataNA) # Use that result to select complete cases. myNoMissing <- mydataNA[ complete.cases(mydataNA), ] myNoMissing # Use that result to select incomplete cases. myIncomplete <- mydataNA[ !complete.cases(mydataNA), ] myIncomplete # When "99" Has Meaning... mydataNA <- read.table("mydataNA.txt", na.strings = ".") mydataNA attach(mydataNA) # Assign missing values for q variables. mydataNA$q1[q1 == 9 ] <- NA mydataNA$q2[q2 == 9 ] <- NA mydataNA$q3[q3 == 99] <- NA mydataNA$q4[q4 == 99] <- NA mydataNA detach(mydataNA) # Read file again, this time use functions. mydataNA <- read.table("mydataNA.txt", na.strings = ".") mydataNA attach(mydataNA) #Create a functions that replaces 9, 99 with NAs. my9is <- function(x) { x[x == 9 ] <- NA; x} my99isNA <- function(x) { x[x == 99] <- NA; x} # Now apply our functions to the data frame using lapply. mydataNA[3:4] <- lapply( mydataNA[3:4], my9isNA ) mydataNA[5:6] <- lapply( mydataNA[5:6], my99isNA ) mydataNA
SAS
DATA myLib.mydata; SET myLib.mydata; *Convert 9 to missing, one at a time. IF q1=9 THEN q1=.; IF q2=9 THEN q2=.; IF q3=99 THEN q2=.; IF q4=99 THEN q4=.; * Same thing but is quicker for lots of vars; ARRAY q9 q1-q2; DO OVER q9; IF q9=9 THEN q=.; END; ARRAY q99 q3-q4; DO OVER q99; IF q=99 THEN q99=.; END;
SPSS
CD 'C:myRfolder'. GET FILE=('mydata.sav'). MISSING q1 TO q2 (9) q3 TO q4 (99). SAVE OUTFILE='mydata.sav'.
Stata
* Filename: MissingValues.do use c:myRfoldermydata, clear preserve replace q1=. if q1==9 replace q2=. if q2==9 replace q3=. if q3==99 replace q4=. if q4==99 * Same thing but is quicker for lots of vars restore forvalues i = 1/2 { replace q`i'=. if q`i'==9 } forvalues i = 3/4 { replace q`i'=. if q`i'==90 } restore
Ranks, Calculating
R
# Applies to vars 3 through 6, # the vars q1 to q4. myvars <- c("q1","q2","q3","q4") myRanks <- sapply(mydata[myvars], rank)
SAS
PROC RANK DATA=mylib.mydata OUT=myRanks; VAR Q1-Q4; PROC PRINT;
SPSS
RANK VARIABLES=q1 q2 q3 q4 (A) /RANK /PRINT=YES /TIES=MEAN.
Recoding Variables
R
load(file = "myWorkspace.RData") mydata attach(mydata) library("car") mydata$qr1 <- recode(q1, "1=2; 5=4") mydata$qr2 <- recode(q2, "1=2; 5=4") mydata$qr3 <- recode(q3, "1=2; 5=4") mydata$qr4 <- recode(q4, "1=2; 5=4") detach(mydata) # This approach works on thousands. load(file = "mydata.RData") attach(mydata) # Generate two sets of var names # so we can save original vars. # QR vars will be the recoded ones. myQnames <- paste( "q", 1:4, sep = "") myQRnames <- paste( "qr", 1:4, sep = "") # Extract the q variables to a separate data frame. myQRvars <- mydata[ ,myQnames] # Rename all of the variables. names(myQRvars) <- myQRnames myQRvars # Create a function to apply the labels to lots of variables. myRecoder <- function(x) { recode(x,"1=2; 5=4") } # Here's how to use the function # on one variable. myRecoder(myQRvars$qr1) #Apply it to all of the variables. myQRvars <- sapply( myQRvars, myRecoder) myQRvars # Save it back to mydata # if you want. mydata <- cbind(mydata,myQRvars)
SAS
DATA myLib.mydata; INFILE "mydata.csv" delimiter = "," MISSOVER DSD LRECL=32767 firstobs=2; INPUT id workshop gender $ q1 q2 q3 q4; PROC PRINT; RUN; PROC FORMAT; VALUE Agreement 1="Disagree" 2="Disagree" 3="Neutral" 4="Agree" 5="Agree"; run; DATA myLib.mydata; SET myLib.mydata; ARRAY q q1-q4; ARRAY qr qr1-qr4; *r for recoded; DO i=1 to 4; qr{i}=q{i}; if q{i}=1 then qr{i}=2; else if q{i}=5 then qr{i}=4; END; FORMAT q1-q4 qr1-qr4 Agreement.; RUN; * This will use the recoded formats automatically; PROC FREQ; TABLES q1-q4; RUN; * This will ignore the formats; * Note high/low values are 1/5; PROC UNIVARIATE; VAR q1-q4; RUN; * This will use the 1-3 codings, not a good idea!; * High/Low values are now 2/4; PROC UNIVARIATE; VAR qr1-qr4; RUN;
SPSS
GET FILE='mydata.sav'. RECODE q1 to q4 (1=2) (5=4). SAVE OUTFILE='myleft.sav'.
Stata
* Filename: Recode.do use c:myRfoldermydata, clear recode q1-q4 (1=2) (5=4)
Renaming Variables
R
library("plyr") # Use old="new" format. myChanges <- c(q1 = "x1", q2 = "x2", q3 = "x3", q4 = "x4") mydata <- rename(mydata, myChanges) # The standard R approach is # more complex when renaming just # a few variables: names(mydata) <- c("workshop", "gender", "x1", "x2", "x3", "x4") # See books for many more examples.
SAS
DATA myLib.mydata; RENAME q1-q4=x1-x4; *or; RENAME q1=x1 q2=x2 q3=x3 q4=x4;
SPSS
GET FILE='mydata.sav'. RENAME VARIABLES (q1=x1)(q2=x2)(q3=x3)(q4=x4). * Or... RENAME VARIABLES (q1 q2 q3 q4 = x1 x2 x3 x4). * Or... RENAME VARIABLES (q1 TO q4 = x1 TO x4).
Stata
* Filename: Rename.do use c:myRfoldermydata, clear preserve rename q1 x1 rename q2 x2 rename q3 x3 rename q4 x4 * or restore forvalues i = 1/4 { rename q`i' x`i' }
Reshaping Variables to Observations and Back
R
# Filename: Reshape.R setwd("c:/myRfolder") load("mydata.RData") library("reshape2") myChanges <- c( q1 = "time1", q2 = "time2", q3 = "time3", q4 = "time4") mydata <- rename(mydata, myChanges) mydata$subject <- factor(1:8) mydata # Reshaping from wide to long library("reshape2") # Just a reminder. mylong <- melt(mydata) mylong # Again, specifying arguments mylong <- melt(mydata, id.vars = c("subject", "workshop", "gender"), measure.vars = c("time1", "time2", "time3", "time4"), value.name = "variable") mylong # Reshaping from long to wide mywide <- dcast(mylong, subject + workshop + gender ~ variable) mywide # ---Two Time Variables--- load("mydata.RData") mydata$subject <- factor(1:8) library("reshape2") myChanges <- c( q1 = "M1_L1", q2 = "M1_L2", q3 = "M2_L1", q4 = "M2_L2") mydata <- rename(mydata, myChanges) mydata library("reshape2") # Just a reminder. mylong2 <- melt(mydata) mylong2 # Same thing with arguments specified mylong2 <- melt(mydata, id.vars = c("subject", "workshop", "gender"), measure.vars = c("M1_L1", "M1_L2", "M2_L1", "M2_L2"), value.name = "value") mylong2 mylong2$method <- rep( c("M1", "M2"), each=16, times=1) mylong2$level <- rep( c("L1", "L2"), each=8, times=2) mylong2 mylong2$variable <- NULL # Reshape to wide mywide2 <- dcast(mylong2, gender + workshop + subject ~ method + level) mywide2 mywide2[ order(mywide2$subject), ] # Aggregation via reshape dcast(mylong2, gender + workshop ~ method + level, mean, na.rm = TRUE) dcast(mylong2, gender + workshop ~ method, mean, na.rm = TRUE) dcast(mylong2, workshop ~ ., mean, na.rm = TRUE)
SAS
* Filename: Reshape.sas ; LIBNAME myLib 'C:myRfolder'; * Wide to long; PROC TRANSPOSE DATA=mylib.mydata OUT=myLib.mylong; VAR q1-q4; BY id workshop gender; PROC PRINT; RUN; DATA mylib.mylong; SET mylib.mylong( rename=(COL1=value) ); time=INPUT( SUBSTR( _NAME_, 2) , 1.); DROP _NAME_; RUN; PROC PRINT; RUN; * Long to wide; PROC TRANSPOSE DATA=mylib.mylong OUT=myLib.mywide PREFIX=q; BY id workshop gender; ID time; VAR value; RUN; DATA mylib.mywide; SET mylib.mywide(DROP=_NAME_); RUN; PROC PRINT; RUN;
SPSS
* Filename: Reshape.sps . CD 'C:myRfolder'. GET FILE='mydata.sav'. * Wide to long. VARSTOCASES/MAKE Y FROM q1 q2 q3 q4 /INDEX = Question(4) /KEEP = id workshop gender /NULL = KEEP. LIST. SAVE OUTFILE='mywide.sav'. * Long to wide. GET FILE='mywide.sav'. CASESTOVARS /ID = id workshop gender /INDEX = Question /GROUPBY = VARIABLE. LIST. SAVE OUTFILE='mylong.sav'.
Stata
* Filename: Reshape.do use c:myRfoldermydata, clear reshape long q, i(subject) j(item) list reshape wide q, i(subject) j(item) list
Sorting Data Sets
R
setwd("c:/myRfolder") load(file = "mydata.RData") mydata # Show first four observations in order. mydata[ c(1, 2, 3, 4), ] # Show them in reverse order. mydata[ c(4, 3, 2, 1), ] # Create order variable for workshop. myW <- order( mydata$workshop ) myW mydata[ myW, ] # Create order variable for gender then workshop. myGW <- order( mydata$gender, mydata$workshop ) myGW mydata[ myGW, ] # Create order variable for # descending (-) workshop then gender myWdG <- order( -mydata$workshop, mydata$gender ) myWdG # Print data in WG order. mydata[ myWdG, ] # Save data in WdG order. mydataSorted <- mydata[ myWdG, ] mydataSorted
SAS
LIBNAME myLib 'myRfolder'; PROC SORT DATA = myLib.mydata; BY workshop; RUN; PROC PRINT DATA = myLib.mydata; RUN; PROC SORT DATA = myLib.mydata; BY gender workshop; RUN; PROC PRINT DATA = myLib.mydata; RUN; PROC SORT DATA = myLib.mydata; BY descending gender workshop; RUN; PROC PRINT DATA = myLib.mydata; RUN;
SPSS
CD 'myRfolder'. GET FILE = 'mydata.sav'. SORT CASES BY workshop (A). LIST. SORT CASES BY gender (A) workshop (A). LIST. SORT CASES BY gender (D) workshop (A). LIST.
Stata
use c:myRfoldermydata, clear * sort workshop ascending order sort workshop list * sort gender and workshop, both ascending order sort gender workshop list * sort workshop descending and gender in ascending order. gsort - workshop + gender list
Stacking or Concatenating or Adding Data Sets
R
load(file = "mydata.RData") mydata attach(mydata) # Create female data frame. females <- mydata[which(gender == "f"), ] # Create male data frame. males <- mydata[which(gender == "m"), ] # Bind their rows together # with the rbind function. both <- rbind(females, males) # Drop q2 to see what happens. males$q2 <- NULL males # See that row bind will # not work now. both <- rbind(females, males) # Use plyr rbind.fill instead. library("plyr") both <- rbind.fill(females, males) # Add a q2 variable to males. males <- data.frame(males, q2 = NA) # Now rbind can handle it. both <- rbind(females, males)
SAS
DATA males; SET mydata; WHERE gender='m'; RUN; DATA females; SET mydata; WHERE gender='f'; RUN; DATA both; SET males females; RUN; PROC PRINT; RUN;
SPSS
GET FILE='mydata.sav'. SELECT IF(gender = "f"). LIST. SAVE OUTFILE='females.sav'. EXECUTE . GET FILE='mydata.sav'. SELECT IF(gender = "m"). LIST. SAVE OUTFILE='males.sav'. EXECUTE . GET FILE='females.sav'. ADD FILES /FILE=* /FILE='males.sav'. LIST. EXECUTE.
Stata
use c:myRfoldermydata, clear preserve keep if gender=="m" save c:myRfoldermymale restore keep if gender=="f" save c:myRfoldermyfemale use c:myRfoldermymale, clear append using c:myRfoldermyfemale save c:myRfolderboth
Transformations
R
load(file = "mydata.RData") mydata # Transformation in the middle # of another function. summary( log(mydata$q4) ) # Using dollar notation. mydata$meanQ <- (mydata$q1 + mydata$q2 + mydata$q3 + mydata$q4) / 4 mydata # Creating meanQ using transform. mydata <- transform(mydata, meanQ=(q1 + q2 + q3 + q4) / 4 ) mydata
SAS
LIBNAME myLib 'C:myRfolder'; DATA myLib.mydataTransformed; SET myLib.mydata; totalq = (q1 + q2 + q3 + q4); logtot = log10(totalq); mean1 = (q1 + q2 + q3 + q4) / 4; mean2 = mean(of q1-q4); PROC PRINT; RUN;
SPSS
GET FILE='mydata.sav'. COMPUTE Totalq=q1+q2+q3+q4. COMPUTE Logtot=lg10(totalq). COMPUTE Mean1=(q1+q2+q3+q4)/4. COMPUTE Mean2=MEAN(q1 TO q4). SAVE OUTFILE='mydataTransformed.sav'. LIST.
Stata
* Filename: Transform.do use c:myRfoldermydata, clear gen totalq = q1+q2+q3+q4 gen logtotalq = log10(totalq) gen mean1 = totalq/4 egen mean2 = mean(q1-q4) save c:myRfoldermydataT
Transformations, Conditional
R
setwd("c:/myRfolder") load(file = "mydata.RData") mydata attach(mydata) mydata$q4Sagree <- ifelse(q4 == 5, 1, 0) mydata$q4Sagree mydata$q4Sagree <- as.numeric(q4 == 5 ) mydata$q4Sagree mydata$q4agree <- ifelse(q4 >= 4, 1, 0) mydata$q4agree mydata$ws1agree <- ifelse(workshop == 1 & q4 >=4 , 1, 0) mydata$ws1agree mydata$score <- ifelse(gender == "f", (2 * q1) + q2, (3 * q1) + q2 ) mydata # ---Cutting posttest--- detach(mydata) load("mydata100.RData") attach(mydata100) head(mydata100) # An inefficient approach: postGroup <- posttest postGroup <- ifelse(posttest < 60, 1, postGroup) postGroup <- ifelse(posttest >= 60 & posttest < 70, 2, postGroup) postGroup <- ifelse(posttest >= 70 & posttest < 80, 3, postGroup) postGroup <- ifelse(posttest >= 80 & posttest < 90, 4, postGroup) postGroup <- ifelse(posttest >=90, 5, postGroup) table(postGroup) # An efficient approach: postGroup <- ifelse(posttest < 60 , 1, ifelse(posttest >= 60 & < 70, 2, ifelse(posttest >= 70 & posttest < 80, 3, ifelse(posttest >= 80 & < 90, 4, ifelse(posttest >= 90, 5, posttest) )))) table(postGroup) # Logical approach: postGroup <- 1 + (posttest >= 60) + (posttest >= 70) + (posttest >= 80) + (posttest >= 90) table(postGroup) # ---Cutting Functions--- # Hmisc cut2 function library("Hmisc") postGroup <- cut2(posttest, c(60, 70, 80, 90) ) table(postGroup) postGroup <- cut2(posttest, g = 5) table(postGroup) postGroup <- cut2(posttest, m = 25) table(postGroup)
SAS
LIBNAME myLib 'C:myRfolder'; DATA myLib.mydataTransformed; SET myLib.mydata; IF q4 = 5 then x1 = 1; else x1 = 0; IF q4 >= 4 then x2 = 1; else x2 = 0; IF workshop = 1 & q4 >= 5 then x3 = 1; ELSE x3 = 0; IF gender = "f" then scoreA = 2 * q1 + q2; Else scoreA = 3 * q1 + q2; IF workshop = 1 and q4 >= 5 THEN scoreB = 2 * q1 + q2; ELSE scoreB = 3 * q1 + q2; RUN; PROC PRINT; RUN; DATA myLib.mydataTransformed; SET myLib.mydata100; IF (posttest < 60) THEN postGroup = 1; ELSE IF (posttest >= 60 & posttest < 70) THEN postGroup = 2; ELSE IF (posttest >= 70 & posttest < 80) THEN postGroup = 3; ELSE IF (posttest >= 80 & posttest < 90) THEN postGroup = 4; ELSE IF (posttest >= 90) THEN postGroup = 5; RUN; PROC FREQ; TABLES postGroup; RUN; PROC RANK OUT=myLib.mydataTransformed GROUPS=5; VAR posttest; RUN; PROC FREQ; TABLES posttest; RUN;
SPSS
CD 'C:myRfolder'. GET FILE = 'mydata.sav'. DO IF (Q4 eq 5). + COMPUTE X1 = 1. ELSE. + COMPUTE X1 = 0. END IF. DO IF (Q4 GE 4). + COMPUTE X2 = 1. ELSE. + COMPUTE X2 = 0. END IF. DO IF (workshop EQ 1 AND q4 GE 4). + COMPUTE X3 = 1. ELSE. + COMPUTE X3 = 0. END IF. DO IF (gender = 'f'). + COMPUTE scoreA = 2 * q1 + q2. ELSE. + COMPUTE scoreA = 3 * q1 + q2. END IF. DO IF (workshop EQ 1 AND q4 GE 5). + COMPUTE scoreB = 2 * q1 + q2. ELSE. + COMPUTE scoreB = 3 * q1 + q2. END IF. EXECUTE. GET FILE='mydata100.sav'. DATASET NAME DataSet2 WINDOW=FRONT. DO IF (posttest LT 60). + COMPUTE postGroup = 1. ELSE IF (posttest GE 60 AND posttest LT 70). + COMPUTE postGroup = 2. ELSE IF (posttest GE 70 AND posttest LT 80). + COMPUTE postGroup = 3. ELSE IF (posttest GE 80 AND posttest LT 90). + COMPUTE postGroup = 4. ELSE IF (posttest GE 90). + COMPUTE postGroup = 5. END IF. EXECUTE. DATASET ACTIVATE DataSet2. FREQUENCIES VARIABLES=postGroup /ORDER=ANALYSIS. DATASET ACTIVATE DataSet1. RANK VARIABLES=posttest (A) /NTILES(5) /PRINT=YES /TIES=MEAN. FREQUENCIES VARIABLES=Nposttes /ORDER=ANALYSIS.
Stata
* Filename: TransformIF.do use c:myRfoldermydata, clear preserve if q4 == 5 { gen x1=1 } else { gen x1=0 } if q4 >= 4 { gen x2=1 } else { gen x2=0 } if workshop == 1 & q4 >=5 { gen x3=1 } else { gen x3=0 } if gender=="f" { gen scoreA = 2*q1+q2 } else { gen scoreA = 3*q1+q2 } if workshop==1 & q4 > 5 { gen scoreB = 2*q1+q2 } else { gen scoreB = 3*q1+q2 } restore
Transformations, Multiple Conditional
R
attach(mydata) mydata # Using the ifelse approach. mydata$score1 <- ifelse( gender == "f", (2 * q1) + q2, # 1st for females. (20 * q1) + q2 # 1st for males. ) mydata$score2 <- ifelse( gender == "f", (3 * q1) + q2 # 2nd for females. (30 * q1) + q2 # 2nd for males. ) mydata # Using the subscript approach. load(file = "mydata.RData") # Create names in data frame. detach(mydata) mydata <- data.frame(mydata, score1 = NA, score2 = NA) attach(mydata) # Find which are males and females. gals <- which( gender == "f" ) guys <- which( gender == "m" ) # Now assign scores. mydata[gals, "score1"] <- 2 * q1[gals] + q2[gals] mydata[gals, "score2"] <- 3 * q1[gals] + q2[gals] mydata[guys, "score1"] <- 20 * q1[guys] + q2[guys] mydata[guys, "score2"] <- 30 * q1[guys] + q2[guys]
SAS
DATA myLib.mydata; SET myLib.mydata; IF gender="f" THEN DO; score1 = (2 * q1) + q2; score2 = (3 * q1) + q2; END; ELSE IF gender="m" THEN DO; score1 = (20 * q1) + q2; score2 = (30 * q1) + q2; END;
SPSS
GET FILE = 'mydata.sav'. DO IF (gender EQ 'm'). + COMPUTE score1 = ( 2*q1) + q2. + COMPUTE score2 = ( 3*q1) + q2. ELSE IF (gender EQ 'f'). + COMPUTE score1 = (20*q1) + q2. + COMPUTE score2 = (30*q1) + q2. END IF.
Stata
* Filename: TransformIF2.do use c:myRfoldermydata, clear preserve if gender=="m" { gen score1 = (20*q1)+q2 gen score2 = (30*q1)+q2 } else if gender=="f" { gen score1 = (2*q1) + q2 gen score2 = (3*q1) + q2 } restore
Transposing or Flipping Data Sets
R
# Filename: Transpose.R setwd("c:/myRfolder") load("mydata.RData") mydata myQs <- c("q1", "q2", "q3", "q4") myQdf <- mydata[ ,myQs] myQdf myFlipped <- t(myQdf) myFlipped class(myFlipped) # Coerced into a matrix! myFixed <- as.data.frame( t(myFlipped) ) myFixed # Again, but with all the data options(width = 60) myFlipped <- t(mydata) myFlipped myFixed <- t(myFlipped) myFixed myFixed <- data.frame(myFixed) str(myFixed) myQs <- c("q1", "q2", "q3", "q4") myFixed[ ,myQs] <- lapply(myFixed[ ,myQs], as.numeric) str(myFixed)
SAS
* Filename: Transpose.sas ; LIBNAME myLib 'C:myRfolder'; PROC TRANSPOSE DATA=myLib.mydata OUT=mycopy; PROC PRINT; RUN; PROC TRANSPOSE DATA=mycopy OUT=myFixed; PROC PRINT; RUN;
SPSS
* Filename: Transpose.sps. CD 'C:myRfolder'. GET FILE='mydata.sav'. DATASET NAME DataSet1 WINDOW=FRONT. FLIP VARIABLES=id workshop gender q1 q2 q3 q4. FLIP VARIABLES=var001 var002 var003 var004 var005 var006 var007 var008 /NEWNAMES=CASE_LBL.
Z-Scores, Calculating
R
# Applying Z Transformation. myZs <- apply(mymatrix, 2, scale) myZs
SAS
* Get Z Scores; PROC STANDARD DATA=mylib.mydata MEAN=0 STD=1 out=myZs; RUN; PROC PRINT; RUN;
SPSS
DESCRIPTIVES VARIABLES=q1 q2 q3 q4 /SAVE /STATISTICS=MEAN STDDEV MIN MAX.
outstanding examples. very helpful. I wish Stata’s codes were not missed in several examples.
I agree with Mark.
Mark & Anders,
Sorry, I just haven’t had time to get to them!
Cheers,
Bob