A data table is an enhanced data.frame.
data.tables (and data.frames) are internally lists as well, but with all its columns of equal length and with a class attribute.
DT = data.table(
ID = c("b","b","b","a","a","c"),
a = 1:6,
b = 7:12,
c = 13:18)
DT[i, j, (by|keyby)]
DT[ i, j, by ] # + extra arguments
| | |
| | -------> grouped by what?
| -------> what to do?
---> on which rows?
R | SQL |
---|---|
i | where (subset, filtering) and order with the order() function |
j | select columns * the data.table way: DT[, .(colA, colB)] * the data.frame way: DT[, c(“colA”, “colB”), with = FALSE] |
j | compute DT[, .(sumA = sum(colA), meanB = mean(colB))] |
by | group by |
keyby | group by ordered |
ans <- flights[origin == "JFK" & month == 6L]
ans <- flights[1:2]
Sort flights first by column origin in ascending order, and then by dest in descending order
ans <- flights[order(origin, -dest)]
flights[carrier == "AA",
.(mean(arr_delay), mean(dep_delay)),
keyby = .(origin, dest, month)]
keyby() is applied after performing the operation, i.e., on the computed result and sets a key after ordering by setting an attribute called sorted.
# return it as a vector.
ans <- flights[, arr_delay]
# return as a data.table
ans <- flights[, list(arr_delay)]
ans <- flights[, .(arr_delay)] # The point is an alias to list.
ans <- flights[, .(arr_delay, dep_delay)]
ans <- flights[, .(delay_arr = arr_delay, delay_dep = dep_delay)]
ans <- flights[, sum((arr_delay + dep_delay) < 0)]
ans <- flights[origin == "JFK" & month == 6L,
.(m_arr = mean(arr_delay), m_dep = mean(dep_delay))]
ans <- flights[origin == "JFK" & month == 6L, length(dest)] # length = count (can have any argument)
ans <- flights[origin == "JFK" & month == 6L, .N] #.N is an alias for length
ans <- flights[, .(.N), by = .(origin)]
* the number of trips for each origin airport for carrier code “AA”
ans <- flights[carrier == "AA", .N, by = origin]
ans <- flights[carrier == "AA", .N, by = .(origin,dest)]
ans <- flights[carrier == "AA",
.(mean(arr_delay), mean(dep_delay)),
by = .(origin, dest, month)]
ans <- flights[, .N, .(dep_delay>0, arr_delay>0)]
.SD for Subset of Data:
Example:
DT[, print(.SD), by = col]
DT[, lapply(.SD, mean), by = ID]
flights[carrier == "AA", ## Only on trips with carrier "AA"
lapply(.SD, mean), ## compute the mean
by = .(origin, dest, month), ## for every 'origin,dest,month'
.SDcols = c("arr_delay", "dep_delay")] ## for just those specified in .SDcols
ans <- flights[, head(.SD, 2), by = month]
ans <- flights[carrier == "AA", .N, by = .(origin, dest)][order(origin, -dest)]
DT[ ...
][ ...
][ ...
]