@article{35,
  author   = {Deng, Houtao and Baydogan, Mustafa Gokce and Runger, George},
  title    = {{SMT}: Sparse Multivariate Tree},
  journal  = {Statistical Analysis and Data Mining},
  volume   = {7},
  year     = {2014},
  month    = feb,
  pages    = {53--69},
  issn     = {1932-1872},
  doi      = {10.1002/sam.11208},
  keywords = {decision tree, feature extraction, fused Lasso, Lasso, time series classification},
  abstract = {A multivariate decision tree attempts to improve upon the single variable split in a traditional tree. With the increase in datasets with many features and a small number of labeled instances in a variety of domains (bioinformatics, text mining, etc.), a traditional tree-based approach with a greedy variable selection at a node may omit important information. Therefore, the recursive partitioning idea of a simple decision tree combined with the intrinsic feature selection of L1 regularized logistic regression (LR) at each node is a natural choice for a multivariate tree model that is simple, but broadly applicable. This natural solution leads to the sparse multivariate tree (SMT) considered here. SMT can naturally handle non-time-series data and is extended to handle time-series classification problems with the power of extracting interpretable temporal patterns (e.g., means, slopes, and deviations). Binary L1 regularized LR models are used here for binary classification problems. However, SMT may be extended to solve multiclass problems with multinomial LR models. The accuracy and computational efficiency of SMT is compared to a large number of competitors on time series and non-time-series data.},
}

@article{33,
  author   = {Baydogan, Mustafa Gokce and Runger, George and Tuv, Eugene},
  title    = {A {Bag-of-Features} Framework to Classify Time Series},
  journal  = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume   = {35},
  year     = {2013},
  pages    = {2796--2802},
  issn     = {0162-8828},
  doi      = {10.1109/TPAMI.2013.72},
  keywords = {codebook, feature extraction, supervised learning},
  abstract = {Time series classification is an important task with many challenging applications. A nearest neighbor (NN) classifier with dynamic time warping (DTW) distance is a strong solution in this context. On the other hand, feature-based approaches have been proposed as both classifiers and to provide insight into the series, but these approaches have problems handling translations and dilations in local patterns. Considering these shortcomings, we present a framework to classify time series based on a bag-of-features representation (TSBF). Multiple subsequences selected from random locations and of random lengths are partitioned into shorter intervals to capture the local information. Consequently, features computed from these subsequences measure properties at different locations and dilations when viewed from the original series. This provides a feature-based approach that can handle warping (although differently from DTW). Moreover, a supervised learner (that handles mixed data types, different units, etc.) integrates location information into a compact codebook through class probability estimates. Additionally, relevant global features can easily supplement the codebook. TSBF is compared to NN classifiers and other alternatives (bag-of-words strategies, sparse spatial sample kernels, shapelets). Our experimental results show that TSBF provides better results than competitive methods on benchmark datasets from the UCR time series database.},
}