%\VignetteIndexEntry{miRNAss usage}
%\VignettePackage{miRNAss}

\documentclass[a4paper,11pt]{article}

\usepackage{authblk}
\renewcommand\Affilfont{\itshape\small}
\usepackage{Sweave}
\usepackage{hyperref}

\textwidth=6.2in
\textheight=8.5in
\parskip=.3cm
\oddsidemargin=.1in
\evensidemargin=.1in
\headheight=-.3in

\newcommand{\Rpackage}[1]{\textit{#1}}

\begin{document}
\SweaveOpts{concordance=TRUE}
%------------------------------------------------------------
\title{\textbf{MiRNAss user guide} \\
{\Large Genome-wide pre-miRNA discovery from few labeled examples}}
%------------------------------------------------------------
\author[1]{Cristian A. Yones (\url{cyones@sinc.unl.edu.ar})}
\author[1]{Georgina Stegmayer}
\author[1]{Diego H. Milone}

\affil[1]{Research Institute for Signals, Systems and Computational
Intelligence, sinc(\textit{i}), FICH-UNL, CONICET, Santa Fe, Argentina.}

\SweaveOpts{echo = TRUE, useFancyQuotes = FALSE, highlight=TRUE, tidy=TRUE, keep.space=TRUE, keep.blank.space=FALSE, keep.comment=TRUE}


\maketitle

\begin{abstract}
MiRNAss is a machine learning method specifically designed for pre-miRNA
prediction. It takes advantage of unlabeled sequences to improve the prediction
rates even when there are just a few positive examples, and when the negative
examples are unreliable or are not good representatives of their class.
Furthermore, the method can automatically search for negative examples if the
user is unable to provide them. MiRNAss can find a good boundary to divide the
pre-miRNAs from other groups of sequences; it automatically optimizes the
threshold that defines the class boundaries, and thus, it is robust to high
class imbalance. Each step of the method is scalable and can handle large
volumes of data.
The latest version of the package can be found at CRAN. Also, the
development version of the package can be found at:
\url{https://github.com/cyones/miRNAss}.
Related projects can be found at \url{http://fich.unl.edu.ar/sinc/}.
\end{abstract}

\section{Input data}
MiRNAss receives as input numerical features extracted from hairpin sequences.
This means that a genome needs to be pre-processed to be able to make
predictions with miRNAss. There are two steps: split the genome-wide data into
shorter sequences and extract features from these sequences. The first step can
be accomplished with \textit{HExtractor}
(\url{https://sourceforge.net/projects/sourcesinc/files/hextractor/}), which is
a tool specifically designed for this task. For the feature extraction we have
developed a comprehensive tool called \textit{miRNAfe}
(\url{http://fich.unl.edu.ar/sinc/blog/web-demo/mirnafe-full/}) that
is able to calculate almost all the features used in the state-of-the-art
prediction methods. For further details see Yones
\textit{et al.}, 2015\footnote{Yones, C. A., Stegmayer, G., Kamenetzky, L., \&
Milone, D. H. (2015). miRNAfe: A comprehensive tool for feature extraction in
microRNA prediction. \textit{Biosystems}, \textbf{138}, 1--5.}.


\section{How to use miRNAss}
After installing the package, load miRNAss with the following command:

<<loadPackages,results=hide>>=
library('miRNAss')
@

\noindent The following command is the simplest way to execute miRNAss:

<<simplestCall,eval=FALSE>>=
# features: data frame of numeric features; labels: numeric vector (1, -1 or 0)
miRNAss(features, labels)
@

\noindent Where:
\begin{itemize}
\item \textbf{features}: is a data frame with the features extracted from
hairpin sequences, one sequence per row and one numeric feature per column.
\item \textbf{labels}: is a numeric vector where the i-th element has a value
of 1 if it is a well-known pre-miRNA, a -1 if it is not a pre-miRNA, and zero
if it is an unknown sequence that has to be classified (predicted) by
the method.
\end{itemize}

\noindent The data provided with the package can be used to test miRNAss. This small
dataset is composed of a small set of features extracted from 1000 hairpins
randomly sampled from \textit{C.~elegans} hairpins. To use miRNAss with this dataset,
first construct the label vector with the CLASS column:

<<setY>>=
# Map the CLASS column to labels: 1 for pre-miRNAs, -1 for the rest
y = as.numeric(celegans$CLASS)*2 - 1
@

\noindent Remove some labels to make a test
<<sampleY>>=
# Fix the random seed so the same labels are hidden on every run
set.seed(1)
# Hide 200 positive and 700 negative labels (0 means "to be predicted")
y[sample(which(y > 0),200)] = 0
y[sample(which(y < 0),700)] = 0
@

\noindent Take all the features but remove the label column
<<removeLabels>>=
x = subset(celegans, select = -CLASS)
@

\noindent Call miRNAss with default parameters
<<miRNAssCall>>=
p = miRNAss(x,y)
@

\noindent To get the indexes of the sequences that were predicted as possible
pre-miRNA
<<predictions,results=hide>>=
# Indexes of the sequences whose prediction in p is positive
is.miRNA = which(p > 0)
@

\noindent If the true labels are known, some performance measures can be
calculated
<<performance>>=
# Only the sequences whose label was hidden (y == 0) are evaluated.
# Sensitivity: fraction of hidden pre-miRNAs that received a positive prediction
SE = mean(p[ celegans$CLASS & y == 0] > 0)
# Specificity: fraction of hidden non-pre-miRNAs that received a negative prediction
SP = mean(p[!celegans$CLASS & y == 0] < 0)
cat('Sensitivity: ', SE, '\nSpecificity: ', SP, '\n')
@

\noindent For more help about all the parameters execute:
<<helpCall,eval=FALSE>>=
# Shown for the reader only; not evaluated while building the vignette
help(miRNAss)
@

\section{Extra datasets and test scripts}
A set of experiments and comparisons with other methods can be done. The
scripts and the data of these experiments are contained in the file
miRNAss-experiments.zip that can be found in:

\url{https://sourceforge.net/projects/sourcesinc/files/mirnass/}

\noindent To run these tests, after unzipping the file, set this directory as the working
directory and simply run each script with the function \texttt{source}:

<<runExperiment,eval=FALSE>>=
# Move into the unzipped experiments folder, then run one experiment script
setwd('miRNAss-experiments')
source('2-delta_mirBase.R')
@

\noindent This will generate one csv file for each test in the \texttt{results} folder. It is
important to point out that most of these experiments are computationally expensive
and could take quite a while (about 40 minutes for experiment
2 on an Intel i7 PC). You can plot the results by executing:

<<plotResults,eval=FALSE>>=
# Plot the results produced by the experiment scripts
source('plotResults.R')
@

\noindent The figures will be saved in the folder \texttt{results}.

\section{Software used}

<<sessionInfo, echo=FALSE, results=tex>>=
# Print the R session details as LaTeX; results=tex makes Sweave insert
# that output into the document as-is.
toLatex(sessionInfo())
@

\end{document}
