## ----include=FALSE------------------------------------------------------------
knitr::opts_chunk$set(
  comment = "#>",
  collapse = TRUE
)

## -----------------------------------------------------------------------------
library("protr")

# Load FASTA files
# Note that `system.file()` is for accessing example files
# in the protr package. Replace it with your own file path.
extracell <- readFASTA(
  system.file(
    "protseq/extracell.fasta",
    package = "protr"
  )
)
mitonchon <- readFASTA(
  system.file(
    "protseq/mitochondrion.fasta",
    package = "protr"
  )
)

## ----eval = FALSE-------------------------------------------------------------
# length(extracell)

## ----eval = FALSE-------------------------------------------------------------
# ## [1] 325

## ----eval = FALSE-------------------------------------------------------------
# length(mitonchon)

## ----eval = FALSE-------------------------------------------------------------
# ## [1] 306

## ----eval = FALSE-------------------------------------------------------------
# extracell <- extracell[(sapply(extracell, protcheck))]
# mitonchon <- mitonchon[(sapply(mitonchon, protcheck))]

## ----eval = FALSE-------------------------------------------------------------
# length(extracell)

## ----eval = FALSE-------------------------------------------------------------
# ## [1] 323

## ----eval = FALSE-------------------------------------------------------------
# length(mitonchon)

## ----eval = FALSE-------------------------------------------------------------
# ## [1] 304

## ----eval = FALSE-------------------------------------------------------------
# # Calculate APseAAC descriptors
# x1 <- t(sapply(extracell, extractAPAAC))
# x2 <- t(sapply(mitonchon, extractAPAAC))
# x <- rbind(x1, x2)
# 
# # Make class labels
# labels <- as.factor(c(rep(0, length(extracell)), rep(1, length(mitonchon))))

## ----eval = FALSE-------------------------------------------------------------
# set.seed(1001)
# 
# # Split training and test set
# tr.idx <- c(
#   sample(1:nrow(x1), round(nrow(x1) * 0.75)),
#   sample(nrow(x1) + 1:nrow(x2), round(nrow(x2) * 0.75))
# )
# te.idx <- setdiff(1:nrow(x), tr.idx)
# 
# x.tr <- x[tr.idx, ]
# x.te <- x[te.idx, ]
# y.tr <- labels[tr.idx]
# y.te <- labels[te.idx]

## ----eval = FALSE-------------------------------------------------------------
# library("randomForest")
# rf.fit <- randomForest(x.tr, y.tr, cv.fold = 5)
# rf.fit

## ----eval = FALSE-------------------------------------------------------------
# ## Call:
# ##  randomForest(x = x.tr, y = y.tr, cv.fold = 5)
# ##                Type of random forest: classification
# ##                      Number of trees: 500
# ## No. of variables tried at each split: 8
# ##
# ##         OOB estimate of  error rate: 25.11%
# ## Confusion matrix:
# ##     0   1 class.error
# ## 0 196  46   0.1900826
# ## 1  72 156   0.3157895

## ----eval = FALSE-------------------------------------------------------------
# # Predict on test set
# rf.pred <- predict(rf.fit, newdata = x.te, type = "prob")[, 1]
# 
# # Plot ROC curve
# library("pROC")
# plot.roc(y.te, rf.pred, grid = TRUE, print.auc = TRUE)

## ----eval = FALSE-------------------------------------------------------------
# ## Call:
# ## plot.roc.default(x = y.te, predictor = rf.pred, col = "#0080ff",
# ##                  grid = TRUE, print.auc = TRUE)
# ##
# ## Data: rf.pred in 81 controls (y.te 0) > 76 cases (y.te 1).
# ## Area under the curve: 0.8697

## -----------------------------------------------------------------------------
knitr::include_graphics("figures/roc.png")

## ----extractAAC---------------------------------------------------------------
library("protr")

x <- readFASTA(system.file(
  "protseq/P00750.fasta",
  package = "protr"
))[[1]]

extractAAC(x)

## ----extractDC----------------------------------------------------------------
dc <- extractDC(x)
head(dc, n = 30L)

## ----extractTC----------------------------------------------------------------
tc <- extractTC(x)
head(tc, n = 36L)

## -----------------------------------------------------------------------------
knitr::include_graphics("figures/AAindex.png")

## ----extractMoreau1-----------------------------------------------------------
moreau <- extractMoreauBroto(x)
head(moreau, n = 36L)

## ----extractMoreau2-----------------------------------------------------------
# Define 3 custom properties
myprops <- data.frame(
  AccNo = c("MyProp1", "MyProp2", "MyProp3"),
  A = c(0.62, -0.5, 15), R = c(-2.53, 3, 101),
  N = c(-0.78, 0.2, 58), D = c(-0.9, 3, 59),
  C = c(0.29, -1, 47), E = c(-0.74, 3, 73),
  Q = c(-0.85, 0.2, 72), G = c(0.48, 0, 1),
  H = c(-0.4, -0.5, 82), I = c(1.38, -1.8, 57),
  L = c(1.06, -1.8, 57), K = c(-1.5, 3, 73),
  M = c(0.64, -1.3, 75), F = c(1.19, -2.5, 91),
  P = c(0.12, 0, 42), S = c(-0.18, 0.3, 31),
  T = c(-0.05, -0.4, 45), W = c(0.81, -3.4, 130),
  Y = c(0.26, -2.3, 107), V = c(1.08, -1.5, 43)
)

# Use 4 properties in the AAindex database and 3 customized properties
moreau2 <- extractMoreauBroto(
  x,
  customprops = myprops,
  props = c(
    "CIDH920105", "BHAR880101",
    "CHAM820101", "CHAM820102",
    "MyProp1", "MyProp2", "MyProp3"
  )
)

head(moreau2, n = 36L)

## ----extractMoran-------------------------------------------------------------
# Use the 3 custom properties defined before
# and 4 properties in the AAindex database
moran <- extractMoran(
  x,
  customprops = myprops,
  props = c(
    "CIDH920105", "BHAR880101",
    "CHAM820101", "CHAM820102",
    "MyProp1", "MyProp2", "MyProp3"
  )
)

head(moran, n = 36L)

## ----extractGeary-------------------------------------------------------------
# Use the 3 custom properties defined before
# and 4 properties in the AAindex database
geary <- extractGeary(
  x,
  customprops = myprops,
  props = c(
    "CIDH920105", "BHAR880101",
    "CHAM820101", "CHAM820102",
    "MyProp1", "MyProp2", "MyProp3"
  )
)

head(geary, n = 36L)

## -----------------------------------------------------------------------------
knitr::include_graphics("figures/CTD.png")

## ----extractCTDC--------------------------------------------------------------
extractCTDC(x)

## ----extractCTDT--------------------------------------------------------------
extractCTDT(x)

## ----extractCTDD--------------------------------------------------------------
extractCTDD(x)

## -----------------------------------------------------------------------------
knitr::include_graphics("figures/ctriad.png")

## ----extractCTriad------------------------------------------------------------
ctriad <- extractCTriad(x)
head(ctriad, n = 65L)

## ----extractSOCN--------------------------------------------------------------
extractSOCN(x)

## -----------------------------------------------------------------------------
knitr::include_graphics("figures/QSO.png")

## ----extractQSO---------------------------------------------------------------
extractQSO(x)

## -----------------------------------------------------------------------------
knitr::include_graphics("figures/PAAC.png")

## ----extractPAAC--------------------------------------------------------------
extractPAAC(x)

## -----------------------------------------------------------------------------
knitr::include_graphics("figures/APAAC.png")

## ----extractAPAAC-------------------------------------------------------------
extractAPAAC(x)

## ----extractDescScales--------------------------------------------------------
x <- readFASTA(system.file(
  "protseq/P00750.fasta",
  package = "protr"
))[[1]]

descscales <- extractDescScales(
  x,
  propmat = "AATopo",
  index = c(37:41, 43:47),
  pc = 5, lag = 7, silent = FALSE
)

## ----extractDescScales2-------------------------------------------------------
length(descscales)
head(descscales, 15)

## ----extractBLOSUM------------------------------------------------------------
x <- readFASTA(system.file(
  "protseq/P00750.fasta",
  package = "protr"
))[[1]]

blosum <- extractBLOSUM(
  x,
  submat = "AABLOSUM62",
  k = 5, lag = 7, scale = TRUE, silent = FALSE
)

## ----extractBLOSUM2-----------------------------------------------------------
length(blosum)
head(blosum, 15)

## ----eval = FALSE-------------------------------------------------------------
# s1 <- readFASTA(system.file("protseq/P00750.fasta", package = "protr"))[[1]]
# s2 <- readFASTA(system.file("protseq/P08218.fasta", package = "protr"))[[1]]
# s3 <- readFASTA(system.file("protseq/P10323.fasta", package = "protr"))[[1]]
# s4 <- readFASTA(system.file("protseq/P20160.fasta", package = "protr"))[[1]]
# s5 <- readFASTA(system.file("protseq/Q9NZP8.fasta", package = "protr"))[[1]]
# plist <- list(s1, s2, s3, s4, s5)
# psimmat <- parSeqSim(plist, cores = 4, type = "local", submat = "BLOSUM62")
# psimmat

## ----eval = FALSE-------------------------------------------------------------
# ##            [,1]       [,2]       [,3]       [,4]       [,5]
# ## [1,] 1.00000000 0.11825938 0.10236985 0.04921696 0.03943488
# ## [2,] 0.11825938 1.00000000 0.18858241 0.12124217 0.06391103
# ## [3,] 0.10236985 0.18858241 1.00000000 0.05819984 0.06175942
# ## [4,] 0.04921696 0.12124217 0.05819984 1.00000000 0.05714638
# ## [5,] 0.03943488 0.06391103 0.06175942 0.05714638 1.00000000

## ----eval = FALSE-------------------------------------------------------------
# # By GO Terms
# go1 <- c(
#   "GO:0005215", "GO:0005488", "GO:0005515",
#   "GO:0005625", "GO:0005802", "GO:0005905"
# ) # AP4B1
# go2 <- c(
#   "GO:0005515", "GO:0005634", "GO:0005681",
#   "GO:0008380", "GO:0031202"
# ) # BCAS2
# go3 <- c(
#   "GO:0003735", "GO:0005622", "GO:0005840",
#   "GO:0006412"
# ) # PDE4DIP
# golist <- list(go1, go2, go3)
# parGOSim(golist, type = "go", ont = "CC", measure = "Wang")

## ----eval = FALSE-------------------------------------------------------------
# ##       [,1]  [,2] [,3]
# ## [1,] 1.000 0.344 0.17
# ## [2,] 0.344 1.000 0.24
# ## [3,] 0.170 0.240 1.00

## ----eval = FALSE-------------------------------------------------------------
# # By Entrez gene id
# genelist <- list(c("150", "151", "152", "1814", "1815", "1816"))
# parGOSim(genelist, type = "gene", ont = "BP", measure = "Wang")

## ----eval = FALSE-------------------------------------------------------------
# ##        150   151   152  1814  1815  1816
# ## 150  1.000 0.702 0.725 0.496 0.570 0.455
# ## 151  0.702 1.000 0.980 0.456 0.507 0.551
# ## 152  0.725 0.980 1.000 0.460 0.511 0.538
# ## 1814 0.496 0.456 0.460 1.000 0.687 0.473
# ## 1815 0.570 0.507 0.511 0.687 1.000 0.505
# ## 1816 0.455 0.551 0.538 0.473 0.505 1.000

## ----eval = FALSE-------------------------------------------------------------
# ids <- c("P00750", "P00751", "P00752")
# prots <- getUniProt(ids)
# prots

## ----eval = FALSE-------------------------------------------------------------
# ## [[1]]
# ## [1] "MDAMKRGLCCVLLLCGAVFVSPSQEIHARFRRGARSYQVICRDEKTQMIYQQHQSWLRPVLRSNRVEYCWCN
# ## SGRAQCHSVPVKSCSEPRCFNGGTCQQALYFSDFVCQCPEGFAGKCCEIDTRATCYEDQGISYRGTWSTAESGAECT
# ## NWNSSALAQKPYSGRRPDAIRLGLGNHNYCRNPDRDSKPWCYVFKAGKYSSEFCSTPACSEGNSDCYFGNGSAYRGT
# ## HSLTESGASCLPWNSMILIGKVYTAQNPSAQALGLGKHNYCRNPDGDAKPWCHVLKNRRLTWEYCDVPSCSTCGLRQ
# ## YSQPQFRIKGGLFADIASHPWQAAIFAKHRRSPGERFLCGGILISSCWILSAAHCFQERFPPHHLTVILGRTYRVVP
# ## GEEEQKFEVEKYIVHKEFDDDTYDNDIALLQLKSDSSRCAQESSVVRTVCLPPADLQLPDWTECELSGYGKHEALSP
# ## FYSERLKEAHVRLYPSSRCTSQHLLNRTVTDNMLCAGDTRSGGPQANLHDACQGDSGGPLVCLNDGRMTLVGIISWG
# ## LGCGQKDVPGVYTKVTNYLDWIRDNMRP"
# ##
# ## [[2]]
# ## [1] "MGSNLSPQLCLMPFILGLLSGGVTTTPWSLARPQGSCSLEGVEIKGGSFRLLQEGQALEYVCPSGFYPYPVQ
# ## TRTCRSTGSWSTLKTQDQKTVRKAECRAIHCPRPHDFENGEYWPRSPYYNVSDEISFHCYDGYTLRGSANRTCQVNG
# ## RWSGQTAICDNGAGYCSNPGIPIGTRKVGSQYRLEDSVTYHCSRGLTLRGSQRRTCQEGGSWSGTEPSCQDSFMYDT
# ## PQEVAEAFLSSLTETIEGVDAEDGHGPGEQQKRKIVLDPSGSMNIYLVLDGSDSIGASNFTGAKKCLVNLIEKVASY
# ## GVKPRYGLVTYATYPKIWVKVSEADSSNADWVTKQLNEINYEDHKLKSGTNTKKALQAVYSMMSWPDDVPPEGWNRT
# ## RHVIILMTDGLHNMGGDPITVIDEIRDLLYIGKDRKNPREDYLDVYVFGVGPLVNQVNINALASKKDNEQHVFKVKD
# ## MENLEDVFYQMIDESQSLSLCGMVWEHRKGTDYHKQPWQAKISVIRPSKGHESCMGAVVSEYFVLTAAHCFTVDDKE
# ## HSIKVSVGGEKRDLEIEVVLFHPNYNINGKKEAGIPEFYDYDVALIKLKNKLKYGQTIRPICLPCTEGTTRALRLPP
# ## TTTCQQQKEELLPAQDIKALFVSEEEKKLTRKEVYIKNGDKKGSCERDAQYAPGYDKVKDISEVVTPRFLCTGGVSP
# ## YADPNTCRGDSGGPLIVHKRSRFIQVGVISWGVVDVCKNQKRQKQVPAHARDFHINLFQVLPWLKEKLQDEDLGFL"
# ##
# ## [[3]]
# ## [1] "APPIQSRIIGGRECEKNSHPWQVAIYHYSSFQCGGVLVNPKWVLTAAHCKNDNYEVWLGRHNLFENENTAQF
# ## FGVTADFPHPGFNLSLLKXHTKADGKDYSHDLMLLRLQSPAKITDAVKVLELPTQEPELGSTCEASGWGSIEPGPDB
# ## FEFPDEIQCVQLTLLQNTFCABAHPBKVTESMLCAGYLPGGKDTCMGDSGGPLICNGMWQGITSWGHTPCGSANKPS
# ## IYTKLIFYLDWINDTITENP"

## ----protcheck----------------------------------------------------------------
x <- readFASTA(system.file("protseq/P00750.fasta", package = "protr"))[[1]]
# A real sequence
protcheck(x)
# An artificial sequence
protcheck(paste(x, "Z", sep = ""))

## ----protseg------------------------------------------------------------------
protseg(x, aa = "M", k = 5)

## -----------------------------------------------------------------------------
knitr::include_graphics("figures/protrweb.png")

