Data science Software Course Training in Ameerpet Hyderabad

Data science Software Course Training in Ameerpet Hyderabad

Friday, 31 March 2017

Linear Regression with SGD

[cloudera@quickstart ~]$ gedit prof
[cloudera@quickstart ~]$ hadoop fs -mkdir mlib
[cloudera@quickstart ~]$ hadoop fs -copyFromLocal prof mlib
[cloudera@quickstart ~]$

scala> val data = sc.textFile("/user/cloudera/mlib/prof")
data: org.apache.spark.rdd.RDD[String] = /user/cloudera/mlib/prof MapPartitionsRDD[1] at textFile at <console>:27

scala> data.collect.take(3).foreach(println)
"a","w","h","c"
25,80,5.9,120
23,55,5.7,90


scala> val ndata = data.filter{x =>
     |     !(x.split(",")(0).contains("a"))
     | }


scala> ndata.collect.foreach(println)
25,80,5.9,120
23,55,5.7,90
23,89,6.0,130
26,80,5.9,120
23,55,5.7,90
23,69,6.0,130
28,81,5.9,120
23,55,5.9,190
23,81,6.0,130
29,87,5.9,120
23,55,5.7,190
23,89,5.0,130

scala>

scala> import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LabeledPoint

scala> import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionModel

scala> import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.regression.LinearRegressionWithSGD

scala> import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vectors

-------------
scala>  def toLabel(line:String) = {
     |              
     |           val w = line.split(",")
     |           val lbl = w(3).toDouble
     |           val f = w.take(3).map(x =>

x.toDouble)
     |           LabeledPoint(lbl,

Vectors.dense(f))
     |        }
toLabel: (line: String)

org.apache.spark.mllib.regression.LabeledPoint

scala>

scala> toLabel("23,78,5.9,120")
res8:

org.apache.spark.mllib.regression.LabeledPoint

= (120.0,[23.0,78.0,5.9])

scala>

val trainset = ndata.map(x => toLabel(x))

scala> val trainset = ndata.map(x => toLabel

(x))
trainset: org.apache.spark.rdd.RDD

[org.apache.spark.mllib.regression.LabeledPoin

t] = MapPartitionsRDD[3] at map at

<console>:37

scala>

scala> trainset.collect.foreach(println)
(120.0,[25.0,80.0,5.9])
(90.0,[23.0,55.0,5.7])
(130.0,[23.0,89.0,6.0])
(120.0,[26.0,80.0,5.9])
(90.0,[23.0,55.0,5.7])
(130.0,[23.0,69.0,6.0])
(120.0,[28.0,81.0,5.9])
(190.0,[23.0,55.0,5.9])
(130.0,[23.0,81.0,6.0])
(120.0,[29.0,87.0,5.9])
(190.0,[23.0,55.0,5.7])
(130.0,[23.0,89.0,5.0])

scala>


// Number of gradient-descent iterations to run over the training set.
val numIterations = 100

// SGD step size. The features are unscaled (age ~25, weight ~80, height ~6);
// with the two-argument train overload (MLlib's default step size, 1.0 per the
// API docs) the updates overshoot and the weights blow up, which is what the
// recorded predictions on the order of -7e301 show. A small explicit step keeps
// the descent stable. 0.0001 is a starting point to tune, not a verified optimum.
val stepSize = 0.0001

// Fit the linear model on the labeled training points.
val model = LinearRegressionWithSGD.train(trainset, numIterations, stepSize)

// Pair every actual label (y) with the model's prediction (ycap) for the
// same feature vector.
val valuesAndPreds = trainset.map { point =>
  (point.label, model.predict(point.features))
}

// valuesAndPreds contains pairs (y, ycap):
//   y is the actual label and ycap is the predicted label.
//   [ "label" means the response variable. ]


scala> valuesAndPreds.collect.foreach(println)
(120.0,-7.150280334821135E301)
(90.0,-5.078652953403039E301)
(130.0,-7.824818198048878E301)
(120.0,-7.176538392878548E301)
(90.0,-5.078652953403039E301)
(130.0,-6.210523036282773E301)
(120.0,-7.309769267081678E301)
(190.0,-5.07989526649868E301)
(130.0,-7.179100133342436E301)
(120.0,-7.820315873668922E301)
(190.0,-5.078652953403039E301)
(130.0,-7.818606632570674E301)


// Mean squared error between the actual labels (y) and the predictions (ycap).
// The arithmetic stays in Double: converting to Int first would drop the
// fractional part, saturate very large predictions at the Int limits, and let
// both the difference and its square overflow, making the result meaningless.
val mse = valuesAndPreds.map { case (y, ycap) =>
  val e = y - ycap
  e * e
}.mean

Continue the trials by increasing
  the number of iterations until you reach
convergence.  [[ i.e. until the mse no longer changes. ]]




// Tag each prediction "Pass" when it lies within +/-20% of the actual label,
// otherwise "Fail", and emit (tag, 1) so the tags can be counted per key.
val acc = valuesAndPreds.map { case (y, ycap) =>
  // Percentage deviation of the prediction from the actual value, computed in
  // Double so that nothing is truncated or overflowed. If y is 0 the division
  // yields an infinite or NaN value (rather than throwing, as Int division
  // would), and the comparison below classifies that row as "Fail".
  val dist = ((y - ycap) * 100) / y

  val stat = if (dist >= -20 && dist <= 20) "Pass" else "Fail"
  (stat, 1)
}

// Number of "Pass" and "Fail" predictions.
val accres = acc.reduceByKey(_ + _)

---------------------------------

If the accuracy is satisfactory,
   apply the model to the predictables (live data):

   model.predict(<dense vector>)
    The dense vector should contain
       only the features (no label).


-----------------------------------





 
     







Thursday, 30 March 2017

R and Analytics Basics3

sankara.deva2016@gmail.com

bharat sreeram, Ventech It Solutions.
-----------------------------------------------------------

# Linear Regression Implementation in R
---------------

df = 
read.csv("C:/Users/Hadoop/Desktop/prof.txt")

> df

    a  w   h   c

1  25 80 5.9 120
2  23 55 5.7  90
3  23 89 6.0 130
4  26 80 5.9 120
5  23 55 5.7  90
6  23 69 6.0 130
7  28 81 5.9 120
8  23 55 5.9 190
9  23 81 6.0 130
10 29 87 5.9 120
11 23 55 5.7 190
12 23 89 5.0 130





# Solve the least-squares normal equations by hand:
#   beta = (X'X)^-1 X'Y

# Response: column c of df as a one-column matrix.
Y = matrix(df$c, ncol=1)

# Design matrix: a constant column of 1s (named "beta", multiplies the
# intercept term) followed by the predictors a, w and h.
Xdf = data.frame(beta = 1, a=df$a, w=df$w,
     h=df$h)
X = data.matrix(Xdf)

# X transposed, then X'X.
Xt = t(X)
XtX = Xt %*% X
# Inverse of X'X.
inv = solve(XtX)
# X'Y.
XtY = Xt %*% Y
# Coefficient vector, one row per column of X (intercept first, then a, w, h).
beta = inv %*% XtY

-----------------------

model = lm(c ~ a + w + h , data=df)
-----------------------

Model testing: [accuracy testing ]

> x
 [1]  10  20  30  40  23  45  67  90 100 150
> y
 [1]  23  45  67  90  50 100 130 190 210 270
> cor(x,y)
[1] 0.9929889
> df = data.frame(x, y)
> lmfit = lm(y ~ x, data=df)

> params = coefficients(lmfit)
> a = params[1]
> b = params[2]

> test = data.frame(y=df$y)
> test$ycap = a + (b*df$x)
> test$dist = ((test$ycap-test$y)*100)/test$y

> test$res = "Fail"
> test$res[test$dist>=-20 & test$dist<=20] = "Pass"
> prop.table(table(test$res))


--------------------------------

non-linear (polynomial) regression implementation:

> dframe = data.frame(x = df$x,
+                    xsq = df$x^2,
+                    xcube = df$x^3,                             y = df$y)
> nlmfit = lm(y ~ x + xsq + xcube, data=dframe)
> p = coefficients(nlmfit)
> b0 = p[1]
> b1 = p[2]
> b2 = p[3]
> b3 = p[4]

> newtest = data.frame(y = dframe$y)
> newtest$ycap = b0 + (b1*dframe$x) +
+                      (b2 * dframe$xsq) +
+                      (b3 * dframe$xcube)

> newtest$dist = ((newtest$ycap-newtest$y)*100)/newtest$y

> newtest$res = "Fail"
> newtest$res[newtest$dist>=-20 & 
          newtest$dist<=20] = "Pass"
> prop.table(table(newtest$res))














R and Analytics Basics 2

sankara.deva2016@gmail.com

bharat sreeram , Ventech It Solutions
-----------------------------------------------------------------------------------------------

> info =

  read.csv("C:/Users/Hadoop/Desktop/info.txt")
> info
   id    name   sal sex city
1 101    Amar 40000   m  hyd
2 102   Amala 50000   f  del
3 103   sunil 70000   m  hyd
4 104 sunitha 80000   f  hyd
5 105  ankith 90000   m  del
6 106 ankitha 60000   f  hyd
>
> class(info)
[1] "data.frame"
> str(info)
 -- structure of info.
'data.frame':   6 obs. of  5 variables:
 $ id  : int  101 102 103 104 105 106
 $ name: Factor w/ 6 levels "Amala","Amar",..: 2 1 5 6 3 4
 $ sal : int  40000 50000 70000 80000 90000 60000
 $ sex : Factor w/ 2 levels "f","m": 2 1 2 1 2 1
 $ city: Factor w/ 2 levels "del","hyd": 2 1 2 2 1 2
>
> info$name
[1] Amar    Amala   sunil   sunitha ankith  ankitha
Levels: Amala Amar ankith ankitha sunil sunitha
> info$sal
[1] 40000 50000 70000 80000 90000 60000
>
-------------------
Updating Column:
 info$sal = info$sal+1000
Generating new Field for Data frame.
 info$tax = info$sal*10/100
  # conditional transformation.
 info$grade = "C"
 info$grade[info$sal>=50000 & info$sal<=80000]    = "B"
 info$grade[info$sal>80000]="A"
> info
   id    name   sal sex city  tax grade
1 101    Amar 41000   m  hyd 4100     C
2 102   Amala 51000   f  del 5100     B
3 103   sunil 71000   m  hyd 7100     B
4 104 sunitha 81000   f  hyd 8100     A
5 105  ankith 91000   m  del 9100     A
6 106 ankitha 61000   f  hyd 6100     B
>
Grouping Aggregations:
--------------------------------
 res1 =
   aggregate( sal ~ sex, data=info, FUN=sum)
> res1
  sex    sal
1   f 193000
2   m 203000
>

 res2 =
   aggregate( sal ~ sex, data=info, FUN=mean)
> res2
  sex      sal
1   f 64333.33
2   m 67666.67

  res3 =
   aggregate( sal ~ sex, data=info, FUN=max)
> res3
  sex   sal
1   f 81000
2   m 91000

 res4 =
   aggregate( sal ~ sex, data=info, FUN=min)
> res4
  sex   sal
1   f 51000
2   m 41000


 res5 =
   aggregate( sal ~ sex, data=info, FUN=length)
> res5
  sex sal
1   f   3
2   m   3

---------------------------
 res6 = aggregate(sal ~ grade, data=info,
   FUN = length)
> res6
  grade sal
1     A   2
2     B   3
3     C   1

-----------------------------------
  select sex, grade, sum(sal) from info
    group by sex, grade;
Grouping by multiple columns of data frame.
 res7 =
   aggregate( sal ~ sex + grade, data=info, FUN=sum)
> res7
  sex grade    sal
1   f     A  81000
2   m     A  91000
3   f     B 112000
4   m     B  71000
5   m     C  41000

-----------------------------------------

 select sex, sum(sal), max(sal),
      min(sal), avg(sal), count(*)
   from info group by sex;
# performing multiple aggregations
#   on each data group.

Performing Multiple Aggregations:
r1 = aggregate(sal ~ sex , data=info , FUN=sum)
r2 = aggregate(sal ~ sex , data=info , FUN=max)
r3 = aggregate(sal ~ sex , data=info , FUN=min)
r4 = aggregate(sal ~ sex , data=info , FUN=mean)
r5 = aggregate(sal ~ sex , data=info , FUN=length)
resall = data.frame(sex=r1$sex,
          tot=r1$sal, avg=r4$sal,
         max= r2$sal, min=r3$sal,
        cnt=r5$sal)
> resall
  sex    tot      avg   max   min cnt
1   f 193000 64333.33 81000 51000   3
2   m 203000 67666.67 91000 41000   3
>
--------------------------------------
Working With Matrices.
 v = 1:15
  m1 = matrix(v, nrow=3)
 m2 = matrix(v, nrow=3 ,byrow=T)
 m3 = matrix(v, ncol=5, byrow=T)
 m4 = matrix(1:12,4)
 Operations on matrices.
 m = matrix(c(10,20,30,40), nrow=2)
 mx = matrix(c(12,34,56,23), nrow=2)
 r1 = m + mx   # element to element sum
 r2 = m * mx   #   "        "       multiplication
 r3 = m %*% mx  # matrix multiplication
 r4 = solve(m) # inverse of matrix.
 r5 = t(m) # transpose of matrix.
-------------------------------------------




























----------------------




























































R and Analytics Basics 1

sankara.deva2016@gmail.com

Bharat sreeram, Ventech It Solutions
-----------------------------------------------------------


 ls()
--- list of R objects.
rm("x")
-- removes object
-----------------------------------------
 a <- 100
 a = 100
 b = 200
 c = a + b
 d = a^3
 e = a**2
 name = "Giri"
  a%%2 ---> remainder.
----------------------------------
>  x = c (10,20,30,40,23,45,67,90)
> x[1] # first element
> length(x) # number of elements
> x[length(x)] # last element
> x[1:3] # first 3 elements
> x[4:length(x)] # 4th onwards
> x[9]=100 # adding 9 th element
> x[length(x)+1]=150 # adding element to last.
--------------------------------
> x
 [1]  10  20  30  40  23  45  67  90 100 150
> y = x + 10
> y
 [1]  20  30  40  50  33  55  77 100 110 160
>
 z = c(1,2,3,4,5)
 u = x + z
 # Element-to-element (element-wise) arithmetic will happen.
 # If the numbers of elements differ, the shorter vector is reused from
 # its start until all elements of the longer vector have been processed.
> sum(x)
[1] 575
> mean(x)
[1] 57.5
> length(x)
[1] 10
> max(x)
[1] 150
> min(x)
[1] 10
> var(x)
[1] 1953.389
> sd(x)
[1] 44.19716
>
-------------------------------------
# Geometric mean of a numeric vector: the n-th root of the product of its
# n elements.
#
# x: numeric vector.
# Returns a single number. For strictly positive input the result is computed
# in log space, exp(sum(log(x)) / n), so long vectors do not overflow to Inf
# (or underflow to 0) the way a running product does. Any other input
# (empty vector, zeros, negatives, NA) falls back to the plain product form.
gm = function(x){
  n = length(x)
  # isTRUE() guards against all() returning NA when x contains NA.
  if (n > 0 && isTRUE(all(x > 0))) {
    exp(sum(log(x)) / n)
  } else {
    prod(x)^(1/n)
  }
 }
# last expression will be returned.
------------------------------------
 hm = function(x) {
     xx = 1/x
     sxx = sum(xx)
     n = length(x)
     n/sxx
 }
--------------------------------
# Population variance: the sum of squared deviations from the mean,
# divided by the number of elements n.
 vpop = function(x){
    centered = x - mean(x)
    sum(centered^2) / length(x)
 }
 # function for standard deviation of Population.
 sdpop = function(x) {
    sqrt(vpop(x))
 }
--------------
 a = 1:10
 # generates an automatic sequence of numbers,
 #   incremented by 1.
 b = 10:1
 c = seq(10,100,5)
 d = seq(100,10,-5)
-----------------------------------------

























----------------------------




.