
sparktk.models.classification.svm module

# vim: set encoding=utf-8

#  Copyright (c) 2016 Intel Corporation 
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from sparktk.loggers import log_load; log_load(__name__); del log_load

from sparktk.propobj import PropertiesObject
from sparktk.frame.ops.classification_metrics_value import ClassificationMetricsValue
from sparktk import TkContext

__all__ = ["train", "load", "SvmModel"]

def train(frame,
          label_column,
          observation_columns,
          intercept = True,
          num_iterations = 100,
          step_size = 1.0,
          reg_type = None,
          reg_param = 0.01,
          mini_batch_fraction = 1.0):
    """
    Creates an SVM model by training on the given frame

    Parameters
    ----------

    :param frame: (Frame) frame of training data
    :param label_column: (str) Column containing the label for each observation
    :param observation_columns: (list(str)) Column(s) containing the observations
    :param intercept: (boolean) Flag indicating if the algorithm adds an intercept. Default is true
    :param num_iterations: (int) Number of iterations for SGD. Default is 100
    :param step_size: (float) Initial step size for SGD optimizer for the first step. Default is 1.0
    :param reg_type: (Optional(str)) Regularization type: "L1" or "L2". Default is "L2"
    :param reg_param: (float) Regularization parameter. Default is 0.01
    :param mini_batch_fraction: (float) Fraction of data to be used for each SGD iteration. Default is 1.0, corresponding to deterministic/classical gradient descent
    :return: (SvmModel) The trained SVM model (with SGD)

    Notes
    -----
    A Support Vector Machine (SVM) is a supervised algorithm used to perform binary classification. An SVM
    constructs a hyperplane in a high-dimensional space; a good separation is achieved by the hyperplane
    that has the largest distance to the nearest training-data point of any class. This model runs the MLlib
    implementation of SVM with the SGD optimizer. The SVM model is initialized, trained on columns of a frame,
    and used to predict the labels of observations in a frame. During testing, labels of the observations are
    predicted and evaluated against the true labels using built-in binary classification metrics.

    """
    if frame is None:
        raise ValueError("frame cannot be None")

    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_model = _scala_obj.train(frame._scala,
                                   label_column,
                                   tc.jutils.convert.to_scala_list_string(observation_columns),
                                   intercept,
                                   num_iterations,
                                   step_size,
                                   tc.jutils.convert.to_scala_option(reg_type),
                                   reg_param,
                                   mini_batch_fraction)

    return SvmModel(tc, scala_model)


def load(path, tc=TkContext.implicit):
    """load SvmModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, SvmModel)


def get_scala_obj(tc):
    """Gets reference to the scala object"""
    return tc.sc._jvm.org.trustedanalytics.sparktk.models.classification.svm.SvmModel


class SvmModel(PropertiesObject):
    """
    A trained SVM model

    Example
    -------
        >>> frame = tc.frame.create([[-48.0,1], [-75.0,1], [-63.0,1], [-57.0,1], [73.0,0], [-33.0,1], [100.0,0],
        ...                          [-54.0,1], [78.0,0], [48.0,0], [-55.0,1], [23.0,0], [45.0,0], [75.0,0]],
        ...                          [("data", float),("label", str)])


        >>> frame.inspect()
        [#]  data   label
        =================
        [0]  -48.0  1
        [1]  -75.0  1
        [2]  -63.0  1
        [3]  -57.0  1
        [4]   73.0  0
        [5]  -33.0  1
        [6]  100.0  0
        [7]  -54.0  1
        [8]   78.0  0
        [9]   48.0  0

        >>> model = tc.models.classification.svm.train(frame, 'label', ['data'])

        >>> model.label_column
        u'label'

        >>> model.observation_columns
        [u'data']

        >>> predicted_frame = model.predict(frame, ['data'])

        >>> predicted_frame.inspect()
        [#]  data   label  predicted_label
        ==================================
        [0]  -48.0  1                    1
        [1]  -75.0  1                    1
        [2]  -63.0  1                    1
        [3]  -57.0  1                    1
        [4]   73.0  0                    0
        [5]  -33.0  1                    1
        [6]  100.0  0                    0
        [7]  -54.0  1                    1
        [8]   78.0  0                    0
        [9]   48.0  0                    0


        >>> test_metrics = model.test(predicted_frame)

        >>> test_metrics
        accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              7              0
        Actual_Neg              0              7
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0

        >>> model.save("sandbox/svm")

        >>> restored = tc.load("sandbox/svm")

        >>> restored.label_column == model.label_column
        True

        >>> restored.intercept == model.intercept
        True

        >>> set(restored.observation_columns) == set(model.observation_columns)
        True

        >>> predicted_frame2 = restored.predict(frame)

        >>> predicted_frame2.inspect()
        [#]  data   label  predicted_label
        ==================================
        [0]  -48.0  1                    1
        [1]  -75.0  1                    1
        [2]  -63.0  1                    1
        [3]  -57.0  1                    1
        [4]   73.0  0                    0
        [5]  -33.0  1                    1
        [6]  100.0  0                    0
        [7]  -54.0  1                    1
        [8]   78.0  0                    0
        [9]   48.0  0                    0

        >>> canonical_path = model.export_to_mar("sandbox/SVM.mar")



    """

    def __init__(self, tc, scala_model):
        self._tc = tc
        tc.jutils.validate_is_jvm_instance_of(scala_model, get_scala_obj(tc))
        self._scala = scala_model

    @staticmethod
    def _from_scala(tc, scala_model):
        return SvmModel(tc, scala_model)

    @property
    def label_column(self):
        """column containing the label used during model training"""
        return self._scala.labelColumn()

    @property
    def observation_columns(self):
        """columns containing the observation values used during model training"""
        return self._tc.jutils.convert.from_scala_seq(self._scala.observationColumns())

    @property
    def intercept(self):
        """intercept used during model training"""
        return self._scala.intercept()

    @property
    def num_iterations(self):
        """max number of iterations allowed during model training"""
        return self._scala.numIterations()

    @property
    def step_size(self):
        """step size value used to train the model"""
        return self._scala.stepSize()

    @property
    def reg_type(self):
        """regularization type used to train the model"""
        return self._tc.jutils.convert.from_scala_option(self._scala.regType())

    @property
    def reg_param(self):
        """regularization parameter used to train the model"""
        return self._scala.regParam()

    @property
    def mini_batch_fraction(self):
        """minimum batch fraction used to train the model"""
        return self._scala.miniBatchFraction()

    def predict(self, frame, columns=None):
        """
        Predicts the labels for the observation columns in the given input frame. Creates a new frame
        with the existing columns and a new predicted column.

        Parameters
        ----------

        :param frame: (Frame) Frame used for predicting the values
        :param columns: (List[str]) Names of the observation columns. Default is the columns used to train the model
        :return: (Frame) A new frame containing the original frame's columns and a prediction column
        """
        c = self.__columns_to_option(columns)
        from sparktk.frame.frame import Frame
        return Frame(self._tc, self._scala.predict(frame._scala, c))

    def test(self, frame, columns=None):
        """test the frame given the trained model"""
        c = self.__columns_to_option(columns)
        return ClassificationMetricsValue(self._tc, self._scala.test(frame._scala, c))

    def __columns_to_option(self, c):
        if c is not None:
            c = self._tc.jutils.convert.to_scala_list_string(c)
        return self._tc.jutils.convert.to_scala_option(c)

    def save(self, path):
        """save the trained model to path"""
        self._scala.save(self._tc._scala_sc, path)

    def export_to_mar(self, path):
        """
        Exports the trained model as a model archive (.mar) to the specified path

        Parameters
        ----------

        :param path: (str) Path to save the trained model
        :return: (str) Full path to the saved .mar file
        """
        if isinstance(path, basestring):
            return self._scala.exportToMar(self._tc._scala_sc, path)


del PropertiesObject

Functions

def load(path, tc=<class 'sparktk.arguments.implicit'>)

load SvmModel from given path

def load(path, tc=TkContext.implicit):
    """load SvmModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, SvmModel)
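
A hedged usage sketch of this module-level loader, assuming a spark-tk context `tc` is available and that a model was previously saved to "sandbox/svm" (as in the SvmModel example below):

    import sparktk.models.classification.svm as svm

    # equivalent in effect to tc.load("sandbox/svm") for a saved SVM model
    restored = svm.load("sandbox/svm", tc)
    print(restored.observation_columns)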

def train(frame, label_column, observation_columns, intercept=True, num_iterations=100, step_size=1.0, reg_type=None, reg_param=0.01, mini_batch_fraction=1.0)

Creates an SVM model by training on the given frame

Parameters:
frame (Frame): frame of training data
label_column (str): Column containing the label for each observation
observation_columns (list(str)): Column(s) containing the observations
intercept (boolean): Flag indicating if the algorithm adds an intercept. Default is true
num_iterations (int): Number of iterations for SGD. Default is 100
step_size (float): Initial step size for SGD optimizer for the first step. Default is 1.0
reg_type (Optional(str)): Regularization type: "L1" or "L2". Default is "L2"
reg_param (float): Regularization parameter. Default is 0.01
mini_batch_fraction (float): Fraction of data to be used for each SGD iteration. Default is 1.0, corresponding to deterministic/classical gradient descent

Returns (SvmModel): The trained SVM model (with SGD)

Notes:

A Support Vector Machine (SVM) is a supervised algorithm used to perform binary classification. An SVM constructs a hyperplane in a high-dimensional space; a good separation is achieved by the hyperplane that has the largest distance to the nearest training-data point of any class. This model runs the MLlib implementation of SVM with the SGD optimizer. The SVM model is initialized, trained on columns of a frame, and used to predict the labels of observations in a frame. During testing, labels of the observations are predicted and evaluated against the true labels using built-in binary classification metrics.

def train(frame,
          label_column,
          observation_columns,
          intercept = True,
          num_iterations = 100,
          step_size = 1.0,
          reg_type = None,
          reg_param = 0.01,
          mini_batch_fraction = 1.0):
    """
    Creates an SVM model by training on the given frame

    Parameters
    ----------

    :param frame: (Frame) frame of training data
    :param label_column: (str) Column containing the label for each observation
    :param observation_columns: (list(str)) Column(s) containing the observations
    :param intercept: (boolean) Flag indicating if the algorithm adds an intercept. Default is true
    :param num_iterations: (int) Number of iterations for SGD. Default is 100
    :param step_size: (float) Initial step size for SGD optimizer for the first step. Default is 1.0
    :param reg_type: (Optional(str)) Regularization type: "L1" or "L2". Default is "L2"
    :param reg_param: (float) Regularization parameter. Default is 0.01
    :param mini_batch_fraction: (float) Fraction of data to be used for each SGD iteration. Default is 1.0, corresponding to deterministic/classical gradient descent
    :return: (SvmModel) The trained SVM model (with SGD)

    Notes
    -----
    A Support Vector Machine (SVM) is a supervised algorithm used to perform binary classification. An SVM
    constructs a hyperplane in a high-dimensional space; a good separation is achieved by the hyperplane
    that has the largest distance to the nearest training-data point of any class. This model runs the MLlib
    implementation of SVM with the SGD optimizer. The SVM model is initialized, trained on columns of a frame,
    and used to predict the labels of observations in a frame. During testing, labels of the observations are
    predicted and evaluated against the true labels using built-in binary classification metrics.

    """
    if frame is None:
        raise ValueError("frame cannot be None")

    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_model = _scala_obj.train(frame._scala,
                                   label_column,
                                   tc.jutils.convert.to_scala_list_string(observation_columns),
                                   intercept,
                                   num_iterations,
                                   step_size,
                                   tc.jutils.convert.to_scala_option(reg_type),
                                   reg_param,
                                   mini_batch_fraction)

    return SvmModel(tc, scala_model)
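
The Notes above summarize what the MLlib SVM-with-SGD training does. Below is a minimal, illustrative NumPy sketch of the underlying idea, not the library's implementation: 0/1 labels (as in the example frame further down) are assumed and mapped to -1/+1, and an L2-regularized hinge loss is minimized by stochastic gradient descent.

    import numpy as np

    def svm_sgd(X, y, num_iterations=100, step_size=1.0, reg_param=0.01):
        """Illustrative hinge-loss SGD for a linear SVM (Pegasos-style updates)."""
        X = np.asarray(X, dtype=float)
        y = np.where(np.asarray(y) == 1, 1.0, -1.0)      # map {0, 1} -> {-1, +1}
        w, b = np.zeros(X.shape[1]), 0.0
        for t in range(1, num_iterations + 1):
            eta = step_size / np.sqrt(t)                  # decaying learning rate
            for xi, yi in zip(X, y):
                if yi * (np.dot(w, xi) + b) < 1.0:        # margin violated: step toward xi
                    w = (1.0 - eta * reg_param) * w + eta * yi * xi
                    b += eta * yi
                else:                                     # margin satisfied: only shrink w
                    w = (1.0 - eta * reg_param) * w
        return w, b

    def svm_predict(X, w, b):
        """Label is the sign of the decision function w . x + b, mapped back to {0, 1}."""
        return (np.dot(np.asarray(X, dtype=float), w) + b >= 0.0).astype(int)

    # e.g. with single-feature data like the example frame below:
    w, b = svm_sgd([[-48.0], [-75.0], [73.0], [100.0]], [1, 1, 0, 0])
    print(svm_predict([[-50.0], [80.0]], w, b))           # expected: [1 0]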

Classes

class SvmModel

A trained SVM model

Example:
>>> frame = tc.frame.create([[-48.0,1], [-75.0,1], [-63.0,1], [-57.0,1], [73.0,0], [-33.0,1], [100.0,0],
...                          [-54.0,1], [78.0,0], [48.0,0], [-55.0,1], [23.0,0], [45.0,0], [75.0,0]],
...                          [("data", float),("label", str)])


>>> frame.inspect()
[#]  data   label
=================
[0]  -48.0  1
[1]  -75.0  1
[2]  -63.0  1
[3]  -57.0  1
[4]   73.0  0
[5]  -33.0  1
[6]  100.0  0
[7]  -54.0  1
[8]   78.0  0
[9]   48.0  0

>>> model = tc.models.classification.svm.train(frame, 'label', ['data'])

>>> model.label_column
u'label'

>>> model.observation_columns
[u'data']

>>> predicted_frame = model.predict(frame, ['data'])

>>> predicted_frame.inspect()
[#]  data   label  predicted_label
==================================
[0]  -48.0  1                    1
[1]  -75.0  1                    1
[2]  -63.0  1                    1
[3]  -57.0  1                    1
[4]   73.0  0                    0
[5]  -33.0  1                    1
[6]  100.0  0                    0
[7]  -54.0  1                    1
[8]   78.0  0                    0
[9]   48.0  0                    0


>>> test_metrics = model.test(predicted_frame)

>>> test_metrics
accuracy         = 1.0
confusion_matrix =             Predicted_Pos  Predicted_Neg
Actual_Pos              7              0
Actual_Neg              0              7
f_measure        = 1.0
precision        = 1.0
recall           = 1.0

>>> model.save("sandbox/svm")

>>> restored = tc.load("sandbox/svm")

>>> restored.label_column == model.label_column
True

>>> restored.intercept == model.intercept
True

>>> set(restored.observation_columns) == set(model.observation_columns)
True

>>> predicted_frame2 = restored.predict(frame)

>>> predicted_frame2.inspect()
[#]  data   label  predicted_label
==================================
[0]  -48.0  1                    1
[1]  -75.0  1                    1
[2]  -63.0  1                    1
[3]  -57.0  1                    1
[4]   73.0  0                    0
[5]  -33.0  1                    1
[6]  100.0  0                    0
[7]  -54.0  1                    1
[8]   78.0  0                    0
[9]   48.0  0                    0

>>> canonical_path = model.export_to_mar("sandbox/SVM.mar")
class SvmModel(PropertiesObject):
    """
    A trained SVM model

    Example
    -------
        >>> frame = tc.frame.create([[-48.0,1], [-75.0,1], [-63.0,1], [-57.0,1], [73.0,0], [-33.0,1], [100.0,0],
        ...                          [-54.0,1], [78.0,0], [48.0,0], [-55.0,1], [23.0,0], [45.0,0], [75.0,0]],
        ...                          [("data", float),("label", str)])


        >>> frame.inspect()
        [#]  data   label
        =================
        [0]  -48.0  1
        [1]  -75.0  1
        [2]  -63.0  1
        [3]  -57.0  1
        [4]   73.0  0
        [5]  -33.0  1
        [6]  100.0  0
        [7]  -54.0  1
        [8]   78.0  0
        [9]   48.0  0

        >>> model = tc.models.classification.svm.train(frame, 'label', ['data'])

        >>> model.label_column
        u'label'

        >>> model.observation_columns
        [u'data']

        >>> predicted_frame = model.predict(frame, ['data'])

        >>> predicted_frame.inspect()
        [#]  data   label  predicted_label
        ==================================
        [0]  -48.0  1                    1
        [1]  -75.0  1                    1
        [2]  -63.0  1                    1
        [3]  -57.0  1                    1
        [4]   73.0  0                    0
        [5]  -33.0  1                    1
        [6]  100.0  0                    0
        [7]  -54.0  1                    1
        [8]   78.0  0                    0
        [9]   48.0  0                    0


        >>> test_metrics = model.test(predicted_frame)

        >>> test_metrics
        accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              7              0
        Actual_Neg              0              7
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0

        >>> model.save("sandbox/svm")

        >>> restored = tc.load("sandbox/svm")

        >>> restored.label_column == model.label_column
        True

        >>> restored.intercept == model.intercept
        True

        >>> set(restored.observation_columns) == set(model.observation_columns)
        True

        >>> predicted_frame2 = restored.predict(frame)

        >>> predicted_frame2.inspect()
        [#]  data   label  predicted_label
        ==================================
        [0]  -48.0  1                    1
        [1]  -75.0  1                    1
        [2]  -63.0  1                    1
        [3]  -57.0  1                    1
        [4]   73.0  0                    0
        [5]  -33.0  1                    1
        [6]  100.0  0                    0
        [7]  -54.0  1                    1
        [8]   78.0  0                    0
        [9]   48.0  0                    0

        >>> canonical_path = model.export_to_mar("sandbox/SVM.mar")



    """

    def __init__(self, tc, scala_model):
        self._tc = tc
        tc.jutils.validate_is_jvm_instance_of(scala_model, get_scala_obj(tc))
        self._scala = scala_model

    @staticmethod
    def _from_scala(tc, scala_model):
        return SvmModel(tc, scala_model)

    @property
    def label_column(self):
        """column containing the label used during model training"""
        return self._scala.labelColumn()

    @property
    def observation_columns(self):
        """columns containing the observation values used during model training"""
        return self._tc.jutils.convert.from_scala_seq(self._scala.observationColumns())

    @property
    def intercept(self):
        """intercept used during model training"""
        return self._scala.intercept()

    @property
    def num_iterations(self):
        """max number of iterations allowed during model training"""
        return self._scala.numIterations()

    @property
    def step_size(self):
        """step size value used to train the model"""
        return self._scala.stepSize()

    @property
    def reg_type(self):
        """regularization type used to train the model"""
        return self._tc.jutils.convert.from_scala_option(self._scala.regType())

    @property
    def reg_param(self):
        """regularization parameter used to train the model"""
        return self._scala.regParam()

    @property
    def mini_batch_fraction(self):
        """minimum batch fraction used to train the model"""
        return self._scala.miniBatchFraction()

    def predict(self, frame, columns=None):
        """
        Predicts the labels for the observation columns in the given input frame. Creates a new frame
        with the existing columns and a new predicted column.

        Parameters
        ----------

        :param frame: (Frame) Frame used for predicting the values
        :param columns: (List[str]) Names of the observation columns. Default is the columns used to train the model
        :return: (Frame) A new frame containing the original frame's columns and a prediction column
        """
        c = self.__columns_to_option(columns)
        from sparktk.frame.frame import Frame
        return Frame(self._tc, self._scala.predict(frame._scala, c))

    def test(self, frame, columns=None):
        """test the frame given the trained model"""
        c = self.__columns_to_option(columns)
        return ClassificationMetricsValue(self._tc, self._scala.test(frame._scala, c))

    def __columns_to_option(self, c):
        if c is not None:
            c = self._tc.jutils.convert.to_scala_list_string(c)
        return self._tc.jutils.convert.to_scala_option(c)

    def save(self, path):
        """save the trained model to path"""
        self._scala.save(self._tc._scala_sc, path)

    def export_to_mar(self, path):
        """
        Exports the trained model as a model archive (.mar) to the specified path

        Parameters
        ----------

        :param path: (str) Path to save the trained model
        :return: (str) Full path to the saved .mar file
        """
        if isinstance(path, basestring):
            return self._scala.exportToMar(self._tc._scala_sc, path)

Ancestors (in MRO)

  • SvmModel
  • sparktk.propobj.PropertiesObject
  • __builtin__.object

Instance variables

var intercept

intercept used during model training

var label_column

column containing the label used during model training

var mini_batch_fraction

mini-batch fraction used to train the model

var num_iterations

max number of iterations allowed during model training

var observation_columns

columns containing the observation values used during model training

var reg_param

regularization parameter used to train the model

var reg_type

regularization type used to train the model

var step_size

step size value used to train the model

Methods

def __init__(self, tc, scala_model)

def __init__(self, tc, scala_model):
    self._tc = tc
    tc.jutils.validate_is_jvm_instance_of(scala_model, get_scala_obj(tc))
    self._scala = scala_model

def export_to_mar(self, path)

Exports the trained model as a model archive (.mar) to the specified path

Parameters:
path (str): Path to save the trained model

Returns (str): Full path to the saved .mar file

def export_to_mar(self, path):
    """
    Exports the trained model as a model archive (.mar) to the specified path
    Parameters
    ----------
    :param path: (str) Path to save the trained model
    :return: (str) Full path to the saved .mar file
    """
    if isinstance(path, basestring):
        return self._scala.exportToMar(self._tc._scala_sc, path)
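
A hedged usage sketch, reusing `model` from the class example above. Note, as visible in the source, that a non-string `path` makes the method return None without exporting anything:

    mar_path = model.export_to_mar("sandbox/SVM.mar")
    assert mar_path is not None and mar_path.endswith(".mar")
    print(mar_path)   # full path to the written model archive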

def predict(self, frame, columns=None)

Predicts the labels for the observation columns in the given input frame. Creates a new frame with the existing columns and a new predicted column.

Parameters:
frame (Frame): Frame used for predicting the values
columns (List[str]): Names of the observation columns. Default is the columns used to train the model

Returns (Frame): A new frame containing the original frame's columns and a prediction column

def predict(self, frame, columns=None):
    """
    Predicts the labels for the observation columns in the given input frame. Creates a new frame
    with the existing columns and a new predicted column.
    Parameters
    ----------
    :param frame: (Frame) Frame used for predicting the values
    :param columns: (List[str]) Names of the observation columns. Default is the columns used to train the model
    :return: (Frame) A new frame containing the original frame's columns and a prediction column
    """
    c = self.__columns_to_option(columns)
    from sparktk.frame.frame import Frame
    return Frame(self._tc, self._scala.predict(frame._scala, c))
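
A hedged usage sketch, reusing `model` and `frame` from the class example above. When `columns` is omitted, the observation columns used at training time are assumed to apply (as in `restored.predict(frame)` in the example):

    # explicit observation columns
    predicted = model.predict(frame, columns=['data'])

    # columns omitted: falls back to the model's observation_columns
    predicted_default = model.predict(frame)
    predicted_default.inspect()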

def save(self, path)

save the trained model to path

def save(self, path):
    """save the trained model to path"""
    self._scala.save(self._tc._scala_sc, path)

def test(self, frame, columns=None)

test the frame given the trained model

def test(self, frame, columns=None):
    """test the frame given the trained model"""
    c = self.__columns_to_option(columns)
    return ClassificationMetricsValue(self._tc, self._scala.test(frame._scala, c))
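
A hedged usage sketch, reusing `model` and `frame` from the class example above. The returned ClassificationMetricsValue exposes the fields shown in the example output (accuracy, precision, recall, f_measure, confusion_matrix):

    metrics = model.test(frame, columns=['data'])
    print(metrics.accuracy)                               # 1.0 in the example above
    print(metrics.precision, metrics.recall, metrics.f_measure)
    print(metrics.confusion_matrix)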

def to_dict(self)

def to_dict(self):
    d = self._properties()
    d.update(self._attributes())
    return d

def to_json(self)

def to_json(self):
    return json.dumps(self.to_dict())
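
Both methods are inherited from PropertiesObject: judging from the source above, to_dict collects the model's readable properties and attributes into a dictionary, and to_json serializes that dictionary. A hedged sketch, reusing `model` from the class example:

    d = model.to_dict()
    print(sorted(d.keys()))    # expected to include e.g. 'label_column', 'reg_param', ...
    print(model.to_json())     # the same information as a JSON string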