sparktk.models.clustering.lda module
# vim: set encoding=utf-8
# Copyright (c) 2016 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from sparktk.loggers import log_load; log_load(__name__); del log_load
from sparktk.propobj import PropertiesObject
from sparktk.lazyloader import implicit
__all__ = ["train", "load", "LdaModel"]
def train(frame,
document_column_name,
word_column_name,
word_count_column_name,
max_iterations = 20,
alpha = None,
beta = 1.1,
num_topics = 10,
seed = None,
check_point_interval = 10):
"""
Creates an LdaModel by training on the given frame.
See the discussion about `Latent Dirichlet Allocation at Wikipedia <http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>`_.
:param frame: (Frame) Input frame data
:param document_column_name: (str) Column name for documents. Column should contain a str value.
:param word_column_name: (str) Column name for words. Column should contain a str value.
:param word_count_column_name: (str) Column name for word count. Column should contain an int32 or int64 value.
:param max_iterations: (int) The maximum number of iterations that the algorithm will execute.
The valid value range is all positive int. Default is 20.
:param alpha: (Optional(List(float))) The :term:`hyperparameter` for the document-specific distribution over topics,
mainly used as a smoothing parameter in :term:`Bayesian inference`.
If set to the singleton list List(-1d), then docConcentration is set automatically.
If set to a singleton list List(t) where t != -1, then t is replicated to a vector of length k during
LDAOptimizer.initialize(). Otherwise, alpha must have length k.
Currently the EM optimizer only supports symmetric distributions, so all values in the vector should be the same.
Values should be greater than 1.0. Default is None, which selects the value automatically
(equivalent to passing List(-1d)). An example call is sketched after this function.
:param beta: (float) The :term:`hyperparameter` for the word-specific distribution over topics,
mainly used as a smoothing parameter in :term:`Bayesian inference`.
A smaller value implies that topics are more concentrated on a small
subset of words.
The valid value range is all positive floats greater than or equal to 1.
Default is 1.1.
:param num_topics: (int) The number of topics to identify in the LDA model.
Using fewer topics will speed up the computation, but the extracted topics
might be more abstract or less specific; using more topics will
result in more computation but lead to more specific topics.
Valid value range is all positive int.
Default is 10.
:param seed: (Optional(long)) An optional random seed.
The random seed is used to initialize the pseudorandom number generator
used in the LDA model. Setting the random seed to the same value every
time the model is trained allows LDA to generate the same topic distribution
if the corpus and LDA parameters are unchanged.
:param check_point_interval: (int) Period (in iterations) between checkpoints (default = 10).
Checkpointing helps with recovery (when nodes fail). It also helps with eliminating
temporary shuffle files on disk, which can be important when LDA is run for many
iterations. If the checkpoint directory is not set, this setting is ignored.
:return: (LdaModel) Trained LDA model
"""
if frame is None:
raise ValueError("frame cannot be None")
tc = frame._tc
_scala_obj = get_scala_obj(tc)
scala_alpha = tc.jutils.convert.to_scala_option_list_double(alpha)
seed = seed if seed is None else long(seed)
scala_seed = tc.jutils.convert.to_scala_option(seed)
scala_model = _scala_obj.train(frame._scala,
document_column_name,
word_column_name,
word_count_column_name,
max_iterations,
scala_alpha,
beta,
num_topics,
scala_seed,
check_point_interval)
return LdaModel(tc, scala_model)
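# A hedged usage sketch (comments only; not executed at import), assuming a
# TkContext `tc` and the example frame from the LdaModel docstring below.
# With num_topics=2 and the EM optimizer, an explicit alpha must be a
# symmetric list of length 2 with values > 1.0, and the Spark checkpoint
# directory must be set for check_point_interval to take effect. All values
# here are illustrative only.
#
#   >>> tc.sc.setCheckpointDir("sandbox/checkpoints")
#   >>> model = tc.models.clustering.lda.train(frame, 'doc_id', 'word_id', 'word_count',
#   ...                                        alpha=[26.0, 26.0], num_topics=2,
#   ...                                        seed=42, check_point_interval=5)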
def get_scala_obj(tc):
"""Gets reference to the scala object"""
return tc.sc._jvm.org.trustedanalytics.sparktk.models.clustering.lda.LdaModel
def load(path, tc=implicit):
"""load LdaModel from given path"""
if tc is implicit:
implicit.error("tc")
return tc.load(path, LdaModel)
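# A minimal sketch of loading a saved model with an explicit TkContext
# (assumes a model was previously saved to "sandbox/lda", as in the
# LdaModel docstring example below):
#
#   >>> from sparktk.models.clustering.lda import load
#   >>> restored = load("sandbox/lda", tc)
#   >>> restored.num_topics
#   2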
class LdaModel(PropertiesObject):
"""
A trained LDA model
Example
-------
>>> frame = tc.frame.create([['nytimes','harry',3], ['nytimes','economy',35], ['nytimes','jobs',40], ['nytimes','magic',1],
... ['nytimes','realestate',15], ['nytimes','movies',6],['economist','economy',50],
... ['economist','jobs',35], ['economist','realestate',20],['economist','movies',1],
... ['economist','harry',1],['economist','magic',1],['harrypotter','harry',40],
... ['harrypotter','magic',30],['harrypotter','chamber',20],['harrypotter','secrets',30]],
... [('doc_id', str), ('word_id', str), ('word_count', long)])
>>> frame.inspect()
[#] doc_id word_id word_count
======================================
[0] nytimes harry 3
[1] nytimes economy 35
[2] nytimes jobs 40
[3] nytimes magic 1
[4] nytimes realestate 15
[5] nytimes movies 6
[6] economist economy 50
[7] economist jobs 35
[8] economist realestate 20
[9] economist movies 1
>>> model = tc.models.clustering.lda.train(frame, 'doc_id', 'word_id', 'word_count', max_iterations = 3, num_topics = 2)
>>> print model.report
======Graph Statistics======
Number of vertices: 11} (doc: 3, word: 8})
Number of edges: 16
<BLANKLINE>
======LDA Configuration======
numTopics: 2
alpha: 26.0
beta: 1.100000023841858
maxIterations: 3
<BLANKLINE>
>>> model.document_column_name
u'doc_id'
>>> model.word_column_name
u'word_id'
>>> model.max_iterations
3
>>> model.training_data_row_count
16L
>>> model.check_point_interval
10
>>> model.topics_given_doc_frame.schema
[(u'doc_id', <type 'unicode'>), (u'topic_probabilities', vector(2))]
>>> model.topics_given_doc_frame.inspect(columns = ['doc_id'])
[#] doc_id topic_probabilities
===========================================================
[0] harrypotter [0.06417509902256538, 0.9358249009774346]
[1] economist [0.8065841283073141, 0.19341587169268581]
[2] nytimes [0.855316939742769, 0.14468306025723088]
>>> model.word_given_topics_frame.inspect()
[#] word_id topic_probabilities
=============================================================
[0] harry [0.005015572372943657, 0.2916109787103347]
[1] realestate [0.167941871746252, 0.032187084858186256]
[2] secrets [0.026543839878055035, 0.17103864163730945]
[3] movies [0.03704750433384287, 0.003294403360133419]
[4] magic [0.016497495727347045, 0.19676900962555072]
[5] economy [0.3805836266747442, 0.10952481503975171]
[6] chamber [0.0035944004256137523, 0.13168123398523954]
[7] jobs [0.36277568884120137, 0.06389383278349432]
>>> model.topics_given_word_frame.inspect()
[#] word_id topic_probabilities
===========================================================
[0] harry [0.018375903962878668, 0.9816240960371213]
[1] realestate [0.8663322126823493, 0.13366778731765067]
[2] secrets [0.15694172611285945, 0.8430582738871405]
[3] movies [0.9444179131148587, 0.055582086885141324]
[4] magic [0.09026309091077593, 0.9097369090892241]
[5] economy [0.8098866029287505, 0.19011339707124958]
[6] chamber [0.0275551649439219, 0.9724448350560781]
[7] jobs [0.8748608515169193, 0.12513914848308066]
>>> prediction = model.predict(['harry', 'secrets', 'magic', 'harry', 'chamber' 'test'])  # adjacent strings concatenate: the last item is the unseen word 'chambertest'
>>> prediction
{u'topics_given_doc': [0.3149285399451628, 0.48507146005483726], u'new_words_percentage': 20.0, u'new_words_count': 1}
>>> prediction['topics_given_doc']
[0.3149285399451628, 0.48507146005483726]
>>> prediction['new_words_percentage']
20.0
>>> prediction['new_words_count']
1
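The index of the most probable topic for the new document can be read off
directly (a hedged illustration; in this run topic 1 dominates):

>>> prediction['topics_given_doc'].index(max(prediction['topics_given_doc']))
1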
>>> 'topics_given_doc' in prediction
True
>>> 'new_words_percentage' in prediction
True
>>> 'new_words_count' in prediction
True
>>> model.save("sandbox/lda")
>>> restored = tc.load("sandbox/lda")
>>> restored.document_column_name == model.document_column_name
True
>>> restored.check_point_interval == model.check_point_interval
True
>>> restored.max_iterations == model.max_iterations
True
>>> restored.topics_given_doc_frame.schema
[(u'doc_id', <type 'unicode'>), (u'topic_probabilities', vector(2))]
>>> restored.topics_given_doc_frame.inspect()
[#] doc_id topic_probabilities
===========================================================
[0] harrypotter [0.06417509902256538, 0.9358249009774346]
[1] economist [0.8065841283073141, 0.19341587169268581]
[2] nytimes [0.855316939742769, 0.14468306025723088]
>>> prediction2 = restored.predict(['harry', 'secrets', 'magic', 'harry', 'chamber' 'test'])  # adjacent strings concatenate: the last item is the unseen word 'chambertest'
>>> prediction2
{u'topics_given_doc': [0.3149285399451628, 0.48507146005483726], u'new_words_percentage': 20.0, u'new_words_count': 1}
>>> prediction2['topics_given_doc']
[0.3149285399451628, 0.48507146005483726]
>>> prediction2['new_words_percentage']
20.0
>>> prediction2['new_words_count']
1
>>> 'topics_given_doc' in prediction2
True
>>> 'new_words_percentage' in prediction2
True
>>> 'new_words_count' in prediction2
True
>>> canonical_path = model.export_to_mar("sandbox/lda.mar")
"""
def __init__(self, tc, scala_model):
self._tc = tc
tc.jutils.validate_is_jvm_instance_of(scala_model, get_scala_obj(tc))
self._scala = scala_model
@staticmethod
def _from_scala(tc, scala_model):
return LdaModel(tc, scala_model)
@property
def document_column_name(self):
"""Column Name for documents"""
return self._scala.documentColumnName()
@property
def word_column_name(self):
"""Column name for words"""
return self._scala.wordColumnName()
@property
def word_count_column_name(self):
"""Column name for word count"""
return self._scala.wordCountColumnName()
@property
def max_iterations(self):
"""The maximum number of iterations that the algorithm could have executed"""
return self._scala.maxIterations()
@property
def alpha(self):
"""Hyperparameter for document-specific distribution over topics"""
s = self._tc.jutils.convert.from_scala_option(self._scala.alpha())
if s:
return list(self._tc.jutils.convert.from_scala_seq(s))
return None
@property
def beta(self):
"""Hyperparameter for word-specific distribution over topics"""
return self._scala.beta()
@property
def num_topics(self):
"""Number of topics to identify in the LdaModel"""
return self._scala.numTopics()
@property
def seed(self):
"""Random seed used to train the model"""
return self._tc.jutils.convert.from_scala_option(self._scala.seed())
@property
def check_point_interval(self):
"""Checkpoint Interval used to train the model"""
return self._scala.checkPointInterval()
@property
def report(self):
"""Summary Report of the Training"""
return self._scala.report()
@property
def topics_given_doc_frame(self):
"""Frame for topics given document"""
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.topicsGivenDocFrame())
@property
def word_given_topics_frame(self):
"""Frame for word given topics"""
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.wordGivenTopicsFrame())
@property
def topics_given_word_frame(self):
"""Frame for topics given word"""
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.topicsGivenWordFrame())
@property
def training_data_row_count(self):
"""Row count of the frame used to train this model"""
return self._scala.trainingDataRowCount()
def predict(self, documents):
"""Predict topic probabilities for the documents given the trained model"""
scala_documents = self._tc.jutils.convert.to_scala_list(documents)
predicted_output = self._scala.predict(scala_documents)
topics_given_doc = self._tc.jutils.convert.from_scala_seq(predicted_output.topicsGivenDoc())
new_words_count = predicted_output.newWordsCount()
new_words_percentage = predicted_output.newWordsPercentage()
return {u"topics_given_doc":topics_given_doc,
u"new_words_count":new_words_count,
u"new_words_percentage":new_words_percentage}
def save(self, path):
"""Save the trained model"""
self._scala.save(self._tc._scala_sc, path)
def export_to_mar(self, path):
""" export the trained model to MAR format for Scoring Engine """
if isinstance(path, basestring):
return self._scala.exportToMar(self._tc._scala_sc, path)
del PropertiesObject