Appendix A — Sample Application Configuration File

# BEGIN REQUIRED SETTINGS

trustedanalytics.atk {
#bind address - change to 0.0.0.0 to listen on all interfaces
//api.host = "127.0.0.1"

#bind port
//api.port = 9099

# The host name for the PostgreSQL database in which the metadata will be stored
metastore.connection-postgresql.host = "localhost"
metastore.connection-postgresql.port = "5432"
metastore.connection-postgresql.database = "ta_metastore"
metastore.connection-postgresql.username = "atkuser"
metastore.connection-postgresql.password = "MyPassword"
metastore.connection-postgresql.url = "jdbc:postgresql://"${trustedanalytics.atk.metastore.connection-postgresql.host}":"${trustedanalytics.atk.metastore.connection-postgresql.port}"/"${trustedanalytics.atk.metastore.connection-postgresql.database}

# This uses PostgreSQL as the metastore.
# Service restarts will not affect the data stored in PostgreSQL.
metastore.connection = ${trustedanalytics.atk.metastore.connection-postgresql}

# This uses an in-memory H2 data store instead.
# Restarting the REST server will create a fresh database, and any
# data in the H2 database will be lost.
//metastore.connection = ${trustedanalytics.atk.metastore.connection-h2}
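# To switch to the in-memory store, comment out the postgresql
# metastore.connection line above and uncomment the h2 line.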

engine {

    # The hdfs URL where the trustedanalytics folder will be created
    # and which will be used as the starting point for any relative URLs
    fs.root = "hdfs://master.silvern.gao.cluster:8020/user/atkuser"

    # Comma-separated (no spaces) list of the host names
    # that have the ZooKeeper role assigned
    titan.load.storage.hostname = "node01,node02,node03"

    # Titan storage backend.
    # Available options are hbase and cassandra.
    # The default is hbase.
    //titan.load.storage.backend = "hbase"

    # Titan storage port, defaults to 2181 for HBase ZooKeeper.
    # Use 9160 for Cassandra.
    titan.load.storage.port = "2181"

    # The URL for connecting to the Spark master server,
    # or "yarn-client" to run Spark on YARN
    #spark.master = "spark://master.silvern.gao.cluster:7077"
    spark.master = "yarn-client"
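
    # Other values Spark accepts for spark.master:
    #   "local[4]"          - local mode with 4 worker threads (development only)
    #   "spark://host:7077" - a standalone Spark master
    #   "yarn-client"       - run on YARN with the driver in the client process
    #   "yarn-cluster"      - run on YARN with the driver inside the cluster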


    spark.conf.properties {
        # Memory should be the same as or lower than what is listed as
        # available in Cloudera Manager.
        # Values should generally be given in gigabytes, e.g. "64g".
        spark.executor.memory = "96g"
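        # Note: a raw byte count such as 103079215104 (= 96 GiB) is also
        # accepted; Spark interprets a suffix-less number as bytes.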
    }
}

}
# END REQUIRED SETTINGS

# The settings below are all optional.
# Some may need to be configured depending on the
# specifics of your cluster and workload.

trustedanalytics.atk {
  engine {
    auto-partitioner {
      # auto-partitioning spark based on the file size
      file-size-to-partition-size = [{ upper-bound="1MB", partitions = 15 },
                                       { upper-bound="1GB", partitions = 45 },
                                       { upper-bound="5GB", partitions = 100 },
                                       { upper-bound="10GB", partitions = 200 },
                                       { upper-bound="15GB", partitions = 375 },
                                       { upper-bound="25GB", partitions = 500 },
                                       { upper-bound="50GB", partitions = 750 },
                                       { upper-bound="100GB", partitions = 1000 },
                                       { upper-bound="200GB", partitions = 1500 },
                                       { upper-bound="300GB", partitions = 2000 },
                                       { upper-bound="400GB", partitions = 2500 },
                                       { upper-bound="600GB", partitions = 3750 }]
      # max-partitions is used when the file size is above the largest upper-bound
      max-partitions = 10000
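
      # Example: a 120GB input falls under the 200GB upper bound and is
      # read with 1500 partitions; a 700GB input exceeds every bound and
      # uses max-partitions (10000).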
      }
    }

    # Configuration for the Trusted Analytics ATK REST API server
    api {
      # this is reported by the API server in the /info results -
      # it can be used to identify a particular server or cluster.
      //identifier = "ta"

      #The default page size for result pagination
      //default-count = 20

      #Timeout for waiting for results from the engine
      //default-timeout = 30s

      #HTTP request timeout for the REST server
      //request-timeout = 29s
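
      # Note: request-timeout (29s) presumably sits just below
      # default-timeout (30s) so the HTTP layer can respond before the
      # engine wait expires, instead of dropping the connection.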
    }

    # Configuration for the processing engine
    engine {
      //default-timeout = 30s
      //page-size = 1000

    spark {

      # When master is empty, the system defaults to spark://`hostname`:7077,
      # where hostname is determined from the current system.
      # For local mode (useful only for development testing) set master = "local[4]"
      # in cluster mode, set master and home like the example
      # master = "spark://MASTER_HOSTNAME:7077"
      # home = "/opt/cloudera/parcels/CDH/lib/spark"

      # When home is empty the system will check expected locations on the
      # local system and use the first one it finds.
      # If spark is running in yarn-cluster mode (spark.master = "yarn-cluster"),
      # spark.home needs to be set to the spark directory on CDH cluster
      # ("/usr/lib/spark","/opt/cloudera/parcels/CDH/lib/spark/", etc)
      //home = ""

      conf {
        properties {
          # These key/value pairs will be parsed dynamically and provided
          # to SparkConf().
          # See Spark docs for possible values
          # http://spark.apache.org/docs/0.9.0/configuration.html.
          # All values should be convertible to Strings.

          #Examples of other useful properties to edit for performance tuning:

          # Increased Akka frame size from default of 10MB to 100MB to
          # allow tasks to send large results to Spark driver
          # (e.g., using collect() on large datasets).
          //spark.akka.frameSize=100

          #spark.akka.retry.wait=30000
          #spark.akka.timeout=200
          #spark.akka.timeout=30000

          //spark.shuffle.consolidateFiles=true

          # Enabling RDD compression to save space (might increase CPU cycles)
          # Snappy compression is more efficient
          //spark.rdd.compress=true
          //spark.io.compression.codec=org.apache.spark.io.SnappyCompressionCodec

          #spark.storage.blockManagerHeartBeatMs=300000
          #spark.storage.blockManagerSlaveTimeoutMs=300000

          #spark.worker.timeout=600
          #spark.worker.timeout=30000
          spark.eventLog.enabled=true
          spark.eventLog.dir="hdfs://master.silvern.gao.cluster:8020/user/spark/applicationHistory"
        }

      }
    }

    giraph {
      #Overrides of normal Hadoop settings that are used when running Giraph jobs
      giraph.maxWorkers = 30
      //giraph.minWorkers = 1
      //giraph.SplitMasterWorker = true
      mapreduce.map.memory.mb = 4096
      mapreduce.map.java.opts = "-Xmx3072m"
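      # Note: the map JVM heap (-Xmx3072m) is kept below the container
      # size (4096 MB) to leave headroom for non-heap memory such as
      # thread stacks and native buffers.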
      //giraph.zkIsExternal = false
    }


    titan {
      load {
        # Documentation for these settings is available on the Titan website:
        # http://s3.thinkaurelius.com/docs/titan/current/titan-config-ref.html
        storage {

          # Whether to enable batch loading into the storage backend.
          # Set to true for bulk loads.
          //batch-loading = true

          # Size of the batch in which mutations are persisted.
          //buffer-size = 2048

          lock {
            # Number of milliseconds the system waits for a lock application
            # to be acknowledged by the storage backend.
            //wait-time = 400

            # Number of times the system attempts to acquire a lock before
            # giving up and throwing an exception.
            //retries = 15
          }

          hbase {
            # Pre-split settings for large datasets
            //region-count = 12
            //compression-algorithm = "SNAPPY"
          }

          cassandra {
            # Cassandra configuration options
          }
        }

        ids {
          # Globally reserve graph element IDs in chunks of this size.
          # Setting this too low will make commits
          # frequently block on slow reservation requests.
          # Setting it too high will result in IDs wasted when a graph
          # instance shuts down with reserved but mostly-unused blocks.
          //block-size = 300000

          # Number of partition blocks to allocate for placement of vertices.
          //num-partitions = 10

          # The number of milliseconds that the Titan id pool manager will
          # wait before giving up on allocating a new block of ids.
          //renew-timeout = 150000

          # When true, vertices and edges are assigned IDs immediately upon
          # creation.
          # When false, IDs are assigned only when the transaction commits.
          # Must be disabled for graph partitioning to work.
          //flush = true

          authority {
            # This setting helps separate Titan instances sharing a single
            # graph storage backend avoid contention when reserving ID
            # blocks, increasing overall throughput.
            # The options available are:
            # NONE = Default in Titan
            # LOCAL_MANUAL = Expert feature: user manually assigns each
            # Titan instance a unique conflict avoidance tag in its local
            # graph configuration.
            # GLOBAL_MANUAL = User assigns a tag to each Titan instance.
            # The tags should be globally unique for optimal performance,
            # but duplicates will not compromise correctness.
            # GLOBAL_AUTO = Titan randomly selects a tag from the space of
            # all possible tags when performing allocations.
            //conflict-avoidance-mode = "GLOBAL_AUTO"

            # The number of milliseconds the system waits for an ID block
            # reservation to be acknowledged by the storage backend.
            //wait-time = 300

            # Number of times the system attempts ID block reservations
            # with random conflict avoidance tags
            # before giving up and throwing an exception.
            //randomized-conflict-avoidance-retries = 10
          }
        }

        auto-partitioner {
          hbase {
            # Number of regions per regionserver to set when creating
            # Titan/HBase table.
            regions-per-server = 2
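
            # Example: on a cluster with 10 region servers, this setting
            # pre-splits the Titan table into 20 regions.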

            # The number of input splits for the Titan reader is based on the
            # number of available Spark cores and the minimum split size:
            #   splits = min(input-splits-per-spark-core * spark-cores,
            #                graph size in HBase / minimum-input-splits-size-mb)
            input-splits-per-spark-core = 20
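
            # Worked example (hypothetical numbers): with 50 Spark cores and
            # a 128GB graph at a 64MB minimum split size, the reader uses
            # min(20 * 50, 131072 / 64) = min(1000, 2048) = 1000 splits.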
          }

          enable = true
        }
      }

      query {
        storage {
          # query reuses the storage settings (backend, hostname, port)
          # defined in titan.load
          backend = ${trustedanalytics.atk.engine.titan.load.storage.backend}
          hostname =  ${trustedanalytics.atk.engine.titan.load.storage.hostname}
          port =  ${trustedanalytics.atk.engine.titan.load.storage.port}
        }
        cache {
          # Adjust cache size parameters if you experience OutOfMemory
          # errors during Titan queries.
          # Either increase heap allocation for TrustedAnalytics Engine, or
          # reduce db-cache-size.
          # Reducing db-cache-size will result in cache misses and
          # increased reads from disk.
          //db-cache = true
          //db-cache-clean-wait = 20
          //db-cache-time = 180000
          #Allocates 30% of available heap to Titan (default is 50%)
          //db-cache-size = 0.3
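
          # Example: with a 32GB heap, db-cache-size = 0.3 gives the Titan
          # cache roughly 9.6GB.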
        }
      }
    }
  }
}