Amazon Elastic MapReduce API Reference. This guide provides descriptions and samples of the Amazon Elastic MapReduce
 * APIs.
 *
 * Amazon Elastic MapReduce is a web service that makes it easy to process large amounts of data efficiently. Elastic MapReduce uses Hadoop
 * processing combined with several AWS products to do tasks such as web indexing, data mining, log file analysis, machine learning, scientific
 * simulation, and data warehousing.
 *
 * @version Tue Aug 23 12:49:06 PDT 2011
 * @license See the included NOTICE.md file for complete information.
 * @copyright See the included NOTICE.md file for complete information.
 * @link http://aws.amazon.com/elasticmapreduce/ Amazon Elastic MapReduce
 * @link http://aws.amazon.com/documentation/elasticmapreduce/ Amazon Elastic MapReduce documentation
 */
class AmazonEMR extends CFRuntime
{
    /*%******************************************************************************************%*/
    // CLASS CONSTANTS

    /**
     * Default service endpoint hostname; the constructor assigns it to $this->hostname.
     */
    const DEFAULT_URL = 'us-east-1.elasticmapreduce.amazonaws.com';

    /**
     * Endpoint hostname for the US-East (Northern Virginia) Region. Alias of DEFAULT_URL.
     */
    const REGION_US_E1 = self::DEFAULT_URL;

    /**
     * Endpoint hostname for the US-West (Northern California) Region.
     */
    const REGION_US_W1 = 'us-west-1.elasticmapreduce.amazonaws.com';

    /**
     * Endpoint hostname for the EU (Ireland) Region.
     */
    const REGION_EU_W1 = 'eu-west-1.elasticmapreduce.amazonaws.com';

    /**
     * Endpoint hostname for the Asia Pacific (Singapore) Region.
     */
    const REGION_APAC_SE1 = 'ap-southeast-1.elasticmapreduce.amazonaws.com';

    /**
     * Endpoint hostname for the Asia Pacific (Japan) Region.
     */
    const REGION_APAC_NE1 = 'ap-northeast-1.elasticmapreduce.amazonaws.com';


    /*%******************************************************************************************%*/
    // SETTERS

    /**
     * Explicitly sets the region (endpoint hostname) for the service to use.
     *
     * @param string $region (Required) The region to explicitly set.
Available options are the REGION_* class constants: REGION_US_E1, REGION_US_W1, REGION_EU_W1, REGION_APAC_SE1, or REGION_APAC_NE1.
     * @return $this A reference to the current instance.
     */
    public function set_region($region)
    {
        // The region value is handed unchanged to set_hostname().
        $this->set_hostname($region);

        // Returning the instance keeps the setter chainable.
        return $this;
    }


    /*%******************************************************************************************%*/
    // CONSTRUCTOR

    /**
     * Constructs a new instance of AmazonEMR.
     *
     * Pins the API version to 2009-03-31 and the hostname to DEFAULT_URL, verifies that credentials are
     * available (either as arguments or through the AWS_KEY / AWS_SECRET_KEY constants), and then hands
     * the credentials to the parent constructor.
     *
     * @param string $key (Optional) Your Amazon API Key. If blank, it will look for the AWS_KEY constant.
     * @param string $secret_key (Optional) Your Amazon API Secret Key. If blank, it will look for the AWS_SECRET_KEY constant.
     * @throws EMR_Exception If no key (or no secret key) was passed in and the matching constant is not defined.
     */
    public function __construct($key = null, $secret_key = null)
    {
        $this->api_version = '2009-03-31';
        $this->hostname = self::DEFAULT_URL;

        // A key must come from the argument or from the AWS_KEY constant.
        if (!($key || defined('AWS_KEY'))) {
            // @codeCoverageIgnoreStart
            throw new EMR_Exception('No account key was passed into the constructor, nor was it set in the AWS_KEY constant.');
            // @codeCoverageIgnoreEnd
        }

        // The same rule applies to the secret key and AWS_SECRET_KEY.
        if (!($secret_key || defined('AWS_SECRET_KEY'))) {
            // @codeCoverageIgnoreStart
            throw new EMR_Exception('No account secret was passed into the constructor, nor was it set in the AWS_SECRET_KEY constant.');
            // @codeCoverageIgnoreEnd
        }

        return parent::__construct($key, $secret_key);
    }


    /*%******************************************************************************************%*/
    // SERVICE METHODS

    /**
     *
     * AddInstanceGroups adds an instance group to a running cluster.
     *
     * @param array $instance_groups (Required) Instance Groups to add.
    *
  • x - array - This represents a simple array index.
      *
    • Name - string - Optional - Friendly name given to the instance group.
    • *
    • Market - string - Optional - Market type of the Amazon EC2 instances used to create a cluster node. [Allowed values: ON_DEMAND, SPOT]
    • *
    • InstanceRole - string - Required - The role of the instance group in the cluster. [Allowed values: MASTER, CORE, TASK]
    • *
    • BidPrice - string - Optional - Bid price for each Amazon EC2 instance in the instance group when launching nodes as Spot Instances, expressed in USD.
    • *
    • InstanceType - string - Required - The Amazon EC2 instance type for all instances in the instance group.
    • *
    • InstanceCount - integer - Required - Target number of instances for the instance group.
    • *
  • *
* @param string $job_flow_id (Required) Job flow in which to add the instance groups. * @param array $opt (Optional) An associative array of parameters that can have the following keys:
    *
  • curlopts - array - Optional - A set of values to pass directly into curl_setopt(), where the key is a pre-defined CURLOPT_* constant.
  • *
  • returnCurlHandle - boolean - Optional - A private toggle specifying that the cURL handle be returned rather than actually completing the request. This toggle is useful for manually managed batch requests.
* @return CFResponse A CFResponse object containing a parsed HTTP response.
     */
    public function add_instance_groups($instance_groups, $job_flow_id, $opt = null)
    {
        // Any falsy $opt (null, empty array, ...) becomes an empty option set.
        $opt = $opt ? $opt : array();

        // Required parameter: a non-array value is wrapped into a one-element list.
        $groups = is_array($instance_groups) ? $instance_groups : array($instance_groups);

        // NOTE(review): 'member' is presumably the list key CFComplexType::map() uses when
        // flattening indexed entries - confirm against CFComplexType.
        $mapped_groups = CFComplexType::map(array('InstanceGroups' => $groups), 'member');

        $opt = array_merge($opt, $mapped_groups);
        $opt['JobFlowId'] = $job_flow_id;

        return $this->authenticate('AddInstanceGroups', $opt, $this->hostname);
    }

    /**
     *
     * AddJobFlowSteps adds new steps to a running job flow. A maximum of 256 steps are allowed in each job flow.
     *
     * If your job flow is long-running (such as a Hive data warehouse) or complex, you may require more than 256 steps to process your data. You
     * can bypass the 256-step limitation in various ways, including using the SSH shell to connect to the master node and submitting queries
     * directly to the software running on the master node, such as Hive and Hadoop. For more information on how to do this, go to Add More than 256 Steps to a Job
     * Flow in the Amazon Elastic MapReduce Developer's Guide.
     *
     * A step specifies the location of a JAR file stored either on the master node of the job flow or in Amazon S3. Each step is performed by the
     * main function of the main class of the JAR file. The main class can be specified either in the manifest of the JAR or by using the
     * MainFunction parameter of the step.
     *
     * Elastic MapReduce executes each step in the order listed. For a step to be considered complete, the main function must exit with a zero
     * exit code and all Hadoop jobs started while the step was running must have completed and run successfully.
     *
     * You can only add steps to a job flow that is in one of the following states: STARTING, BOOTSTRAPPING, RUNNING, or WAITING.
     *
     * @param string $job_flow_id (Required) A string that uniquely identifies the job flow. This identifier is returned by RunJobFlow and can also be obtained from DescribeJobFlows.
* @param array $steps (Required) A list of StepConfig to be executed by the job flow.
    *
  • x - array - This represents a simple array index.
      *
    • Name - string - Required - The name of the job flow step.
    • *
    • ActionOnFailure - string - Optional - Specifies the action to take if the job flow step fails. [Allowed values: TERMINATE_JOB_FLOW, CANCEL_AND_WAIT, CONTINUE]
    • *
    • HadoopJarStep - array - Required - Specifies the JAR file used for the job flow step. Takes an associative array of parameters that can have the following keys:
        *
      • Properties - array - Optional - A list of Java properties that are set when the step runs. You can use these properties to pass key value pairs to your main function.
          *
        • x - array - This represents a simple array index.
            *
          • Key - string - Optional - The unique identifier of a key value pair.
          • *
          • Value - string - Optional - The value part of the identified key.
          • *
        • *
      • *
      • Jar - string - Required - A path to a JAR file run during the step.
      • *
      • MainClass - string - Optional - The name of the main class in the specified Java file. If not specified, the JAR file should specify a Main-Class in its manifest file.
      • *
      • Args - string|array - Optional - A list of command line arguments passed to the JAR file's main function when executed. Pass a string for a single value, or an indexed array for multiple values.
      • *
    • *
  • *
* @param array $opt (Optional) An associative array of parameters that can have the following keys:
    *
  • curlopts - array - Optional - A set of values to pass directly into curl_setopt(), where the key is a pre-defined CURLOPT_* constant.
  • *
  • returnCurlHandle - boolean - Optional - A private toggle specifying that the cURL handle be returned rather than actually completing the request. This toggle is useful for manually managed batch requests.
* @return CFResponse A CFResponse object containing a parsed HTTP response.
     */
    public function add_job_flow_steps($job_flow_id, $steps, $opt = null)
    {
        // Any falsy $opt (null, empty array, ...) becomes an empty option set.
        if (!$opt) {
            $opt = array();
        }

        $opt['JobFlowId'] = $job_flow_id;

        // Required parameter: a non-array value is wrapped into a one-element list.
        $step_list = is_array($steps) ? $steps : array($steps);

        // NOTE(review): 'member' is presumably the list key CFComplexType::map() uses when
        // flattening indexed entries - confirm against CFComplexType.
        $mapped_steps = CFComplexType::map(array('Steps' => $step_list), 'member');
        $opt = array_merge($opt, $mapped_steps);

        return $this->authenticate('AddJobFlowSteps', $opt, $this->hostname);
    }

    /**
     *
     * TerminateJobFlows shuts a list of job flows down. When a job flow is shut down, any step not yet completed is canceled and the EC2
     * instances on which the job flow is running are stopped. Any log files not already saved are uploaded to Amazon S3 if a LogUri was specified
     * when the job flow was created.
     *
     * @param string|array $job_flow_ids (Required) A list of job flows to be shutdown. Pass a string for a single value, or an indexed array for multiple values.
     * @param array $opt (Optional) An associative array of parameters that can have the following keys:
    *
  • curlopts - array - Optional - A set of values to pass directly into curl_setopt(), where the key is a pre-defined CURLOPT_* constant.
  • *
  • returnCurlHandle - boolean - Optional - A private toggle specifying that the cURL handle be returned rather than actually completing the request. This toggle is useful for manually managed batch requests.
* @return CFResponse A CFResponse object containing a parsed HTTP response.
     */
    public function terminate_job_flows($job_flow_ids, $opt = null)
    {
        // Any falsy $opt (null, empty array, ...) becomes an empty option set.
        $opt = $opt ? $opt : array();

        // Required parameter: a single ID is shorthand for a one-element list.
        $id_list = is_array($job_flow_ids) ? $job_flow_ids : array($job_flow_ids);

        // NOTE(review): 'member' is presumably the list key CFComplexType::map() uses when
        // flattening indexed entries - confirm against CFComplexType.
        $mapped_ids = CFComplexType::map(array('JobFlowIds' => $id_list), 'member');
        $opt = array_merge($opt, $mapped_ids);

        return $this->authenticate('TerminateJobFlows', $opt, $this->hostname);
    }

    /**
     *
     * DescribeJobFlows returns a list of job flows that match all of the supplied parameters. The parameters can include a list of job flow IDs,
     * job flow states, and restrictions on job flow creation date and time.
     *
     * Regardless of supplied parameters, only job flows created within the last two months are returned.
     *
     * If no parameters are supplied, then job flows matching either of the following criteria are returned:
     *
     *
  • Job flows created and completed in the last two weeks
  • * *
  • Job flows created within the last two months that are in one of the following states: RUNNING, WAITING, SHUTTING_DOWN, STARTING
  • * *
* * Amazon Elastic MapReduce can return a maximum of 512 job flow descriptions. * * @param array $opt (Optional) An associative array of parameters that can have the following keys:
    *
  • CreatedAfter - string - Optional - Return only job flows created after this date and time. May be passed as a number of seconds since UNIX Epoch, or any string compatible with PHP's strtotime().
  • *
  • CreatedBefore - string - Optional - Return only job flows created before this date and time. May be passed as a number of seconds since UNIX Epoch, or any string compatible with PHP's strtotime().
  • *
  • JobFlowIds - string|array - Optional - Return only job flows whose job flow ID is contained in this list. Pass a string for a single value, or an indexed array for multiple values.
  • *
  • JobFlowStates - string|array - Optional - Return only job flows whose state is contained in this list. Pass a string for a single value, or an indexed array for multiple values.
  • *
  • curlopts - array - Optional - A set of values to pass directly into curl_setopt(), where the key is a pre-defined CURLOPT_* constant.
  • *
  • returnCurlHandle - boolean - Optional - A private toggle specifying that the cURL handle be returned rather than actually completing the request. This toggle is useful for manually managed batch requests.
* @return CFResponse A CFResponse object containing a parsed HTTP response.
     */
    public function describe_job_flows($opt = null)
    {
        // Any falsy $opt (null, empty array, ...) becomes an empty option set.
        if (!$opt) {
            $opt = array();
        }

        // Optional date filters: each one that is set is passed through convert_date_to_iso8601().
        foreach (array('CreatedAfter', 'CreatedBefore') as $date_key) {
            if (isset($opt[$date_key])) {
                $opt[$date_key] = $this->util->convert_date_to_iso8601($opt[$date_key]);
            }
        }

        // Optional list filters: a scalar is wrapped into a one-element list, the list is mapped
        // with 'member', the mapped entries are merged in, and only then is the raw key removed.
        foreach (array('JobFlowIds', 'JobFlowStates') as $list_key) {
            if (isset($opt[$list_key])) {
                $values = is_array($opt[$list_key]) ? $opt[$list_key] : array($opt[$list_key]);
                $opt = array_merge($opt, CFComplexType::map(array($list_key => $values), 'member'));
                unset($opt[$list_key]);
            }
        }

        return $this->authenticate('DescribeJobFlows', $opt, $this->hostname);
    }

    /**
     *
     * SetTerminationProtection locks a job flow so the Amazon EC2 instances in the cluster cannot be terminated by user intervention, an API
     * call, or in the event of a job-flow error. The cluster still terminates upon successful completion of the job flow. Calling
     * SetTerminationProtection on a job flow is analogous to calling the Amazon EC2 DisableAPITermination API on all of the EC2 instances in a
     * cluster.
     *
     * SetTerminationProtection is used to prevent accidental termination of a job flow and to ensure that in the event of an error, the instances
     * will persist so you can recover any data stored in their ephemeral instance storage.
     *
     * To terminate a job flow that has been locked by setting SetTerminationProtection to true, you must first unlock the job flow
     * by a subsequent call to SetTerminationProtection in which you set the value to false.
* * For more information, go to Protecting a Job Flow * from Termination in the Amazon Elastic MapReduce Developer's Guide. * * @param string|array $job_flow_ids (Required) A list of strings that uniquely identify the job flows to protect. This identifier is returned by RunJobFlow and can also be obtained from DescribeJobFlows . Pass a string for a single value, or an indexed array for multiple values. * @param boolean $termination_protected (Required) A Boolean that indicates whether to protect the job flow and prevent the Amazon EC2 instances in the cluster from shutting down due to API calls, user intervention, or job-flow error. * @param array $opt (Optional) An associative array of parameters that can have the following keys:
    *
  • curlopts - array - Optional - A set of values to pass directly into curl_setopt(), where the key is a pre-defined CURLOPT_* constant.
  • *
  • returnCurlHandle - boolean - Optional - A private toggle specifying that the cURL handle be returned rather than actually completing the request. This toggle is useful for manually managed batch requests.
* @return CFResponse A CFResponse object containing a parsed HTTP response.
     */
    public function set_termination_protection($job_flow_ids, $termination_protected, $opt = null)
    {
        // Any falsy $opt (null, empty array, ...) becomes an empty option set.
        if (!$opt) {
            $opt = array();
        }

        // Required parameter: a single job flow ID is shorthand for a one-element list.
        $opt = array_merge($opt, CFComplexType::map(array(
            'JobFlowIds' => (is_array($job_flow_ids) ? $job_flow_ids : array($job_flow_ids))
        ), 'member'));

        // PHP casts boolean false to '' and true to '1' when a value is turned into a string,
        // so a raw boolean could reach the request as an empty value. That would break the
        // documented "unlock by passing false" call. Send the explicit literals instead.
        // Non-boolean input (for example the strings 'true' / 'false') is passed through
        // unchanged, so existing callers keep working.
        if (is_bool($termination_protected)) {
            $termination_protected = $termination_protected ? 'true' : 'false';
        }
        $opt['TerminationProtected'] = $termination_protected;

        return $this->authenticate('SetTerminationProtection', $opt, $this->hostname);
    }

    /**
     *
     * RunJobFlow creates and starts running a new job flow. The job flow will run the steps specified. Once the job flow completes, the cluster
     * is stopped and the HDFS partition is lost. To prevent loss of data, configure the last step of the job flow to store results in Amazon S3.
     * If the JobFlowInstancesDetail KeepJobFlowAliveWhenNoSteps parameter is set to TRUE, the job flow will transition
     * to the WAITING state rather than shutting down once the steps have completed.
     *
     * For additional protection, you can set the JobFlowInstancesDetail TerminationProtected parameter to TRUE to lock
     * the job flow and prevent it from being terminated by API call, user intervention, or in the event of a job flow error.
     *
     * A maximum of 256 steps are allowed in each job flow.
     *
     * If your job flow is long-running (such as a Hive data warehouse) or complex, you may require more than 256 steps to process your data. You
     * can bypass the 256-step limitation in various ways, including using the SSH shell to connect to the master node and submitting queries
     * directly to the software running on the master node, such as Hive and Hadoop. For more information on how to do this, go to Add More than 256 Steps to a Job
     * Flow in the Amazon Elastic MapReduce Developer's Guide.
     *
     * For long running job flows, we recommend that you periodically store your results.
     *
     * @param string $name (Required) The name of the job flow.
* @param array $instances (Required) A specification of the number and type of Amazon EC2 instances on which to run the job flow.
    *
  • MasterInstanceType - string - Optional - The EC2 instance type of the master node.
  • *
  • SlaveInstanceType - string - Optional - The EC2 instance type of the slave nodes.
  • *
  • InstanceCount - integer - Optional - The number of Amazon EC2 instances used to execute the job flow.
  • *
  • InstanceGroups - array - Optional - Configuration for the job flow's instance groups.
      *
    • x - array - This represents a simple array index.
        *
      • Name - string - Optional - Friendly name given to the instance group.
      • *
      • Market - string - Optional - Market type of the Amazon EC2 instances used to create a cluster node. [Allowed values: ON_DEMAND, SPOT]
      • *
      • InstanceRole - string - Required - The role of the instance group in the cluster. [Allowed values: MASTER, CORE, TASK]
      • *
      • BidPrice - string - Optional - Bid price for each Amazon EC2 instance in the instance group when launching nodes as Spot Instances, expressed in USD.
      • *
      • InstanceType - string - Required - The Amazon EC2 instance type for all instances in the instance group.
      • *
      • InstanceCount - integer - Required - Target number of instances for the instance group.
      • *
    • *
  • *
  • Ec2KeyName - string - Optional - Specifies the name of the Amazon EC2 key pair that can be used to ssh to the master node as the user called "hadoop."
  • *
  • Placement - array - Optional - Specifies the Availability Zone the job flow will run in. Takes an associative array of parameters that can have the following keys:
      *
    • AvailabilityZone - string - Required - The Amazon EC2 Availability Zone for the job flow.
    • *
  • *
  • KeepJobFlowAliveWhenNoSteps - boolean - Optional - Specifies whether the job flow should terminate after completing all steps.
  • *
  • TerminationProtected - boolean - Optional - Specifies whether to lock the job flow to prevent the Amazon EC2 instances from being terminated by API call, user intervention, or in the event of a job flow error.
  • *
  • HadoopVersion - string - Optional - Specifies the Hadoop version for the job flow. Valid inputs are "0.18" or "0.20".
  • *
* @param array $opt (Optional) An associative array of parameters that can have the following keys:
    *
  • LogUri - string - Optional - Specifies the location in Amazon S3 to write the log files of the job flow. If a value is not provided, logs are not created.
  • *
  • AdditionalInfo - string - Optional - A JSON string for selecting additional features.
  • *
  • Steps - array - Optional - A list of steps to be executed by the job flow.
      *
    • x - array - This represents a simple array index.
        *
      • Name - string - Required - The name of the job flow step.
      • *
      • ActionOnFailure - string - Optional - Specifies the action to take if the job flow step fails. [Allowed values: TERMINATE_JOB_FLOW, CANCEL_AND_WAIT, CONTINUE]
      • *
      • HadoopJarStep - array - Required - Specifies the JAR file used for the job flow step. Takes an associative array of parameters that can have the following keys:
          *
        • Properties - array - Optional - A list of Java properties that are set when the step runs. You can use these properties to pass key value pairs to your main function.
            *
          • x - array - This represents a simple array index.
              *
            • Key - string - Optional - The unique identifier of a key value pair.
            • *
            • Value - string - Optional - The value part of the identified key.
            • *
          • *
        • *
        • Jar - string - Required - A path to a JAR file run during the step.
        • *
        • MainClass - string - Optional - The name of the main class in the specified Java file. If not specified, the JAR file should specify a Main-Class in its manifest file.
        • *
        • Args - string|array - Optional - A list of command line arguments passed to the JAR file's main function when executed. Pass a string for a single value, or an indexed array for multiple values.
        • *
      • *
    • *
  • *
  • BootstrapActions - array - Optional - A list of bootstrap actions that will be run before Hadoop is started on the cluster nodes.
      *
    • x - array - This represents a simple array index.
        *
      • Name - string - Required - The name of the bootstrap action.
      • *
      • ScriptBootstrapAction - array - Required - The script run by the bootstrap action. Takes an associative array of parameters that can have the following keys:
          *
        • Path - string - Required - Location of the script to run during a bootstrap action. Can be either a location in Amazon S3 or on a local file system.
        • *
        • Args - string|array - Optional - A list of command line arguments to pass to the bootstrap action script. Pass a string for a single value, or an indexed array for multiple values.
        • *
      • *
    • *
  • *
  • curlopts - array - Optional - A set of values to pass directly into curl_setopt(), where the key is a pre-defined CURLOPT_* constant.
  • *
  • returnCurlHandle - boolean - Optional - A private toggle specifying that the cURL handle be returned rather than actually completing the request. This toggle is useful for manually managed batch requests.
* @return CFResponse A CFResponse object containing a parsed HTTP response.
     */
    public function run_job_flow($name, $instances, $opt = null)
    {
        // Any falsy $opt (null, empty array, ...) becomes an empty option set.
        $opt = $opt ? $opt : array();
        $opt['Name'] = $name;

        // Collapse the nested instance-group list first. This inner map() call takes no second
        // argument; the groups are instead placed under an explicit 'member' key.
        if (isset($instances['InstanceGroups'])) {
            $groups = $instances['InstanceGroups'];
            if (!is_array($groups)) {
                $groups = array($groups);
            }
            $instances['InstanceGroups'] = CFComplexType::map(array('member' => $groups));
        }

        // Required parameter: a non-array value is wrapped into a one-element list.
        $instance_spec = is_array($instances) ? $instances : array($instances);
        $opt = array_merge($opt, CFComplexType::map(array('Instances' => $instance_spec), 'member'));

        // Optional list parameters: merge in the mapped entries, then remove the raw key
        // (the order is kept as merge first, unset second).
        foreach (array('Steps', 'BootstrapActions') as $list_key) {
            if (isset($opt[$list_key])) {
                $mapped = CFComplexType::map(array($list_key => $opt[$list_key]), 'member');
                $opt = array_merge($opt, $mapped);
                unset($opt[$list_key]);
            }
        }

        return $this->authenticate('RunJobFlow', $opt, $this->hostname);
    }

    /**
     *
     * ModifyInstanceGroups modifies the number of nodes and configuration settings of an instance group. The input parameters include the new
     * target instance count for the group and the instance group ID. The call will either succeed or fail atomically.
     *
     * @param array $opt (Optional) An associative array of parameters that can have the following keys:
    *
  • InstanceGroups - array - Optional - Instance groups to change.
      *
    • x - array - This represents a simple array index.
        *
      • InstanceGroupId - string - Required - Unique ID of the instance group to expand or shrink.
      • *
      • InstanceCount - integer - Required - Target size for the instance group.
      • *
    • *
  • *
  • curlopts - array - Optional - A set of values to pass directly into curl_setopt(), where the key is a pre-defined CURLOPT_* constant.
  • *
  • returnCurlHandle - boolean - Optional - A private toggle specifying that the cURL handle be returned rather than actually completing the request. This toggle is useful for manually managed batch requests.
* @return CFResponse A CFResponse object containing a parsed HTTP response.
     */
    public function modify_instance_groups($opt = null)
    {
        // Any falsy $opt (null, empty array, ...) becomes an empty option set.
        $opt = $opt ? $opt : array();

        // Optional parameter: merge in the mapped entries, then remove the raw key
        // (the order is kept as merge first, unset second).
        if (isset($opt['InstanceGroups'])) {
            $mapped_groups = CFComplexType::map(array('InstanceGroups' => $opt['InstanceGroups']), 'member');
            $opt = array_merge($opt, $mapped_groups);
            unset($opt['InstanceGroups']);
        }

        return $this->authenticate('ModifyInstanceGroups', $opt, $this->hostname);
    }
}


/*%******************************************************************************************%*/
// EXCEPTIONS

/**
 * Default EMR Exception. Thrown by the AmazonEMR constructor when credentials are missing.
 */
class EMR_Exception extends Exception
{
}