CreateRecordFile xref
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.chukwa.util;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.chukwa.extraction.demux.processor.mapper.MapProcessor;
import org.apache.hadoop.chukwa.extraction.demux.processor.mapper.TsProcessor;
import org.apache.hadoop.chukwa.extraction.demux.Demux;
import org.apache.hadoop.chukwa.extraction.engine.ChukwaRecordKey;
import org.apache.hadoop.chukwa.extraction.engine.ChukwaRecord;
import org.apache.hadoop.chukwa.ChunkImpl;
import org.apache.hadoop.chukwa.ChukwaArchiveKey;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.File;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.Charset;

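/**
 * Command-line utility that converts a plain-text input file into a Hadoop
 * sequence file of ChukwaRecordKey/ChukwaRecord pairs by running each line
 * through a demux MapProcessor (TsProcessor by default). Intended for
 * generating test input for demux processors.
 */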
public class CreateRecordFile {

  public static void makeTestSequenceFile(File inputFile,
                                          Path outputFile,
                                          String clusterName,
                                          String dataType,
                                          String streamName,
                                          MapProcessor processor) throws IOException {

    // Default to TsProcessor if no processor was supplied.
    MockOutputCollector collector = new MockOutputCollector();
    if (processor == null) processor = new TsProcessor();

    Configuration conf = new Configuration();
    FileSystem fs = outputFile.getFileSystem(conf);
    FSDataOutputStream out = fs.create(outputFile);

    SequenceFile.Writer seqFileWriter = SequenceFile.createWriter(conf, out,
                                        ChukwaRecordKey.class, ChukwaRecord.class,
                                        SequenceFile.CompressionType.NONE, null);
    long lastSeqID = 0;
    String line;
    FileInputStream fis = new FileInputStream(inputFile);
    BufferedReader reader = new BufferedReader(
        new InputStreamReader(fis, Charset.forName("UTF-8")));

    // Wrap each input line in a chunk, run it through the processor, and
    // append the key/record pair the processor emitted to the sequence file.
    while ((line = reader.readLine()) != null) {

      ChunkImpl chunk = new ChunkImpl(dataType, streamName,
              line.length() + lastSeqID, line.getBytes(Charset.forName("UTF-8")), null);
      lastSeqID += line.length();
      chunk.addTag("cluster=\"" + clusterName + "\"");

      ChukwaArchiveKey archiveKey = new ChukwaArchiveKey();
      archiveKey.setTimePartition(System.currentTimeMillis());
      archiveKey.setDataType(chunk.getDataType());
      archiveKey.setStreamName(chunk.getStreamName());
      archiveKey.setSeqId(chunk.getSeqID());

      processor.process(archiveKey, chunk, collector, Reporter.NULL);
      seqFileWriter.append(collector.getChukwaRecordKey(),
                           collector.getChukwaRecord());
    }

    // Close the writer before the stream it wraps so its trailing
    // metadata and buffered data are flushed, then release the reader.
    seqFileWriter.close();
    out.close();
    reader.close();
  }

  /**
   * OutputCollector that simply captures the most recently collected
   * key/record pair so it can be appended to the sequence file.
   */
  private static class MockOutputCollector
          implements OutputCollector<ChukwaRecordKey, ChukwaRecord> {
    ChukwaRecordKey chukwaRecordKey;
    ChukwaRecord chukwaRecord;

    public void collect(ChukwaRecordKey chukwaRecordKey,
                        ChukwaRecord chukwaRecord) throws IOException {
      this.chukwaRecordKey = chukwaRecordKey;
      this.chukwaRecord = chukwaRecord;
    }

    public ChukwaRecordKey getChukwaRecordKey() { return chukwaRecordKey; }
    public ChukwaRecord getChukwaRecord() { return chukwaRecord; }
  }

  public static void main(String[] args) throws IOException,
                                                ClassNotFoundException,
                                                IllegalAccessException,
                                                InstantiationException {
    if (args.length == 0 || (args.length == 1 && args[0].contains("-h"))) {
      usage();
      return;
    }

    File inputFile = new File(args[0]);
    Path outputFile = new Path(args[1]);
    String clusterName = "testClusterName";
    String dataType = "testDataType";
    String streamName = "testStreamName";
    MapProcessor processor = new TsProcessor();
    Path confFile = null;

    if (args.length > 2) clusterName = args[2];
    if (args.length > 3) dataType = args[3];
    if (args.length > 4) streamName = args[4];

    if (args.length > 5) {
      // Try the argument as a fully qualified class name first, then fall
      // back to the default demux processor package.
      Class<?> clazz = null;
      try {
        clazz = Class.forName(args[5]);
      }
      catch (ClassNotFoundException e) {
        try {
          clazz = Class.forName(
               "org.apache.hadoop.chukwa.extraction.demux.processor.mapper." + args[5]);
        }
        catch (Exception e2) {
          throw e;
        }
      }
      processor = (MapProcessor)clazz.newInstance();
    }

    if (args.length > 6) {
      confFile = new Path(args[6]);
      Demux.jobConf = new JobConf(confFile);
    }

    System.out.println("Creating sequence file using the following input:");
    System.out.println("inputFile  : " + inputFile);
    System.out.println("outputFile : " + outputFile);
    System.out.println("clusterName: " + clusterName);
    System.out.println("dataType   : " + dataType);
    System.out.println("streamName : " + streamName);
    System.out.println("processor  : " + processor.getClass().getName());
    System.out.println("confFile   : " + confFile);

    makeTestSequenceFile(inputFile, outputFile, clusterName, dataType, streamName, processor);

    System.out.println("Done");
  }

  public static void usage() {
    System.out.println("Usage: java " + CreateRecordFile.class.getName() +
            " <inputFile> <outputFile> [<clusterName> <dataType> <streamName> <processorClass> [confFile]]");
    System.out.println("Description: Takes a plain-text input file and generates a Hadoop sequence file containing ChukwaRecordKey,ChukwaRecord entries");
    System.out.println("Parameters: inputFile      - Text input file to read");
    System.out.println("            outputFile     - Sequence file to create");
    System.out.println("            clusterName    - Cluster name to use in the records");
    System.out.println("            dataType       - Data type to use in the records");
    System.out.println("            streamName     - Stream name to use in the records");
    System.out.println("            processorClass - Processor class to use. Defaults to TsProcessor");
    System.out.println("            confFile       - File to use to create the JobConf");
  }
}
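
For reference, a minimal sketch of driving makeTestSequenceFile directly from Java rather than through main. The class name, file paths, and record labels below are illustrative assumptions, not part of Chukwa; passing null for the processor picks up the TsProcessor default, mirroring the command-line defaults above.

import org.apache.hadoop.chukwa.util.CreateRecordFile;
import org.apache.hadoop.fs.Path;
import java.io.File;
import java.io.IOException;

public class CreateRecordFileExample {
  public static void main(String[] args) throws IOException {
    File input = new File("/tmp/sample.log");   // hypothetical plain-text input, one record per line
    Path output = new Path("/tmp/sample.seq");  // hypothetical sequence file to be created

    // null processor => makeTestSequenceFile falls back to TsProcessor.
    CreateRecordFile.makeTestSequenceFile(input, output,
        "demoCluster", "demoType", "demoStream", null);
  }
}

The equivalent shell invocation, per usage(), would be java org.apache.hadoop.chukwa.util.CreateRecordFile /tmp/sample.log /tmp/sample.seq with the Chukwa and Hadoop jars on the classpath.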