/*
 * SortDatasetBy.java
 *
 * Created on July 5, 2006, 1:25 PM
 *
 * This file is part of the STAR Scheduler.
 * Copyright (c) 2002-2006 STAR Collaboration - Brookhaven National Laboratory
 *
 * STAR Scheduler is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * STAR Scheduler is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with STAR Scheduler; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */


package gov.bnl.star.offline.scheduler.dataset.datasetManipulators;

import gov.bnl.star.offline.scheduler.dataset.Dataset;
import gov.bnl.star.offline.scheduler.request.Request;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.ListIterator;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;


/**
 * This is a sorter for large datasets that do not fit into
 * memory. In step one it sorts subsets (chunks) of the main dataset and
 * writes each of them to its own temporary file. In step two these files
 * are read back and merged into the final sorted dataset.
 *
 * Entries that are byte-for-byte identical are written only once.
 *
 * $Id
 * @author Leevnte B. Hajdu
 */
public class SortByRegX implements DatasetManipulator {

    private static final Logger log = Logger.getLogger(SortByRegX.class.getName());

    /** Regular expression applied to every entry; its capture groups supply the sort key. */
    String sortByCaptureGroup = "(.*)";

    /** Replacement template (for example "$1$2") that assembles the sort key from the capture groups. */
    String captureGroupsOrder = "$1";

    /** Buffers the part of the dataset currently being sorted, as sort key -> entry. */
    Hashtable entryBufferTable = new Hashtable();

    /** Number of entries read before the in-memory buffer is sorted and written out as a chunk. */
    int maxBufferSize = 10000;


    /** This object is used to sort datasets by a parameter of the values in the dataset.
     * To create the object it must be supplied with the parameter sortByCaptureGroup. This is
     * a regular expression string with a capture group. The dataset will be sorted on
     * the capture group.
     *
     * Examples:
     * "(.*)" Sort the whole string lexicographically
     * ".*:{2}.*:{2}.*:{2}(.*):{2}.*:{2}.*:{2}.*" Sort by the parameter host.
     *
     * @param sortByCaptureGroup A regular expression with one capture group.
     */
    public SortByRegX(String sortByCaptureGroup) {
        this.sortByCaptureGroup = sortByCaptureGroup;
    }


    /** Sort entries by a set of capture groups.
     *
     * Example:
     *
     * Given:
     * Typical entry: 9404397::NFS::BNL::localhost::/star/data41/reco/productionMinBias/ReversedFullField/P05ic/2004/023::st_physics_adc_5023001_raw_1050014.MuDst.root::155
     * Regular expression for the entry: "[0-9]*::[a-zA-Z]*::[a-zA-Z]*::[a-zA-Z0-9:.]*::[^:]*::[^:/]*::[0-9]*.*"
     *
     * Sort by -storage- then by -path- then by -fileName- :
     * SortByRegX("[0-9]*::([a-zA-Z]*)::[a-zA-Z]*::[a-zA-Z0-9:.]*::([^:]*)::([^:/]*)::[0-9]*.*", "$1$2$3")
     *
     * @param sortByCaptureGroups A regular expression with one or more capture groups.
     * @param captureGroupsOrder  Replacement template naming the groups in sort priority order.
     **/
    public SortByRegX(String sortByCaptureGroups, String captureGroupsOrder) {
        // The parameter is the plural "sortByCaptureGroups"; assigning the field to
        // itself here would silently keep the default "(.*)" expression.
        this.sortByCaptureGroup = sortByCaptureGroups;
        this.captureGroupsOrder = captureGroupsOrder;
    }


    /** Builds the key an entry is ordered by: the capture-group replacement followed by
     * the entry itself, so that distinct entries with equal capture groups keep distinct keys.
     * Produces the same text as entry.replaceAll(regex, captureGroupsOrder) + entry.
     */
    private String sortKey(Pattern sortPattern, String entry) {
        return sortPattern.matcher(entry).replaceAll(captureGroupsOrder) + entry;
    }


    /** Writes the buffered entries to the given chunk in ascending key order and empties the buffer. */
    private void writeSortedSubsetOfDataset(FileWriter writer) {
        List keys = new ArrayList(entryBufferTable.keySet());
        Collections.sort(keys);

        for (Iterator it = keys.iterator(); it.hasNext();) {
            writer.writeToBuffer((String) entryBufferTable.get(it.next()));
        }
        entryBufferTable.clear();
    }


    /** Flushes the in-memory buffer into a new chunk and registers the chunk for merging.
     *
     * @param chunkName    dataset name for the chunk, or null to derive one from a fresh temp file
     * @param chunks       receives the new chunk (registered before writing so cleanup can find it)
     * @param placeholders receives the temp file created to reserve a unique name, if any
     */
    private void writeChunk(String chunkName, List chunks, List placeholders) throws IOException {
        String name = chunkName;
        if (name == null) {
            File placeholder = File.createTempFile("shed", "tmp");
            placeholders.add(placeholder);
            // getName() would drop the directory and so lose the uniqueness
            // createTempFile() guarantees; keep the full path.
            name = placeholder.getAbsolutePath();
        }
        FileWriter chunk = new FileWriter(name);
        chunks.add(chunk);
        writeSortedSubsetOfDataset(chunk);
    }


    /** Pass one: reads the dataset and writes it back out as individually sorted chunks. */
    private void splitIntoSortedChunks(Dataset dataset, Pattern sortPattern, List chunks, List placeholders)
            throws IOException {
        // The first chunk is named after the dataset's own buffer file; later ones use temp files.
        String chunkName = dataset.getDatasetFileBufferName();
        BufferedReader currentDataset = new BufferedReader(new FileReader(dataset.getDatasetName()));
        try {
            int entriesRead = 0;
            String datasetEntry;
            while ((datasetEntry = currentDataset.readLine()) != null) {
                entryBufferTable.put(sortKey(sortPattern, datasetEntry), datasetEntry);
                entriesRead++;
                if (maxBufferSize <= entriesRead) { // buffer full: sort it and write it out
                    writeChunk(chunkName, chunks, placeholders);
                    chunkName = null;
                    entriesRead = 0;
                }
            }
            if (!entryBufferTable.isEmpty()) { // flush whatever is left
                writeChunk(chunkName, chunks, placeholders);
            }
        } finally {
            try {
                currentDataset.close();
            } catch (IOException ex) {
                log.warn("SortByRegX: could not close the dataset reader.", ex);
            }
        }
    }


    /** Reads forward in a chunk until it yields an entry not already pending from another
     * chunk, and queues the chunk under that entry's key. A chunk with no such entry left
     * is closed and deleted.
     */
    private void queueNextEntry(TreeMap pending, FileWriter reader, Pattern sortPattern) {
        while (reader.getNextLine() != null) {
            String key = sortKey(sortPattern, reader.currentLine);
            if (!pending.containsKey(key)) {
                pending.put(key, reader);
                return;
            }
            // An identical entry is already queued by another chunk. Replacing that
            // mapping would orphan the other chunk, so skip the duplicate instead.
        }
        reader.closeAndDeleteChunk();
    }


    /** Pass two: merges the sorted chunks into the dataset's buffer. */
    private void mergeChunks(Dataset dataset, Pattern sortPattern, List chunks) {
        TreeMap pending = new TreeMap(); // sort key of each chunk's current line -> chunk

        for (Iterator it = chunks.iterator(); it.hasNext();) {
            queueNextEntry(pending, (FileWriter) it.next(), sortPattern);
        }

        while (!pending.isEmpty()) {
            FileWriter reader = (FileWriter) pending.remove(pending.firstKey());
            dataset.writeToBuffer(reader.currentLine);
            queueNextEntry(pending, reader, sortPattern);
        }
    }


    /** Used to pass the dataset to the dataset manipulator.
     *
     * If the dataset cannot be read, or the merge fails, the error is logged, all
     * temporary files are removed and the dataset is left as it was (unsorted).
     *
     * @param dataset The dataset to be modified
     * @param request The request object of the current request which will use the dataset
     *                (not used by this manipulator)
     * @throws java.util.regex.PatternSyntaxException if the sort expression is not a valid regular expression
     **/
    public void modify(Dataset dataset, Request request) {
        // Compile once; String.replaceAll() would recompile the expression for every entry.
        final Pattern sortPattern = Pattern.compile(sortByCaptureGroup);
        final List chunks = new ArrayList();       // FileWriter: chunks written in pass one
        final List placeholders = new ArrayList(); // File: temp files reserving chunk names

        try {
            ///////////////////////////////////////// pass #1 //////////////////////////////////////////////
            try {
                splitIntoSortedChunks(dataset, sortPattern, chunks, placeholders);
            } catch (IOException ex) {
                log.error("SortByRegX: could not read dataset \"" + dataset.getDatasetName()
                        + "\"; the dataset is left unsorted.", ex);
                return;
            }

            ///////////////////////////////////////// pass #2 //////////////////////////////////////////////
            try {
                mergeChunks(dataset, sortPattern, chunks);
            } catch (RuntimeException ex) {
                log.error("SortByRegX: merging the sorted chunks failed; the dataset is left unsorted.", ex);
                // Discard the partially written output so it can never replace the dataset.
                // NOTE(review): deleteBuffer() is the call already used to discard chunk
                // buffers; confirm it is equally safe on the main dataset.
                dataset.deleteBuffer();
                return;
            }
        } finally {
            entryBufferTable.clear();
            for (Iterator it = chunks.iterator(); it.hasNext();) {
                ((FileWriter) it.next()).closeAndDeleteChunk(); // no-op for chunks already consumed
            }
            for (Iterator it = placeholders.iterator(); it.hasNext();) {
                ((File) it.next()).delete();
            }
        }

        dataset.swap_buffer_dataset_with_dataset();
    }


    /** Sets how many entries are held in memory before a chunk is written.
     * The value must be at least 1,000. The bigger the value the faster the list will be sorted.
     * The default value is 10,000 entries.
     *
     * @param maxBufferSize number of entries per in-memory chunk
     * @throws IllegalArgumentException if maxBufferSize is smaller than 1000
     */
    public void setMaxBufferSize(int maxBufferSize) {
        if (maxBufferSize < 1000) {
            throw new IllegalArgumentException("error: SortDatasetBy: The max buffer size must be at least 1000.");
        }
        this.maxBufferSize = maxBufferSize;
    }

    /** @return the number of entries held in memory before a chunk is written */
    public int getMaxBufferSize() {
        return maxBufferSize;
    }


    /** One sorted chunk of the dataset. Despite its name it is used both to write the
     * chunk (through the inherited writeToBuffer) and to read it back line by line.
     */
    class FileWriter extends Dataset {

        BufferedReader currentDataset = null;
        public String currentLine = null;

        /** Set once the chunk's reader has been closed and its buffer deleted. */
        private boolean chunkDiscarded = false;


        public FileWriter(String fileName) {
            super.setDatasetName(fileName);
        }

        /** Closes the reader over the chunk's buffer file, if one was ever opened. */
        public void closeReader() {
            if (currentDataset == null) {
                return;
            }
            try {
                currentDataset.close();
            } catch (IOException ex) {
                log.warn("Wrinning could not close FileReader.", ex);
            }
        }

        /** Reads the next line of the chunk into currentLine, opening the buffer file on first use.
         *
         * @return the line read, or null at the end of the chunk
         * @throws RuntimeException if the buffer file cannot be opened or read; returning
         *         null here would make a read error look like the end of the chunk and
         *         silently drop the remaining entries
         */
        String getNextLine() {
            try {
                if (currentDataset == null) {
                    currentDataset = new BufferedReader(new FileReader(super.getDatasetFileBufferName()));
                }
                currentLine = currentDataset.readLine();
                return currentLine;
            } catch (IOException ex) {
                currentLine = null;
                throw new RuntimeException("SortByRegX: could not read sorted chunk \""
                        + super.getDatasetFileBufferName() + "\"", ex);
            }
        }

        /** Closes the reader and deletes the chunk's buffer; later calls do nothing. */
        private void closeAndDeleteChunk() {
            if (chunkDiscarded) {
                return;
            }
            chunkDiscarded = true;
            closeReader(); // close before deleting: an open file cannot be deleted on every platform
            deleteBuffer();
        }
    }


    /** @return always true; this manipulator has no preconditions */
    public boolean requirementsSatisfied() {
        return true;
    }

}