001    /*
002     * SortDatasetBy.java
003     *
004     * Created on July 5, 2006, 1:25 PM
005     *
006     * This file is part of the STAR Scheduler.
007     * Copyright (c) 2002-2006 STAR Collaboration - Brookhaven National Laboratory
008     *
009     * STAR Scheduler is free software; you can redistribute it and/or modify
010     * it under the terms of the GNU General Public License as published by
011     * the Free Software Foundation; either version 2 of the License, or
012     * (at your option) any later version.
013     *
014     * STAR Scheduler is distributed in the hope that it will be useful,
015     * but WITHOUT ANY WARRANTY; without even the implied warranty of
016     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017     * GNU General Public License for more details.
018     *
019     * You should have received a copy of the GNU General Public License
020     * along with STAR Scheduler; if not, write to the Free Software
021     * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022     */
023    
024    
025    package gov.bnl.star.offline.scheduler.dataset.datasetManipulators;
026    
027    import gov.bnl.star.offline.scheduler.dataset.Dataset;
028    import gov.bnl.star.offline.scheduler.dataset.Dataset;
029    
030    import gov.bnl.star.offline.scheduler.dataset.Dataset;
031    import gov.bnl.star.offline.scheduler.request.Request;
032    import java.io.File;
033    import java.util.Enumeration;
034    
035    import java.util.*;
036    import java.util.ArrayList;
037    import java.util.ListIterator;
038    import org.apache.log4j.Logger;
039    
040    import java.io.BufferedReader;
041    import java.io.DataInputStream;
042    import java.io.FileInputStream;
043    import java.io.FileNotFoundException;
044    import java.io.FileReader;
045    import java.io.IOException;
046    
047    
048    /**
049     * This is a sorter for large datasets that do not fit into 
050     * memory. It sorts subsets of the main dataset, and writes 
051     * each one of these to a file in step one. The in step two 
052     * these are read back and mered into the final sorted dataset.
053     *
054     * $Id
055     * @author Leevnte B. Hajdu  
056     */
057    public class SortByRegX implements DatasetManipulator   {
058        
059        static private Logger log = Logger.getLogger(SortByRegX.class.getName());
060        
061        
062        String sortByCaptureGroup = "(.*)";
063        /** This object is used to sort datasets by a parameter of the values in the dataset. 
064         *   To create the object it must be supply the parameter SortByCaptureGroup.  This is 
065         *   a regular expression string with a capture group. The dataset will be sorted on 
066         *   the capture group.
067         *
068         *   Examples:
069         *   "(.*)"   Sort the whole string lexicographically  
070         *   ".*:{2}.*:{2}.*:{2}(.*):{2}.*:{2}.*:{2}.*"   Sort by the parameter host. 
071         *
072         *    @param sortByCaptureGroup A regular expression capture group.
073         */
074        public SortByRegX(String sortByCaptureGroup) {
075            this.sortByCaptureGroup = sortByCaptureGroup;
076        }
077        
078        
079        
080        String captureGroupsOrder = "$1";
081        /**Sort entries by a set of capture groups
082         *
083         *Example:
084         *
085         *Given:
086         *Typcal entry: 9404397::NFS::BNL::localhost::/star/data41/reco/productionMinBias/ReversedFullField/P05ic/2004/023::st_physics_adc_5023001_raw_1050014.MuDst.root::155
087         *Regular expression for eantry : "[0-9]*::[a-zA-Z]*::[a-zA-Z]*::[a-zA-Z0-9:.]*::[^:]*::[^:/]*::[0-9]*.*"
088         *
089         *Sort by -storage- then by -path- then by -fileName- :
090         *SortByRegX("[0-9]*::([a-zA-Z]*)::[a-zA-Z]*::[a-zA-Z0-9:.]*::([^:]*)::([^:/]*)::[0-9]*.*", "$1$2$3")
091         *
092        **/
093        public SortByRegX(String sortByCaptureGroups, String captureGroupsOrder) {
094            this.sortByCaptureGroup = sortByCaptureGroup;
095            this.captureGroupsOrder = captureGroupsOrder;
096        }       
097        
098        
099        private void WriteSortedSubsetOfDataset(FileWriter writer){
100                        //FileWriter writer = new FileWriter(tempFileName);
101                         
102                        Vector keys = new Vector(entryBufferTable.keySet());
103                        Collections.sort(keys); //sort the table keys 
104                        
105                        for(Enumeration e = keys.elements(); e.hasMoreElements();){ //Write the data back out
106                            //super.writeToBuffer((String) entryBufferTable.get((String) e.nextElement())); 
107                            String datasubset = (String) e.nextElement();
108                            writer.writeToBuffer((String) entryBufferTable.get(datasubset));
109                        }
110                        entryBufferTable.clear();
111                        //writer.closeBuffer();
112                        //tempFileName = tempFileName + "~"; //keep adding a "~" for every temp file written
113                        //i = 0;
114            
115        }
116        
117        
118        
119        Hashtable entryBufferTable = new Hashtable();  //The table buffer the part of the dataste being sorted
120        
121        /** Used to pass the dataset to the dataset manipulator
122          * @param dataset The dataset to be modifyed 
123          * @param request The request object of the current request for with will use the dataset 
124         **/    
125        public void modify(Dataset dataset, Request request){ 
126            
127    
128            //System.out.print("Sorting Dataset by : " + sortByCaptureGroup + "...");
129            
130            //dataset.modify(dataset, request);
131            
132            /////////////////////////////////////////  pass #1   //////////////////////////////////////////////
133            List   tempfileList  = new ArrayList(); //The list of file name that have been written as step one
134            String tempFileName = dataset.getDatasetFileBufferName();
135            String datsetEntry;
136            
137            try {
138                
139                BufferedReader currentDataset = new BufferedReader( new FileReader(dataset.getDatasetName() ));
140                
141                int i = 0;
142                
143                while ((datsetEntry = currentDataset.readLine()) != null) {  //read a line from the dataset file
144                    
145                    entryBufferTable.put(datsetEntry.replaceAll(sortByCaptureGroup,captureGroupsOrder) + datsetEntry, datsetEntry); //put the data with it's hash for sorting into a table'
146                    
147                    i ++;
148                    if(maxBufferSize <= i){ //When the buffer is full sort the list and write it out to the temp file (.dataset~)           
149                    
150                        FileWriter writer = new FileWriter(tempFileName);
151                        WriteSortedSubsetOfDataset(writer);
152                         
153                        tempfileList.add(writer);
154                        //tempFileName = tempFileName + ".tmp"; //keep adding a ".tmp" for every temp file written
155                        tempFileName = File.createTempFile("shed", "tmp").getName();
156                        i = 0;
157                        
158                    }   
159                }
160            } catch (FileNotFoundException ex) {
161                ex.printStackTrace();
162            } catch (IOException ex) {
163                ex.printStackTrace();
164            }
165            
166            
167            
168            if(! entryBufferTable.isEmpty()){ //fush out any last files
169                FileWriter writer = new FileWriter(tempFileName);
170                WriteSortedSubsetOfDataset(writer);
171                tempfileList.add(writer);
172            }
173            
174                   
175    /////////////////////////////////////////  pass #2   //////////////////////////////////////////////
176            
177            Hashtable indexs = new Hashtable(); 
178            FileWriter reader = null;
179            
180            //setup the hash table
181            ListIterator e = tempfileList.listIterator();
182            while(e.hasNext()){ 
183                reader = (FileWriter) e.next(); //get all the writters from the list
184                
185                   
186                if(reader.getNextLine() != null){ //set up hash table as <currentLine><FileWriter>
187                    indexs.put(reader.currentLine.replaceAll(sortByCaptureGroup, captureGroupsOrder) + reader.currentLine, reader);
188                }    
189            }
190    
191            try {
192                
193                while(! indexs.isEmpty()  ){
194                    Vector keys = new Vector(indexs.keySet());
195                    Collections.sort(keys);
196                    String key = (String) keys.elements().nextElement();
197                    reader = (FileWriter) indexs.get(key);
198                    dataset.writeToBuffer(reader.currentLine);
199                    indexs.remove(key);
200                    if(reader.getNextLine() != null){
201                        indexs.put(reader.currentLine.replaceAll(sortByCaptureGroup, captureGroupsOrder) + reader.currentLine, reader);   
202                    }
203                    else{
204                      reader.deleteBuffer();
205                    }
206                }
207                
208            } catch (Exception ex) {
209                System.out.println(ex.toString());
210                ex.printStackTrace();
211            }
212                
213    
214            
215    ///////////////////////////////////////end of pass two///////////////////////////////////////////
216            
217            
218            dataset.swap_buffer_dataset_with_dataset();
219            
220            //System.out.println("Done");  
221       
222        }
223        
224        
225        int maxBufferSize = 10000;
226        /**Sets how big the the max buffer size in memory sould be. 
227         *The min value must be bigger then 1,000. The bigger the value the faster the list wi;; be sorted.
228         * The default value is 1000 entrys.
229         */
230         public void setMaxBufferSize(int maxBufferSize){
231             
232             //error trap
233            if(maxBufferSize >= 100000) throw new RuntimeException("error: SortDatasetBy: The max buffer size must be at least 1000.");
234             
235             this.maxBufferSize = maxBufferSize;
236         }
237         public int getMaxBufferSize(){return maxBufferSize;}
238        
239         
240         
241         class FileWriter extends Dataset{
242             
243                
244             public FileWriter(String fileName){
245                super.setDatasetName(fileName);   
246             }
247                
248             public void closeReader(){
249                try {
250                    currentDataset.close();
251                } catch (IOException ex) {
252                    System.out.println("Wrinning could not close FileReader.");
253                    ex.printStackTrace();
254                }  
255             }
256             
257             
258             BufferedReader currentDataset = null;
259             public String currentLine = null;
260             
261             
262             String getNextLine(){
263                try {
264                    if(currentDataset == null) currentDataset = new BufferedReader( new FileReader(super.getDatasetFileBufferName()));
265                     currentLine = currentDataset.readLine();
266                     return currentLine;
267                } catch (FileNotFoundException ex) {
268                    return null;
269                } catch (IOException ex) {
270                    return null;
271                }   
272             }        
273    
274    
275         }
276         
277         
278         
279         public boolean requirementsSatisfied() {
280            return true;
281        }
282         
283    
284    }