/* filesrt.c */ /* COPYRIGHT (C) 1996-2002 Norbert H. Doerry June 2002 August 1997 June 1997 February 1997 January 1997 November 1996 SYNTAX FILESRT Infile -oOutfile -fXY where Infile = Filename of input text file containing records delimited by newline characters and fields delimited by whitespace characters. Blank lines and lines starting with ! are ignored. Outfile = Output File sorted. If omitted, Outfile is printed to the standard output. -fXY = Field to sort to. X is the field number starting with 1. Fields are separated by whitespaces Y is either A = alphabetic sorting (default) N = numeric sorting Version 1.0b of November 1996 Version 1.0c of January 1997 : fixed minor bug. Version 1.0d of February 1997: fixed minor bug in print_output_file() check for only one tempfile, is so then simply rename the tempfile the output file. replaced isspace() with is_space() Version 1.0e of June 1997: Checked for file renaming error when creating only one tempfile. Had case were file was not renamed properly. Version 1.0f of August 1997: Closed tempfile before renaming attempt when only one tempfile. Also deleted tempfiles at the end. Fixed bug that resulted in some lines being read multiple times. Version 2.0 of August 1997: changed the sorting algorithm to increase speed. Version 2.0a of June 2002: Add GPL, recompile using 32-bit compiler, fixed type-casting error FILESRT can conceivably sort very large files because it breaks the input file into multiple 3600 record chunks, sorts each of the chunks, then prints the files records in the proper order. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. to report bugs, send an email to doerry@aol.com */ #include #include #include #include #define VERSION "2.0a" #define MAXCHAR 1024 #define SORT_ALPHA 1 #define SORT_NUMERIC 2 #define DEBUG 0 /**********************************************************************/ typedef struct FileSort { char *Infile; FILE *in; char *Outfile; FILE *out; int sort_field; /* field strarting with 1 */ int sort_type; /* SORT_ALPHA or SORT_NUMERIC */ int max_nbr_elements; /* maximum number of elements per list */ int max_nbr_lists; /* maximum number of lists per TempFile */ int nbr_tempfile_records; /* the maximum number of records per TempFile */ struct Tempfile *tf; int nbr_TempFile; /* number of temp files to use */ struct List *l; int debug; } FILESORT; typedef struct Tempfile { char filename[13]; FILE *in; long offset; /* pointer in Tempfile for the next line, -1 if at end */ char *field; /* string containing the field for this line */ char line[MAXCHAR]; /* string holding the entire current line */ struct Tempfile *next; int nbr_records; /* number of records stored in this tempfile */ } TEMPFILE; typedef struct Element { char *s; /* character string of the field sorted on */ long offset; /* offset in file for line with the parent search field */ struct Element *next; } ELEMENT; typedef struct List { struct Element *e; struct Element *last; long count; /* count the number of offsets */ } LIST; /*******************************************************************/ void initialize_filesort(FILESORT *); void read_command_line(FILESORT *,int,char **); void print_help(void); void print_debug(FILESORT *); char *copystring(char *); char *strextract(char *,int); void strstrip(char *); void read_file(FILESORT *); void store_line(FILESORT *,char *,long); void sort_list(LIST *,FILESORT *); void sort_sublist(LIST *,FILESORT *); int compare(char *,char *,int ); TEMPFILE *alloc_TEMPFILE(FILESORT *); ELEMENT *alloc_ELEMENT(void); void delete_LIST(LIST *); void free_ELEMENT(ELEMENT *); void print_list(LIST *); void print_output_file(FILESORT *); int is_space(char); /*******************************************************************/ int main(int argc, char **argv) { FILESORT fs; initialize_filesort(&fs); read_command_line(&fs,argc,argv); if (fs.debug) print_debug(&fs); if (fs.out != stdout) printf("Done Reading Command Line\n"); if (fs.out != stdout) printf("Reading File %s\n",fs.Infile); read_file(&fs); if (fs.out != stdout) printf("Done Reading File %s\n",fs.Infile); if (fs.debug > 4) print_list(fs.l); if (fs.out != stdout) printf("Writing File %s\n",fs.Outfile); print_output_file(&fs); return EXIT_SUCCESS; } void initialize_filesort(FILESORT *fs) { fs->Infile = (char *) NULL; fs->in = (FILE *) NULL; fs->Outfile = (char *) NULL; fs->out = (FILE *) NULL; fs->sort_field = 1; fs->sort_type = SORT_ALPHA; fs->max_nbr_elements = 60; /* maximum number of elements per list */ fs->max_nbr_lists = 60; /* maximum number of lists per TempFile */ /* the maximum number of records per TempFile */ fs->nbr_tempfile_records = fs->max_nbr_elements * fs->max_nbr_lists; fs->nbr_TempFile = 0; fs->tf = (TEMPFILE *) NULL; alloc_TEMPFILE(fs); fs->l = (LIST *) calloc((size_t) 1, sizeof(LIST)); if (fs->l == (LIST *) NULL) { fprintf(stderr, " ***ERROR: Out of Memory in initialize_filesort()<%d>\n",__LINE__); exit(EXIT_FAILURE); }; fs->l->e = (ELEMENT *) NULL; fs->l->count = 0l; fs->debug = 0; } void read_command_line(FILESORT *fs,int argc,char **argv) { int i,j; for (i=1 ; i < argc ; i++) { if (argv[i][0] == '-' || argv[i][0] == '/') { if (argv[i][1] == 'o' || argv[i][1] == 'O') { if (fs->out != (FILE *) NULL) { fprintf(stderr," ***ERROR: Output file multiply defined\n"); exit(EXIT_FAILURE); } fs->Outfile = copystring(argv[i]+2); fs->out = fopen(fs->Outfile,"w"); if (fs->out == (FILE *) NULL) { fprintf(stderr," ***ERROR: Unable to open Output File %s\n", fs->Outfile); exit(EXIT_FAILURE); } } else if (argv[i][1] == 'f' || argv[i][1] == 'F') { fs->sort_field = 0; fs->sort_type = SORT_ALPHA; for (j=2 ; isdigit((int)argv[i][j]) ; j++) { fs->sort_field = 10*fs->sort_field + (int)(argv[i][j] - '0'); } if (fs->sort_field < 1) fs->sort_field = 1; if (argv[i][j] == 'a' || argv[i][j] == 'A') fs->sort_type = SORT_ALPHA; else if (argv[i][j] == 'n' || argv[i][j] == 'N') fs->sort_type = SORT_NUMERIC; } else if (argv[i][1] == 'd' || argv[i][1] == 'D') { if (isdigit((int)argv[i][2])) { fs->debug = (int)(argv[i][2]-'0'); } else fs->debug = 1; } else if (argv[i][1] == 'h' || argv[i][1] == 'H' || argv[i][1] == '?') { print_help(); exit(EXIT_SUCCESS); } } else /* input filename */ { if (fs->in != (FILE *) NULL) { fprintf(stderr," ***ERROR: Input file multiply defined\n"); exit(EXIT_FAILURE); } fs->Infile = copystring(argv[i]); fs->in = fopen(fs->Infile,"r"); if (fs->in == (FILE *) NULL) { fprintf(stderr," ***ERROR: Unable to open Input File %s\n", fs->Infile); exit(EXIT_FAILURE); } } } if (fs->in == (FILE *) NULL) { fs->in = stdin; fs->Infile = copystring(""); } if (fs->out == (FILE *) NULL) { fs->out = stdout; fs->Outfile = copystring(""); } } void print_help(void) { printf(" FILESRT version %s <%s>\n\n",VERSION,__DATE__); printf(" SYNTAX: FILESRT Infile -oOutfile -fXY\n"); printf("\n"); printf(" where\n\n"); printf(" Infile = Filename of input text file containing records\n"); printf(" delimited by newline characters and fields delimited\n"); printf(" by whitespace characters. Blank lines and lines\n"); printf(" starting with ! are ignored.\n"); printf(" Outfile = Output File sorted. If omitted, Outfile is printed\n"); printf(" to the standard output.\n"); printf(" -fXY = Field to sort to.\n"); printf(" X is the field number starting with 1.\n"); printf(" Fields are separated by whitespaces.\n"); printf(" Y is either:\n"); printf(" A = alphabetic sorting (default)\n"); printf(" N = numeric sorting\n"); } void print_debug(FILESORT *fs) { printf("Infile = %s\n",fs->Infile); printf("Outfile = %s\n",fs->Outfile); printf("sort_field = %d\n",fs->sort_field); printf("sort_type = %d (%d = SORT_ALPHA, %d = SORT_NUMERIC)\n", fs->sort_type,SORT_ALPHA,SORT_NUMERIC); printf("l->count = %ld\n",fs->l->count); } char *copystring(char *s) { char *ps; if (s == (char *) NULL) ps = (char *) calloc((size_t) 1,sizeof(char)); else ps = (char *) calloc(strlen(s)+1,sizeof(char)); if (ps == (char *) NULL) { fprintf(stderr," ***ERROR: OUT OF MEMORY IN COPYSTRING() <%d>\n",__LINE__); exit(EXIT_FAILURE); } if (s == (char *) NULL) *ps = (char) NULL; else { strcpy(ps,s); } return ps; } char *strextract(char *s,int n) { char *ps,*sn; int ncnt; int slen; int i; for (ncnt = 1,ps = s ; ncnt < n ; ncnt++) { /* skip leading spaces */ while (isspace((int)(*ps))) ps++; /* go past the word */ while (isspace((int)(*ps)) == 0 && *ps != (char) NULL) ps++; } /* skip leading spaces */ while(isspace((int)(*ps))) ps++; /* find the length of the nth word */ for (slen = 0 ; isspace((int)ps[slen]) == 0 && ps[slen] != (char) NULL ; slen++); /* allocate the answer */ sn = (char *) malloc((size_t)((slen+1)*sizeof(char))); if (sn == (char *) NULL) { fprintf(stderr," ***ERROR: Out of Memory in strextract() <%d>\n",__LINE__); exit(EXIT_FAILURE); } /* see if zero length line */ if (slen == 0) { sn[0] = (char) NULL; return sn; } /* copy the string */ for (i=0 ; i < slen ; i++,ps++) sn[i] = *ps; sn[slen] = (char) NULL; return sn; } void strstrip(char *s) { int i,j; if (s == (char *) NULL) return; /* find first non space */ for (i=0;s[i] != (char) NULL && isspace((int)s[i]) ; i++); /* move the entire string left */ for (j=0;s[i] != (char) NULL ;i++,j++) s[j] = s[i]; s[j] = (char) NULL; /* strip trailing spaces */ for (j-- ; j >= 0 && isspace((int)s[j]) ; j--) s[j] = (char) NULL; } void read_file_block(FILESORT *fs,TEMPFILE *tf) { char rdline[MAXCHAR]; long offset; tf->nbr_records = 0; offset = ftell(fs->in); /* get the position in the file for the line */ if (fs->out != stdout) printf("="); while (fgets(rdline,MAXCHAR-1,fs->in)) { if (rdline[0] != '!') /* skip comment lines */ { strstrip(rdline); /* eliminate leading and trailing whitespace */ if (rdline[0] != (char) NULL) /* skip blank lines */ { store_line(fs,rdline,offset); tf->nbr_records ++; if (fs->debug == 4) printf("OFFSET = %ld <> RDLINE = %s\n",offset,rdline); else if (fs->out != stdout) { if (tf->nbr_records % fs->max_nbr_elements == 0) { printf("."); fflush(stdout); } } if (tf->nbr_records == fs->nbr_tempfile_records) break; } } offset = ftell(fs->in); /* get the new position of the file */ } if (fs->out != stdout) printf("\n"); sort_list(fs->l,fs); } void store_line(FILESORT *fs,char *rdline,long offset) { char *ps; ELEMENT *e,*ee,*le; int i; ps = strextract(rdline,fs->sort_field); /* find the field */ if (ps[0] == (char) NULL) /* don't store zero length elements */ { free((void *) ps); return; } /* see if the first entry */ if (fs->l->e == (ELEMENT *) NULL) { fs->l->e = alloc_ELEMENT(); fs->l->e->s = ps; fs->l->e->offset = offset; fs->l->last = fs->l->e; fs->l->count = 1l; } else { /* otherwise should be the last */ fs->l->last->next = alloc_ELEMENT(); fs->l->last->next->s = ps; fs->l->last->next->offset = offset; fs->l->last = fs->l->last->next; fs->l->count++; } } void sort_list(LIST *ls, FILESORT *fs) { LIST **list; int i,j; ELEMENT *e, *ee; int min; /* see if only need to sort this one list */ if (ls->count <= fs->max_nbr_elements) { sort_sublist(ls,fs); return; } if (fs->debug) printf("nbr_lists = %d\nnbr_element = %d\n", fs->max_nbr_lists,fs->max_nbr_elements); /* allocate the sub-list array */ list = (LIST **)calloc((size_t) fs->max_nbr_lists, sizeof(LIST *)); if (list == (LIST **) NULL) { fprintf(stderr,"*** ERROR : Out of memory in sort_list() <%d>\n",__LINE__); exit(EXIT_FAILURE); } /* allocate the sub-lists */ for (i = 0 ; i < fs->max_nbr_lists ; i++) { list[i] = (LIST *)calloc((size_t) 1, sizeof(LIST)); if (list[i] == (LIST *) NULL) { fprintf(stderr,"*** ERROR : Out of memory in sort_list() <%d>\n",__LINE__); exit(EXIT_FAILURE); } list[i]->e = (ELEMENT *) NULL; list[i]->last = (ELEMENT *) NULL; list[i]->count = 0l; } /* break up the list into sub-lists */ for (i = 0, e = ls->e ; i < fs->max_nbr_lists && e != (ELEMENT *) NULL ; i++) { list[i]->e = e; if (list[i]->e != (ELEMENT *) NULL ) list[i]->count = 1l; for (j = 1 ; j < fs->max_nbr_elements && e != (ELEMENT *) NULL; j++) { e = e->next; list[i]->count++; } if (e != (ELEMENT *)NULL) { ee = e->next; e->next = (ELEMENT *) NULL; e = ee; } else { list[i]->count--; } } if (fs->debug) printf("sub-lists created, starting to sort sub-lists\n"); /* sort each sub-list */ if (fs->out != stdout) printf("="); for (i = 0 ; i < fs->max_nbr_lists ; i++) { if (list[i] == (LIST *) NULL) break; if (list[i]->count == 0) break; if (fs->debug) printf("list[%d]->count = %ld\n",i,list[i]->count); if (list[i]->count > 1l) sort_list(list[i],fs); if (fs->out != stdout && list[i]->count == fs->max_nbr_elements) { printf("-"); fflush(stdout); } } if (fs->out != stdout) printf("\n"); if (fs->debug) printf("sub-lists sorted, assembling the list\n"); /* assemble the list */ ls->e = (ELEMENT *) NULL; ls->last = (ELEMENT *) NULL; ls->count = 0l; if (fs->out != stdout) printf("="); while (1) { /* find the first element */ for (min = 0 ; min < fs->max_nbr_lists ; min++) if (list[min]->count > 0) break; /* see if done sorting */ if (min == fs->max_nbr_lists) { break; } /* find the minimum */ for (i = min + 1 ; i < fs->max_nbr_lists ; i++) { if (list[i]->count <= 0) continue; if (compare(list[min]->e->s,list[i]->e->s,fs->sort_type) > 0) { min = i; } } /* insert the minimum onto the end of the list */ if (ls->e == (ELEMENT *) NULL) { ls->e = list[min]->e; ls->last = ls->e; } else { ls->last->next = list[min]->e; ls->last = ls->last->next; } ls->count++; /* take the element out of the sublist */ list[min]->e = list[min]->e->next; list[min]->count--; /* NULL terminate the list */ ls->last->next = (ELEMENT *) NULL; if (fs->out != stdout) { if (ls->count % fs->max_nbr_elements == 0) { printf("*"); fflush(stdout); } } } if (fs->out != stdout) printf("\n"); } void sort_sublist(LIST *ls,FILESORT *fs) { ELEMENT *e,*ne,*be; /* set the base */ be = ls->e; ls->e = (ELEMENT *) NULL; while (be != (ELEMENT *) NULL) { ne = be->next; /* see if the only element in the new list */ if (ls->e == (ELEMENT *) NULL) { ls->e = be; be->next = (ELEMENT *) NULL; } /* see if the first element in the new list */ else if (compare(be->s,ls->e->s,fs->sort_type) < 0) { be->next = ls->e; ls->e = be; } /* otherwise insert in the right place */ else { for (e = ls->e ; e->next != (ELEMENT *) NULL ; e = e->next) { if (compare(be->s,e->next->s,fs->sort_type) < 0) break; } be->next = e->next; e->next = be; } be = ne; } } /* returns neg number if s1 < s2, 0 if s1 = s2, pos number if s1 > s2 */ int compare(char *s1,char *s2,int sort_type) { long l1,l2; char *ps1,*ps2; if (sort_type == SORT_ALPHA) return strcmp(s1,s2); /* pure alphabetic comparison */ if (sort_type != SORT_NUMERIC) return 0; /* don't know the sort, claim equal */ l1 = strtol(s1,&ps1,10); l2 = strtol(s2,&ps2,10); if (l1 > l2) return 1; if (l1 < l2) return -1; return strcmp(ps1,ps2); } ELEMENT *alloc_ELEMENT(void) { ELEMENT *e; e = (ELEMENT *) calloc((size_t) 1, sizeof(ELEMENT)); if (e == (ELEMENT *) NULL) { fprintf(stderr," ***ERROR: Out of Memory in alloc_ELEMENT() <%d>\n",__LINE__); exit(EXIT_FAILURE); } e->s = (char *) NULL; e->next = (ELEMENT *) NULL; return e; } void print_list(LIST *l) { ELEMENT *e; for (e = l->e ; e != (ELEMENT *) NULL ; e = e->next) { printf("s=%s offset=%ld\n",e->s,e->offset); } } void print_tempfile_block(FILESORT *fs,TEMPFILE *tf) { ELEMENT *e; char rdline[MAXCHAR]; int cnt; long offset; cnt = 0; offset = ftell(fs->in); /* get the current position in the file */ if (fs->debug) printf("offset = %ld\n",offset); if (fs->out != stdout) printf("="); for (e = fs->l->e ; e != (ELEMENT *) NULL ; e = e->next) { fseek(fs->in,e->offset,SEEK_SET); fgets(rdline,MAXCHAR-1,fs->in); fputs(rdline,tf->in); if (fs->out != stdout) { cnt++; if (cnt % fs->max_nbr_elements == 0) { printf("+"); fflush(stdout); } } } if (fs->out != stdout) printf("\n"); if (tf->in != stdout) fclose(tf->in); fseek(fs->in,offset,SEEK_SET); /* reset the position in the file */ if (fs->debug) { offset = ftell(fs->in); printf("offset = %ld\n",offset); } } TEMPFILE *alloc_TEMPFILE(FILESORT *fs) { TEMPFILE *tf,*ttf; tf = (TEMPFILE *) calloc((size_t) 1, sizeof(TEMPFILE)); if (tf == (TEMPFILE *) NULL) { fprintf(stderr," ***ERROR: Out of memory in alloc_TempFile() <%d>\n",__LINE__); exit(EXIT_FAILURE); } fs->nbr_TempFile ++; sprintf(tf->filename,"_FSRT_%d.TMP",fs->nbr_TempFile); tf->in = fopen(tf->filename,"w"); if (tf->in == (FILE *) NULL) { fprintf(stderr," ***ERROR: Unable to open temp file %s\n",tf->filename); exit(EXIT_FAILURE); } tf->offset = 0; /* pointer in Tempfile for the next line, -1 if at end */ tf->field = (char *) NULL; /* string containing the field for this line */ tf->line[0] = (char) NULL; /* string holding the entire current line */ tf->next = (TEMPFILE *) NULL; /* attach to the linked list */ if (fs->tf == (TEMPFILE *) NULL) fs->tf = tf; else { for (ttf = fs->tf ; ttf->next != (TEMPFILE *) NULL ; ttf = ttf->next); ttf->next = tf; } return tf; } void read_file(FILESORT *fs) { TEMPFILE *tf; tf = fs->tf; while (1) { /* read in the first set of data */ read_file_block(fs,tf); /* print to the temporary files */ print_tempfile_block(fs,tf); /* free the elements in the LIST */ delete_LIST(fs->l); /* see if done */ if (tf->nbr_records != fs->nbr_tempfile_records) break; /* allocate a new TEMPFILE */ tf = alloc_TEMPFILE(fs); } } void delete_LIST(LIST *l) { free_ELEMENT(l->e); l->e = (ELEMENT *) NULL; l->count = 0; } void free_ELEMENT(ELEMENT *e) { ELEMENT *ee,*nee; for (ee = e; ee != (ELEMENT *) NULL ; ee = nee) { nee = ee->next; free((void *)ee->s); free((void *) ee); } } void print_output_file(FILESORT *fs) { TEMPFILE *tf, *mintf; int cnt,tf_cnt; cnt = 0; /* open the tempfiles and read the first line of each file */ for (tf = fs->tf,tf_cnt=0 ; tf != (TEMPFILE *) NULL ; tf = tf->next) { tf->in = fopen(tf->filename,"r"); if (tf->in == (FILE *) NULL) { fprintf(stderr," ***ERROR: Unable to open file %s\n",tf->filename); exit(EXIT_FAILURE); } if (fgets(tf->line,MAXCHAR,tf->in)) { tf->field = strextract(tf->line,fs->sort_field); } else { tf->field = (char *) NULL; } tf_cnt++; } /* see if only one tempfile */ if (tf_cnt == 1 && fs->out != stdout) { /* only need to rename the tempfile, since it is already sorted */ fclose(fs->out); fclose(fs->tf->in); if (remove(fs->Outfile) == 0) { if (rename(fs->tf->filename,fs->Outfile) == 0) { return; } else { fprintf(stderr," *** WARNING: Unable to rename file %s %s, using copy\n", fs->tf->filename,fs->Outfile); fs->out = fopen(fs->Outfile,"w"); fs->tf->in = fopen(fs->tf->filename,"r"); if (fs->out == (FILE *) NULL) { fprintf(stderr," *** ERROR: Unable to create file %s\n",fs->Outfile); exit(EXIT_FAILURE); } if (fs->tf->in == (FILE *) NULL) { fprintf(stderr," *** ERROR: Unable to open file %s\n",fs->tf->filename); exit(EXIT_FAILURE); } } } else { fprintf(stderr," *** WARNING: Unable to delete file %s\n",fs->Outfile); fs->out = fopen(fs->Outfile,"w"); if (fs->out == (FILE *) NULL) { fprintf(stderr," *** ERROR: Unable to create file %s\n",fs->Outfile); exit(EXIT_FAILURE); } } } /* loop until done */ while (1) { /* find the first field */ for (mintf = fs->tf ; mintf != (TEMPFILE *) NULL ; mintf = mintf->next) { if (mintf->field != (char *) NULL) break; } /* see if done */ if (mintf == (TEMPFILE *) NULL) break; /* find the minimum */ for (tf = mintf->next ; tf != (TEMPFILE *) NULL ; tf = tf->next) { if (tf->field == (char *) NULL) continue; if (compare(mintf->field,tf->field,fs->sort_type) > 0) mintf = tf; } /* print the minimum */ fputs(mintf->line,fs->out); cnt++; if (cnt % 200 == 0 && fs->out != stdout) { printf("*"); fflush(stdout); } /* get the next line from mintf */ free((void *) mintf->field); if (fgets(mintf->line,MAXCHAR,mintf->in)) { mintf->field = strextract(mintf->line,fs->sort_field); } else { mintf->field = (char *) NULL; mintf->line[0] = (char) NULL; fclose(mintf->in); } } /* close the output file */ if (fs->out != stdout) { printf("\n"); fclose(fs->out); } /* delete all of the temp files */ for (tf = fs->tf ; tf != (TEMPFILE *) NULL && fs->debug == 0; tf = tf->next) { remove(tf->filename); } } int is_space(char c) { if (c == ' ' || c == '\t') return 1; return 0; }