From: Lionel Sambuc Date: Wed, 5 Dec 2012 14:36:48 +0000 (+0100) Subject: Upgrading sort, which is needed by lorder X-Git-Tag: v3.2.1~167 X-Git-Url: http://zhaoyanbai.com/repos/%22http:/www.isc.org/icons/zlib_tech.html?a=commitdiff_plain;h=0fbbaa43e914d38ef3af549125d31574117d1ebf;p=minix.git Upgrading sort, which is needed by lorder Change-Id: I64ac0509f4360c947a677600db77e7612a7cbebd --- diff --git a/commands/Makefile b/commands/Makefile index fcca26423..92500969d 100644 --- a/commands/Makefile +++ b/commands/Makefile @@ -23,7 +23,7 @@ SUBDIR= add_route arp ash at backup banner basename btrace cal \ ramdisk rarpd rawspeed rcp rdate readclock \ reboot remsync rev rget rlogin \ rotate rsh rshd service setup shar acksize \ - sleep slip sort spell split sprofalyze sprofdiff srccrc \ + sleep slip spell split sprofalyze sprofdiff srccrc \ stty svclog svrctl swifi sync synctree sysenv \ syslogd tail tcpd tcpdp tcpstat tee telnet \ telnetd term termcap tget time touch tr \ diff --git a/commands/sort/Makefile b/commands/sort/Makefile deleted file mode 100644 index 53f36ee24..000000000 --- a/commands/sort/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -PROG= sort -MAN= - -.include diff --git a/commands/sort/sort.c b/commands/sort/sort.c deleted file mode 100644 index 5481d2779..000000000 --- a/commands/sort/sort.c +++ /dev/null @@ -1,1217 +0,0 @@ -/* sort - sort a file of lines Author: Michiel Huisjes */ - -/* SYNOPSIS: - * sort [-funbirdcmt'x'] [+beg_pos[opts] [-end_pos]] [-o outfile] [file].. - * - * [opts] can be any of - * -f : Fold upper case to lower. - * -n : Sort to numeric value (optional decimal point) implies -b - * -b : Skip leading blanks - * -i : Ignore chars outside ASCII range (040 - 0176) - * -r : Reverse the sense of comparisons. - * -d : Sort to dictionary order. Only letters, digits, comma's and points - * are compared. - * If any of these flags are used in [opts], then they override all global - * ordering for this field. 
- * - * I/O control flags are: - * -u : Print uniq lines only once. - * -c : Check if files are sorted in order. - * -m : Merge already sorted files. - * -o outfile : Name of output file. (Can be one of the input files). - * Default is stdout. - * - : Take stdin as input. - * - * Fields: - * -t'x' : Field separating character is 'x' - * +a.b : Start comparing at field 'a' with offset 'b'. A missing 'b' is - * taken to be 0. - * -a.b : Stop comparing at field 'a' with offset 'b'. A missing 'b' is - * taken to be 0. - * A missing -a.b means the rest of the line. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define OPEN_FILES (OPEN_MAX-4) /* Nr of open files per process */ -#if __minix_vmd -#define MEMORY_SIZE (1024 * 1024) -#else -#define MEMORY_SIZE ((10 * sizeof(int)) * 1024) -#endif - /* Total mem_size */ -#define LINE_SIZE (1024 >> 1) /* Max length of a line */ -#define IO_SIZE (2 * 1024) /* Size of buffered output */ -#define STD_OUT 1 /* Fd of terminal */ - -/* Return status of functions */ -#define OK 0 -#define ERROR -1 - -/* Compare return values */ -#define LOWER -1 -#define SAME 0 -#define HIGHER 1 - -/* Table definitions. */ -#define DICT 0x001 /* Alpha, numeric, letters and . 
*/ -#define ASCII 0x002 /* All between ' ' and '~' */ -#define BLANK 0x004 /* ' ' and '\t' */ -#define DIGIT 0x008 /* 0-9 */ -#define UPPER 0x010 /* A-Z */ - -typedef int BOOL; - -#define FALSE 0 -#define TRUE 1 - -typedef struct { - int fd; /* Fd of file */ - char *buffer; /* Buffer for reads */ - int read_chars; /* Nr of chars actually read in buffer */ - int cnt; /* Nr of chars taken out of buffer */ - char *line; /* Contains line currently used */ -} MERGE; - -MERGE merge_f[OPEN_FILES]; /* Merge structs */ -int buf_size; /* Size of core available for each struct */ - -#define FIELDS_LIMIT 10 /* 1 global + 9 user */ -#define GLOBAL 0 - -typedef struct { - int beg_field, beg_pos; /* Begin field + offset */ - int end_field, end_pos; /* End field + offset. ERROR == EOLN */ - BOOL reverse; /* TRUE if rev. flag set on this field */ - BOOL blanks; - BOOL dictionary; - BOOL fold_case; - BOOL ascii; - BOOL numeric; - BOOL hexmode; -} FIELD; - -/* Field declarations. A total of FILEDS_LIMIT is allowed */ -FIELD fields[FIELDS_LIMIT]; -int field_cnt; /* Nr of field actually assigned */ - -/* Various output control flags */ -BOOL check = FALSE; -BOOL only_merge = FALSE; -BOOL uniq = FALSE; - -char *mem_top; /* Mem_top points to lowest pos of memory. 
*/ -char *cur_pos; /* First free position in mem */ -char **line_table; /* Pointer to the internal line table */ -BOOL in_core = TRUE; /* Set if input cannot all be sorted in core */ - - /* Place where temp_files should be made */ -char temp_files[] = "/tmp/sort.XXXXX.XX"; -char *output_file; /* Name of output file */ -int out_fd; /* Fd to output file (could be STD_OUT) */ -char out_buffer[IO_SIZE]; /* For buffered output */ - -char **argptr; /* Pointer to argv structure */ -int args_offset; /* Nr of args spilled on options */ -int args_limit; /* Nr of args given */ - -char separator; /* Char that separates fields */ -int nr_of_files = 0; /* Nr_of_files to be merged */ -int disabled; /* Nr of files done */ - -char USAGE[] = "Usage: sort [-funbirdcmt'x'] [+beg_pos [-end_pos]] [-o outfile] [file] .."; - -/* Forward declarations */ -int main(int argc, char **argv); -void get_opts(char *ptr, FIELD * field); -void new_field(FIELD * field, int *offset, BOOL beg_fl); -void adjust_options(FIELD * field); -void error(BOOL quit, char *message, char *arg); -void open_outfile(void); -void get_file(int fd, off_t size); -int last_line(void); -void print_table(int fd); -char *file_name(int nr); -void mread(int fd, char *address, int bytes); -void mwrite(int fd, char *address, int bytes); -void sort(void); -void sort_table(int nel); -void incr(int si, int ei); -int cmp_fields(char *el1, char *el2); -void build_field(char *dest, FIELD * field, char *src); -char *skip_fields(char *str, int nf); -int compare(char *el1, char *el2); -int cmp(unsigned char *el1, unsigned char *el2, FIELD * field); -int digits(char *str1, char *str2, BOOL check_sign); -int hexits(char *str1, char *str2); -void files_merge(int file_cnt); -void merge(int start_file, int limit_file); -void put_line(char *line); -MERGE * print(MERGE * merg, int file_cnt); -int read_line(MERGE * merg); -MERGE * skip_lines(MERGE * smallest, int file_cnt); -void uniq_lines(MERGE * merg); -void check_file(int fd, char *file); 
-int length(char *line); -void copy(char *dest, char *src); -char *msbrk(int size); -void mbrk(char *address); -void catch(int dummy); - -/* Table of all chars. 0 means no special meaning. */ -char table[256] = { -/* '^@' to space */ - 0, 0, 0, 0, 0, 0, 0, 0, - 0, BLANK | DICT, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - -/* Space to '0' */ - BLANK | DICT | ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, - ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, - ASCII, ASCII, - -/* '0' until '9' */ - DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, - DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, - DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, - DIGIT | DICT | ASCII, - -/* ASCII from ':' to '@' */ - ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, - -/* Upper case letters 'A' to 'Z' */ - UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, - UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, - UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, - UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, - UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, - UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, - UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, - UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, - UPPER | DICT | ASCII, UPPER | DICT | ASCII, - -/* ASCII from '[' to '`' */ - ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, - -/* Lower case letters from 'a' to 'z' */ - DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, - DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, - DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, - DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, - DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, - DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, - DICT | ASCII, DICT | ASCII, - -/* ASCII from '{' to '~' */ - 
ASCII, ASCII, ASCII, ASCII, - -/* Stuff from -1 to -177 */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0 -}; - - -/* - * Get_opts () assigns the options into the field structure as described in ptr. - * This field structure could be the GLOBAL one. - */ -void get_opts(ptr, field) -register char *ptr; -register FIELD *field; -{ - switch (*ptr) { - case 'b': /* Skip leading blanks */ - field->blanks = TRUE; - break; - case 'd': /* Dictionary order */ - field->dictionary = TRUE; - break; - case 'f': /* Fold upper case to lower */ - field->fold_case = TRUE; - break; - case 'i': /* Skip chars outside ' ' '~' */ - field->ascii = TRUE; - break; - case 'n': /* Sort on numeric */ - field->numeric = TRUE; - field->blanks = TRUE; - break; - case 'x': - field->hexmode = TRUE; - field->blanks = TRUE; - break; - case 'r': /* Reverse comparisons */ - field->reverse = TRUE; - break; - default: /* Illegal options */ - error(TRUE, USAGE, NULL); - } -} - -/* New_field () assigns a new field as described by the arguments. - * A field description is of the form: +a.b[opts] -c.d, where b and d, as well - * as -c.d and [opts] are optional. Nr before digit is field nr. Nr after digit - * is offset from field. 
- */ -void new_field(field, offset, beg_fl) -register FIELD *field; /* Field to assign */ -int *offset; /* Offset in argv structure */ -BOOL beg_fl; /* Assign beg or end of field */ -{ - register char *ptr; - - ptr = argptr[*offset]; - *offset += 1; /* Incr offset to next arg */ - ptr++; - - if (beg_fl) - field->beg_field = atoi(ptr); /* Assign int of first field */ - else - field->end_field = atoi(ptr); - - while (table[*ptr] & DIGIT) /* Skip all digits */ - ptr++; - - if (*ptr == '.') { /* Check for offset */ - ptr++; - if (beg_fl) - field->beg_pos = atoi(ptr); - else - field->end_pos = atoi(ptr); - while (table[*ptr] & DIGIT) /* Skip digits */ - ptr++; - } - if (beg_fl) { - while (*ptr != '\0') /* Check options after field */ - get_opts(ptr++, field); - } - if (beg_fl) { /* Check for end pos */ - ptr = argptr[*offset]; - if (ptr && *ptr == '-' && ((table[*(ptr + 1)] & DIGIT) || *(ptr + 1) == '.')) { - new_field(field, offset, FALSE); - if (field->beg_field > field->end_field) - error(TRUE, "End field is before start field!", NULL); - } else /* No end pos. */ - field->end_field = ERROR; - } -} - -int main(argc, argv) -int argc; -char *argv[]; -{ - int arg_count = 1; /* Offset in argv */ - struct stat st; - register char *ptr; /* Ptr to *argv in use */ - register int fd; - int pid, pow; - - argptr = argv; - cur_pos = mem_top = msbrk(MEMORY_SIZE); /* Find lowest mem. location */ - - while (arg_count < argc && ((ptr = argv[arg_count])[0] == '-' || *ptr == '+')) { - if (*ptr == '-' && *(ptr + 1) == '\0') /* "-" means stdin */ - break; - if (*ptr == '+') { /* Assign field. 
*/ - if (++field_cnt == FIELDS_LIMIT) - error(TRUE, "Too many fields", NULL); - new_field(&fields[field_cnt], &arg_count, TRUE); - } else { /* Get output options */ - while (*++ptr) { - switch (*ptr) { - case 'c': /* Only check file */ - check = TRUE; - break; - case 'm': /* Merge (sorted) files */ - only_merge = TRUE; - break; - case 'u': /* Only give uniq lines */ - uniq = TRUE; - break; - case 'o': /* Name of output file */ - output_file = argv[++arg_count]; - break; - case 't': /* Field separator */ - ptr++; - separator = *ptr; - break; - default: /* Sort options */ - get_opts(ptr, &fields[GLOBAL]); - } - } - arg_count++; - } - } - - for (fd = 1; fd <= field_cnt; fd++) adjust_options(&fields[fd]); - -/* Create name of tem_files 'sort.pid.aa' */ - ptr = &temp_files[10]; - pid = getpid(); - pow = 10000; - while (pow != 0) { - *ptr++ = pid / pow + '0'; - pid %= pow; - pow /= 10; - } - - signal(SIGINT, catch); - -/* Only merge files. Set up */ - if (only_merge) { - args_limit = args_offset = arg_count; - while (argv[args_limit] != NULL) - args_limit++; /* Find nr of args */ - files_merge(args_limit - arg_count); - exit(0); - } - if (arg_count == argc) { /* No args left. 
Use stdin */ - if (check) - check_file(0, NULL); - else - get_file(0, (off_t) 0); - } else - while (arg_count < argc) { /* Sort or check args */ - if (strcmp(argv[arg_count], "-") == 0) - fd = 0; - else if (stat(argv[arg_count], &st) < 0) { - error(FALSE, "Cannot find ", argv[arg_count++]); - continue; - } - - /* Open files */ - else if ((fd = open(argv[arg_count], O_RDONLY)) < 0) { - error(FALSE, "Cannot open ", argv[arg_count++]); - continue; - } - if (check) - check_file(fd, argv[arg_count]); - else /* Get_file reads whole file */ - get_file(fd, st.st_size); - arg_count++; - } - - if (check) exit(0); - - sort(); /* Sort whatever is left */ - - if (nr_of_files == 1) /* Only one file sorted -> don't merge */ - exit(0); - - files_merge(nr_of_files); - return(0); -} - -/* Adjust_options() assigns all global variables set also in the fields - * assigned. - */ -void adjust_options(field) -register FIELD *field; -{ - register FIELD *gfield = &fields[GLOBAL]; - - if (gfield->reverse) field->reverse = TRUE; - if (gfield->blanks) field->blanks = TRUE; - if (gfield->dictionary) field->dictionary = TRUE; - if (gfield->fold_case) field->fold_case = TRUE; - if (gfield->ascii) field->ascii = TRUE; - if (gfield->numeric) field->numeric = TRUE; -} - -/* Error () prints the error message on stderr and exits if quit == TRUE. */ -void error(quit, message, arg) -register BOOL quit; -register char *message, *arg; -{ - write(2, message, strlen(message)); - if (arg != NULL) write(2, arg, strlen(arg)); - perror(" "); - if (quit) exit(1); -} - -/* Open_outfile () assigns to out_fd the fd where the output must go when all - * the sorting is done. - */ -void open_outfile() -{ - if (output_file == NULL) - out_fd = STD_OUT; - else if ((out_fd = creat(output_file, 0644)) < 0) - error(TRUE, "Cannot creat ", output_file); -} - -/* Get_file reads the whole file of filedescriptor fd. If the file is too big - * to keep in core, a partial sort is done, and the output is stashed somewhere. 
- */ -void get_file(fd, size) -int fd; /* Fd of file to read */ -register off_t size; /* Size of file */ -{ - register int i; - int rest; /* Rest in memory */ - char save_ch; /* Used in stdin readings */ - - rest = MEMORY_SIZE - (cur_pos - mem_top); - if (fd == 0) { /* We're reding stdin */ - while ((i = read(0, cur_pos, rest)) > 0) { - if ((cur_pos - mem_top) + i == MEMORY_SIZE) { - in_core = FALSE; - i = last_line(); /* End of last line */ - save_ch = mem_top[i]; - mem_top[i] = '\0'; - sort(); /* Sort core */ - mem_top[i] = save_ch; /* Restore erased char */ - /* Restore last (half read) line */ - for (rest = 0; i + rest != MEMORY_SIZE; rest++) - mem_top[rest] = mem_top[i + rest]; - /* Assign current pos. in memory */ - cur_pos = &mem_top[rest]; - } else { /* Fits, just assign position in mem. */ - cur_pos = cur_pos + i; - *cur_pos = '\0'; - } - - /* Calculate rest of mem */ - rest = MEMORY_SIZE - (cur_pos - mem_top); - } - } - - /* Reading file. Check size */ - else if (size > rest) { /* Won't fit */ - mread(fd, cur_pos, rest); - in_core = FALSE; - i = last_line(); /* Get pos. of last line */ - mem_top[i] = '\0'; /* Truncate */ - (void) lseek(fd, (off_t) (i - MEMORY_SIZE), SEEK_CUR); /* Do this next time */ - size = size - rest - i + MEMORY_SIZE; /* Calculate rest */ - cur_pos = mem_top; /* Reset mem */ - sort(); /* Sort core */ - get_file(fd, size); /* Get rest of file */ - } else { /* Fits. Just read in */ - rest = size; - mread(fd, cur_pos, rest); - cur_pos = cur_pos + rest; /* Reassign cur_pos */ - *cur_pos = '\0'; - (void) close(fd); /* File completed */ - } -} - -/* Last_line () find the last line in core and retuns the offset from the top - * of the memory. - */ -int last_line() -{ - register int i; - - for (i = MEMORY_SIZE - 2; i > 0; i--) - if (mem_top[i] == '\n') break; - return i + 1; -} - -/* Print_table prints the line table in the given file_descriptor. If the fd - * equals ERROR, it opens a temp_file itself. 
- */ -void print_table(fd) -int fd; -{ - register char **line_ptr; /* Ptr in line_table */ - register char *ptr; /* Ptr to line */ - int index = 0; /* Index in output buffer */ - - if (fd == ERROR) { - if ((fd = creat(file_name(nr_of_files), 0644)) < 0) - error(TRUE, "Cannot creat ", file_name(nr_of_files)); - } - for (line_ptr = line_table; *line_ptr != NULL; line_ptr++) { - ptr = *line_ptr; - /* Skip all same lines if uniq is set */ - if (uniq && *(line_ptr + 1) != NULL) { - if (compare(ptr, *(line_ptr + 1)) == SAME) continue; - } - do { /* Print line in a buffered way */ - out_buffer[index++] = *ptr; - if (index == IO_SIZE) { - mwrite(fd, out_buffer, IO_SIZE); - index = 0; - } - } while (*ptr++ != '\n'); - } - mwrite(fd, out_buffer, index);/* Flush buffer */ - (void) close(fd); /* Close file */ - nr_of_files++; /* Increment nr_of_files to merge */ -} - -/* File_name () returns the nr argument from the argument list, or a uniq - * filename if the nr is too high, or the arguments were not merge files. - */ -char *file_name(nr) -register int nr; -{ - if (only_merge) { - if (args_offset + nr < args_limit) return argptr[args_offset + nr]; - } - temp_files[16] = nr / 26 + 'a'; - temp_files[17] = nr % 26 + 'a'; - - return temp_files; -} - -/* Mread () performs a normal read (), but checks the return value. */ -void mread(fd, address, bytes) -int fd; -char *address; -register int bytes; -{ - if (read(fd, address, bytes) < 0 && bytes != 0) - error(TRUE, "Read error", NULL); -} - -/* Mwrite () performs a normal write (), but checks the return value. */ -void mwrite(fd, address, bytes) -int fd; -char *address; -register int bytes; -{ - if (write(fd, address, bytes) != bytes && bytes != 0) - error(TRUE, "Write error", NULL); -} - -/* Sort () sorts the input in memory starting at mem_top. 
*/ -void sort() -{ - register char *ptr = mem_top; - register int count = 0; - -/* Count number of lines in memory */ - while (*ptr) { - if (*ptr++ == '\n') count++; - } - -/* Set up the line table */ - line_table = (char **) msbrk(count * sizeof(char *) + sizeof(char *)); - - count = 1; - ptr = line_table[0] = mem_top; - while (*ptr) { - if (*ptr++ == '\n') line_table[count++] = ptr; - } - - line_table[count - 1] = NULL; - -/* Sort the line table */ - sort_table(count - 1); - -/* Stash output somewhere */ - if (in_core) { - open_outfile(); - print_table(out_fd); - } else - print_table(ERROR); - -/* Free line table */ - mbrk((char *) line_table); -} - -/* Sort_table () sorts the line table consisting of nel elements. */ -void sort_table(nel) -register int nel; -{ - char *tmp; - register int i; - - /* Make heap */ - for (i = (nel >> 1); i >= 1; i--) incr(i, nel); - - /* Sort from heap */ - for (i = nel; i > 1; i--) { - tmp = line_table[0]; - line_table[0] = line_table[i - 1]; - line_table[i - 1] = tmp; - incr(1, i - 1); - } -} - -/* Incr () increments the heap. */ -void incr(si, ei) -register int si, ei; -{ - char *tmp; - - while (si <= (ei >> 1)) { - si <<= 1; - if (si + 1 <= ei && compare(line_table[si - 1], line_table[si]) <= 0) - si++; - if (compare(line_table[(si >> 1) - 1], line_table[si - 1]) >= 0) - return; - tmp = line_table[(si >> 1) - 1]; - line_table[(si >> 1) - 1] = line_table[si - 1]; - line_table[si - 1] = tmp; - } -} - -/* Cmp_fields builds new lines out of the lines pointed to by el1 and el2 and - * puts it into the line1 and line2 arrays. It then calls the cmp () routine - * with the field describing the arguments. 
- */ -int cmp_fields(el1, el2) -register char *el1, *el2; -{ - int i, ret; - char line1[LINE_SIZE], line2[LINE_SIZE]; - - for (i = 0; i < field_cnt; i++) { /* Setup line parts */ - build_field(line1, &fields[i + 1], el1); - build_field(line2, &fields[i + 1], el2); - if ((ret = cmp((unsigned char *) line1, (unsigned char *) line2, - &fields[i + 1])) != SAME) - break; /* If equal, try next field */ - } - -/* Check for reverse flag */ - if (i != field_cnt && fields[i + 1].reverse) return -ret; - -/* Else return the last return value of cmp () */ - return ret; -} - -/* Build_field builds a new line from the src as described by the field. - * The result is put in dest. - */ -void build_field(dest, field, src) -char *dest; /* Holds result */ -register FIELD *field; /* Field description */ -register char *src; /* Source line */ -{ - char *begin = src; /* Remember start location */ - char *last; /* Pointer to end location */ - int i; - -/* Skip begin fields */ - src = skip_fields(src, field->beg_field); - -/* Skip begin positions */ - for (i = 0; i < field->beg_pos && *src != '\n'; i++) src++; - -/* Copy whatever is left */ - copy(dest, src); - -/* If end field is assigned truncate (perhaps) the part copied */ - if (field->end_field != ERROR) { /* Find last field */ - last = skip_fields(begin, field->end_field); -/* Skip positions as given by end fields description */ - for (i = 0; i < field->end_pos && *last != '\n'; i++) last++; - dest[last - src] = '\n';/* Truncate line */ - } -} - -/* Skip_fields () skips nf fields of the line pointed to by str. 
*/ -char *skip_fields(str, nf) -register char *str; -int nf; -{ - while (nf-- > 0) { - if (separator == '\0') {/* Means ' ' or '\t' */ - while (*str != ' ' && *str != '\t' && *str != '\n') str++; - while (table[*str] & BLANK) str++; - } else { - while (*str != separator && *str != '\n') str++; - if (*str == separator) str++; - } - } - return str; /* Return pointer to indicated field */ -} - -/* Compare is called by all sorting routines. It checks if fields assignments - * has been made. if so, it calls cmp_fields (). If not, it calls cmp () and - * reversed the return value if the (global) reverse flag is set. - */ -int compare(el1, el2) -register char *el1, *el2; -{ - int ret; - - if (field_cnt > GLOBAL) return cmp_fields(el1, el2); - - ret = cmp((unsigned char *) el1, (unsigned char *) el2, &fields[GLOBAL]); - return(fields[GLOBAL].reverse) ? -ret : ret; -} - -/* Cmp () is the actual compare routine. It compares according to the - * description given in the field pointer. - */ -int cmp(el1, el2, field) -register unsigned char *el1, *el2; -FIELD *field; -{ - int c1, c2; - - if (field->blanks) { /* Skip leading blanks */ - while (table[*el1] & BLANK) el1++; - while (table[*el2] & BLANK) el2++; - } - if (field->numeric) /* Compare numeric */ - return digits((char *) el1, (char *) el2, TRUE); - if (field->hexmode) /* Compare hex */ - return hexits((char *) el1, (char *) el2); - - for (;;) { - while (*el1 == *el2) { - if (*el1++ == '\n') /* EOLN on both strings */ - return SAME; - el2++; - } - if (*el1 == '\n') /* EOLN on string one */ - return LOWER; - if (*el2 == '\n') return HIGHER; - if (field->ascii) { /* Skip chars outside 040 - 0177 */ - if ((table[*el1] & ASCII) == 0) { - do { - el1++; - } while ((table[*el1] & ASCII) == 0); - continue; - } - if ((table[*el2] & ASCII) == 0) { - do { - el2++; - } while ((table[*el2] & ASCII) == 0); - continue; - } - } - if (field->dictionary) {/* Skip non-dict chars */ - if ((table[*el1] & DICT) == 0) { - do { - el1++; - } 
while ((table[*el1] & DICT) == 0); - continue; - } - if ((table[*el2] & DICT) == 0) { - do { - el2++; - } while ((table[*el2] & DICT) == 0); - continue; - } - } - if (field->fold_case) { /* Fold upper case to lower */ - if (table[c1 = *el1++] & UPPER) c1 += 'a' - 'A'; - if (table[c2 = *el2++] & UPPER) c2 += 'a' - 'A'; - if (c1 == c2) continue; - return c1 - c2; - } - return *el1 - *el2; - } - - /* NOTREACHED */ -} - -int hexits(char *str1, char *str2) -{ - unsigned long v1, v2; - int r1, r2; - r1 = sscanf(str1, "0x%lx", &v1); - r2 = sscanf(str2, "0x%lx", &v2); - - /* ordering based on reasonable hex number */ - if(r1 == 1 && r2 != 1) return HIGHER; - if(r1 != 1 && r2 == 1) return LOWER; - if(r1 != 1 && r2 != 1) return SAME; - - if(v1 > v2) return HIGHER; - if(v1 < v2) return LOWER; - - return SAME; -} - -/* - * Digits compares () the two strings that point to a number of digits followed - * by an optional decimal point. - */ -int digits(str1, str2, check_sign) -register char *str1, *str2; -BOOL check_sign; /* True if sign must be checked */ -{ - BOOL negative = FALSE; /* True if negative numbers */ - int diff, pow, ret; - -/* Check for optional minus or plus sign */ - if (check_sign) { - if (*str1 == '-') { - negative = TRUE; - str1++; - } else if (*str1 == '+') - str1++; - - if (*str2 == '-') { - if (negative == FALSE) return HIGHER; - str2++; - } else if (negative) - return LOWER; - else if (*str2 == '+') - str2++; - } - -/* Keep incrementing as long as digits are available and equal */ - while ((table[*str1] & DIGIT) && table[*str2] & DIGIT) { - if (*str1 != *str2) break; - str1++; - str2++; - } - -/* First check for the decimal point. */ - if (*str1 == '.' || *str2 == '.') { - if (*str1 == '.') { - if (*str2 == '.') /* Both. Check decimal part */ - ret = digits(str1 + 1, str2 + 1, FALSE); - else - ret = (table[*str2] & DIGIT) ? LOWER : HIGHER; - } else - ret = (table[*str1] & DIGIT) ? 
HIGHER : LOWER; - } - -/* Now either two digits differ, or unknown char is seen (e.g. end of string) */ - else if ((table[*str1] & DIGIT) && (table[*str2] & DIGIT)) { - diff = *str1 - *str2; /* Basic difference */ - pow = 0; /* Check power of numbers */ - while (table[*str1++] & DIGIT) pow++; - while (table[*str2++] & DIGIT) pow--; - ret = (pow == 0) ? diff : pow; - } - -/* Unknown char. Check on which string it occurred */ - else { - if ((table[*str1] & DIGIT) == 0) - ret = (table[*str2] & DIGIT) ? LOWER : SAME; - else - ret = HIGHER; - } - -/* Reverse sense of comparisons if negative is true. (-1000 < -1) */ - return(negative) ? -ret : ret; -} - -/* Files_merge () merges all files as indicated by nr_of_files. Merging goes - * in numbers of files that can be opened at the same time. (OPEN_FILES) - */ -void files_merge(file_cnt) -register int file_cnt; /* Nr_of_files to merge */ -{ - register int i; - int limit; - - for (i = 0; i < file_cnt; i += OPEN_FILES) { - /* Merge last files and store in output file */ - if ((limit = i + OPEN_FILES) >= file_cnt) { - open_outfile(); - limit = file_cnt; - } else { /* Merge OPEN_FILES files and store in temp - * file */ - temp_files[16] = file_cnt / 26 + 'a'; - temp_files[17] = file_cnt % 26 + 'a'; - if ((out_fd = creat(temp_files, 0644)) < 0) - error(TRUE, "Cannot creat ", temp_files); - file_cnt++; - } - merge(i, limit); - } - -/* Cleanup mess */ - i = (only_merge) ? args_limit - args_offset : 0; - while (i < file_cnt) (void) unlink(file_name(i++)); -} - -/* Merge () merges the files between start_file and limit_file. */ -void merge(start_file, limit_file) -int start_file, limit_file; -{ - register MERGE *smallest; /* Keeps track of smallest line */ - register int i; - int file_cnt = limit_file - start_file; /* Nr of files to merge */ - -/* Calculate size in core available for file_cnt merge structs */ - buf_size = MEMORY_SIZE / file_cnt - LINE_SIZE; - - mbrk(mem_top); /* First reset mem to lowest loc. 
*/ - disabled = 0; /* All files not done yet */ - -/* Set up merge structures. */ - for (i = start_file; i < limit_file; i++) { - smallest = &merge_f[i - start_file]; - if (!strcmp(file_name(i), "-")) /* File is stdin */ - smallest->fd = 0; - else if ((smallest->fd = open(file_name(i), O_RDONLY)) < 0) { - smallest->fd = ERROR; - error(FALSE, "Cannot open ", file_name(i)); - disabled++; /* Done this file */ - continue; - } - smallest->buffer = msbrk(buf_size); - smallest->line = msbrk(LINE_SIZE); - smallest->cnt = smallest->read_chars = 0; - (void) read_line(smallest); /* Read first line */ - } - - if (disabled == file_cnt) { /* Couldn't open files */ - (void) close(out_fd); - return; - } - -/* Find a merg struct to assign smallest. */ - for (i = 0; i < file_cnt; i++) { - if (merge_f[i].fd != ERROR) { - smallest = &merge_f[i]; - break; - } - } - -/* Loop until all files minus one are done */ - while (disabled < file_cnt - 1) { - if (uniq) /* Skip all same lines */ - smallest = skip_lines(smallest, file_cnt); - else { /* Find smallest line */ - for (i = 0; i < file_cnt; i++) { - if (merge_f[i].fd == ERROR) - continue; /* We've had this one */ - if (compare(merge_f[i].line, smallest->line) < 0) - smallest = &merge_f[i]; - } - } /* Print line and read next */ - smallest = print(smallest, file_cnt); - } - - if (only_merge && uniq) - uniq_lines(smallest); /* Print only uniq lines */ - else /* Print rest of file */ - while (print(smallest, file_cnt) != NULL); - - put_line(NULL); /* Flush output buffer */ -} - -/* Put_line () prints the line into the out_fd filedescriptor. If line equals - * NULL, the out_fd is flushed and closed. 
- */ -void put_line(line) -register char *line; -{ - static int index = 0; /* Index in out_buffer */ - - if (line == NULL) { /* Flush and close */ - mwrite(out_fd, out_buffer, index); - index = 0; - (void) close(out_fd); - return; - } - do { /* Fill out_buffer with line */ - out_buffer[index++] = *line; - if (index == IO_SIZE) { - mwrite(out_fd, out_buffer, IO_SIZE); - index = 0; - } - } while (*line++ != '\n'); -} - -/* - * Print () prints the line of the merg structure and tries to read another one. - * If this fails, it returns the next merg structure which file_descriptor is - * still open. If none could be found, a NIL structure is returned. - */ -MERGE *print(merg, file_cnt) -register MERGE *merg; -int file_cnt; /* Nr of files that are being merged */ -{ - register int i; - - put_line(merg->line); /* Print the line */ - - if (read_line(merg) == ERROR) { /* Read next line */ - for (i = 0; i < file_cnt; i++) { - if (merge_f[i].fd != ERROR) { - merg = &merge_f[i]; - break; - } - } - if (i == file_cnt) /* No more files left */ - return NULL; - } - return merg; -} - -/* Read_line () reads a line from the fd from the merg struct. If the read - * failed, disabled is incremented and the file is closed. Readings are - * done in buf_size bytes. - * Lines longer than LINE_SIZE are silently truncated. 
- */ -int read_line(merg) -register MERGE *merg; -{ - register char *ptr = merg->line - 1; /* Ptr buf that will hold line */ - - do { - ptr++; - if (merg->cnt == merg->read_chars) { /* Read new buffer */ - if ((merg->read_chars = - read(merg->fd, merg->buffer, buf_size)) <= 0) { - (void) close(merg->fd); /* OOPS */ - merg->fd = ERROR; - disabled++; - return ERROR; - } - merg->cnt = 0; - } - *ptr = merg->buffer[merg->cnt++]; /* Assign next char of line */ - if (ptr - merg->line == LINE_SIZE - 1) - *ptr = '\n'; /* Truncate very long lines */ - } while (*ptr != '\n' && *ptr != '\0'); - - if (*ptr == '\0') /* Add '\n' to last line */ - *ptr = '\n'; - *++ptr = '\0'; /* Add '\0' */ - return OK; -} - -/* Skip_lines () skips all same lines in all the files currently being merged. - * It returns a pointer to the merge struct containing the smallest line. - */ -MERGE *skip_lines(smallest, file_cnt) -register MERGE *smallest; -int file_cnt; -{ - register int i; - int ret; - - if (disabled == file_cnt - 1) /* We've had all */ - return smallest; - - for (i = 0; i < file_cnt; i++) { - if (merge_f[i].fd == ERROR || smallest == &merge_f[i]) - continue; /* Don't check same file */ - while ((ret = compare(merge_f[i].line, smallest->line)) == 0) { - if (read_line(&merge_f[i]) == ERROR) break; /* EOF */ - } - if (ret < 0) /* Line wasn't smallest. Try again */ - return skip_lines(&merge_f[i], file_cnt); - } - return smallest; -} - -/* Uniq_lines () prints only the uniq lines out of the fd of the merg struct. 
*/ -void uniq_lines(merg) -register MERGE *merg; -{ - char lastline[LINE_SIZE]; /* Buffer to hold last line */ - - for (;;) { - put_line(merg->line); /* Print this line */ - copy(lastline, merg->line); /* and save it */ - if (read_line(merg) == ERROR) /* Read the next */ - return; - /* Keep reading until lines duffer */ - while (compare(lastline, merg->line) == SAME) - if (read_line(merg) == ERROR) return; - } - - /* NOTREACHED */ -} - -/* - * Check_file () checks if a file is sorted in order according to the arguments - * given in main (). - */ -void check_file(fd, file) -int fd; -char *file; -{ - register MERGE *merg; /* 1 file only */ - char lastline[LINE_SIZE]; /* Save last line */ - register int ret; /* ret status of compare */ - - if (fd == 0) file = "stdin"; - merg = (MERGE *) mem_top; /* Assign MERGE structure */ - merg->buffer = mem_top + sizeof(MERGE); - merg->line = msbrk(LINE_SIZE); - merg->cnt = merg->read_chars = 0; - merg->fd = fd; - buf_size = MEMORY_SIZE - sizeof(MERGE); - - if (read_line(merg) == ERROR) /* Read first line */ - return; - copy(lastline, merg->line); /* and save it */ - - for (;;) { - if (read_line(merg) == ERROR) /* EOF reached */ - break; - if ((ret = compare(lastline, merg->line)) > 0) { - error(FALSE, "Disorder in file ", file); - write(2, merg->line, length(merg->line)); - break; - } else if (ret < 0) /* Copy if lines not equal */ - copy(lastline, merg->line); - else if (uniq) { - error(FALSE, "Non uniq line in file ", file); - write(2, merg->line, length(merg->line)); - break; - } - } - - mbrk(mem_top); /* Reset mem */ -} - -/* Length () returns the length of the argument line including the linefeed. */ -int length(line) -register char *line; -{ - register int i = 1; /* Add linefeed */ - - while (*line++ != '\n') i++; - return i; -} - -/* Copy () copies the src line into the dest line including linefeed. 
*/ -void copy(dest, src) -register char *dest, *src; -{ - while ((*dest++ = *src++) != '\n'); -} - -/* Msbrk() does a sbrk() and checks the return value. */ -char *msbrk(size) -register int size; -{ - register char *address; - - if ((address = sbrk(size)) == (char *) -1) - error(TRUE, "Not enough memory. Use chmem to allocate more", NULL); - return address; -} - -/* Mbrk() does a brk() and checks the return value. */ -void mbrk(address) -char *address; -{ - if (brk(address) == -1) error(TRUE, "Cannot reset memory", NULL); -} - -void catch(dummy) -int dummy; /* to satisfy the prototype */ -{ - register int i; - - signal(SIGINT, SIG_IGN); - only_merge = FALSE; - for (i = 0; i < 26; i++) (void) unlink(file_name(i)); - exit(2); -} diff --git a/man/man1/Makefile b/man/man1/Makefile index 36e63098f..f1edbc0c5 100644 --- a/man/man1/Makefile +++ b/man/man1/Makefile @@ -15,7 +15,7 @@ MAN= ash.1 at.1 banner.1 basename.1 \ paste.1 ping.1 playwave.1 pr.1 prep.1 \ profile.1 ps.1 pwd.1 rcp.1 recwave.1 \ ref.1 remsync.1 rget.1 rlogin.1 rsh.1 rz.1 \ - shar.1 acksize.1 sleep.1 sort.1 spell.1 \ + shar.1 acksize.1 sleep.1 spell.1 \ split.1 stty.1 svc.1 svrctl.1 \ synctree.1 sysenv.1 sz.1 tail.1 tee.1 telnet.1 template.1 \ term.1 termcap.1 tget.1 time.1 tr.1 true.1 \ diff --git a/man/man1/sort.1 b/man/man1/sort.1 deleted file mode 100644 index 9d276ae9c..000000000 --- a/man/man1/sort.1 +++ /dev/null @@ -1,83 +0,0 @@ -.TH SORT 1 -.SH NAME -sort \- sort a file of ASCII lines -.SH SYNOPSIS -\fBsort\fR [\fB\-bcdf\&imnru\fR]\fR [\fB\-t\fIc\fR] [\fB\-o \fIname\fR] [\fB+\fIpos1\fR] [\fB\-\fIpos2\fR] \fIfile\fR ...\fR -.br -.de FL -.TP -\\fB\\$1\\fR -\\$2 -.. -.de EX -.TP 20 -\\fB\\$1\\fR -# \\$2 -.. 
-.SH OPTIONS -.TP 5 -.B \-b -# Skip leading blanks when making comparisons -.TP 5 -.B \-c -# Check to see if a file is sorted -.TP 5 -.B \-d -# Dictionary order: ignore punctuation -.TP 5 -.B \-f -# Fold upper case onto lower case -.TP 5 -.B \-i -# Ignore nonASCII characters -.TP 5 -.B \-m -# Merge presorted files -.TP 5 -.B \-n -# Numeric sort order (decimal) -.TP 5 -.B \-x -# Numeric sort order (hex) -.TP 5 -.B \-o -# Next argument is output file -.TP 5 -.B \-r -# Reverse the sort order -.TP 5 -.B \-t -# Following character is field separator -.TP 5 -.B \-u -# Unique mode (delete duplicate lines) -.SH EXAMPLES -.TP 20 -.B sort \-nr file -# Sort keys numerically, reversed -.TP 20 -.B sort +2 \-4 file -# Sort using fields 2 and 3 as key -.TP 20 -.B sort +2 \-t: \-o out -# Field separator is \fI:\fP -.TP 20 -.B sort +.3 \-.6 -# Characters 3 through 5 form the key -.SH DESCRIPTION -.PP -.I Sort -sorts one or more files. -If no files are specified, \fIstdin\fR is sorted. -Output is written on standard output, unless \fB\-o\fP is specified. -The options \fB+\fIpos1 \fB\-\fIpos2\fR use only fields \fIpos1\fR -up to but not including \fIpos2\fR as the sort key, where a field is a -string of characters delimited by spaces and tabs, unless a different field -delimiter is specified with \fB\-t\fR. -Both \fIpos1\fR and \fIpos2\fR have the form \fIm.n\fR where \fIm\fR tells -the number of fields and \fIn\fR tells the number of characters. -Either \fIm\fR or \fIn\fR may be omitted. -.SH "SEE ALSO" -.BR comm (1), -.BR grep (1), -.BR uniq (1). 
diff --git a/releasetools/nbsd_ports b/releasetools/nbsd_ports index 7d9fb403e..2914f32ac 100644 --- a/releasetools/nbsd_ports +++ b/releasetools/nbsd_ports @@ -69,6 +69,7 @@ 2012/10/17 12:00:00,usr.bin/Makefile 2012/10/17 12:00:00,usr.bin/Makefile.inc 2012/10/17 12:00:00,usr.bin/passwd/Makefile +2012/10/17 12:00:00,usr.bin/sort 2012/10/17 12:00:00,usr.bin/xinstall 2012/10/17 12:00:00,usr.sbin/Makefile 2012/10/17 12:00:00,usr.sbin/Makefile.inc diff --git a/usr.bin/Makefile b/usr.bin/Makefile index 3a7f81399..fbfcea4b6 100644 --- a/usr.bin/Makefile +++ b/usr.bin/Makefile @@ -18,7 +18,7 @@ SUBDIR= \ newgrp \ passwd \ sed seq \ - stat su \ + sort stat su \ tic \ uniq \ xinstall diff --git a/usr.bin/sort/Makefile b/usr.bin/sort/Makefile new file mode 100644 index 000000000..9219a2255 --- /dev/null +++ b/usr.bin/sort/Makefile @@ -0,0 +1,15 @@ +# $NetBSD: Makefile,v 1.8 2009/09/10 22:02:40 dsl Exp $ +# from: @(#)Makefile 8.1 (Berkeley) 6/6/93 + +PROG= sort +SRCS= append.c fields.c files.c fsort.c init.c msort.c sort.c tmp.c +SRCS+= radix_sort.c + +LDADD+=-lutil +DPADD+=${LIBUTIL} + +.if defined(__MINIX) +CPPFLAGS+= -Dlchown=chown -Dlchmod=chmod +.endif # defined(__MINIX) + +.include diff --git a/usr.bin/sort/append.c b/usr.bin/sort/append.c new file mode 100644 index 000000000..07e15e1da --- /dev/null +++ b/usr.bin/sort/append.c @@ -0,0 +1,94 @@ +/* $NetBSD: append.c,v 1.23 2009/11/06 18:34:22 joerg Exp $ */ + +/*- + * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Ben Harris and Jaromir Dolecek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "sort.h" + +__RCSID("$NetBSD: append.c,v 1.23 2009/11/06 18:34:22 joerg Exp $"); + +#include + +/* + * copy sorted lines to output + * Ignore duplicates (marked with -ve keylen) + */ +void +append(RECHEADER **keylist, int nelem, FILE *fp, put_func_t put) +{ + RECHEADER **cpos, **lastkey; + RECHEADER *crec; + + lastkey = keylist + nelem; + if (REVERSE) { + for (cpos = lastkey; cpos-- > keylist;) { + crec = *cpos; + if (crec->keylen >= 0) + put(crec, fp); + } + } else { + for (cpos = keylist; cpos < lastkey; cpos++) { + crec = *cpos; + if (crec->keylen >= 0) + put(crec, fp); + } + } +} diff --git a/usr.bin/sort/fields.c b/usr.bin/sort/fields.c new file mode 100644 index 000000000..9533b1c70 --- /dev/null +++ b/usr.bin/sort/fields.c @@ -0,0 +1,377 @@ +/* $NetBSD: fields.c,v 1.32 2010/12/18 23:09:48 christos Exp $ */ + +/*- + * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Ben Harris and Jaromir Dolecek. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* Subroutines to generate sort keys. */ + +#include "sort.h" + +__RCSID("$NetBSD: fields.c,v 1.32 2010/12/18 23:09:48 christos Exp $"); + +#define SKIP_BLANKS(ptr) { \ + if (BLANK & d_mask[*(ptr)]) \ + while (BLANK & d_mask[*(++(ptr))]); \ +} + +#define NEXTCOL(pos) { \ + if (!SEP_FLAG) \ + while (BLANK & l_d_mask[*(++pos)]); \ + while ((*(pos+1) != '\0') && !((FLD_D | REC_D_F) & l_d_mask[*++pos]));\ +} + +static u_char *enterfield(u_char *, const u_char *, struct field *, int); +static u_char *number(u_char *, const u_char *, u_char *, u_char *, int); +static u_char *length(u_char *, const u_char *, u_char *, u_char *, int); + +#define DECIMAL_POINT '.' + +/* + * constructs sort key with leading recheader, followed by the key, + * followed by the original line. 
+ */ +length_t +enterkey(RECHEADER *keybuf, const u_char *keybuf_end, u_char *line_data, + size_t line_size, struct field fieldtable[]) + /* keybuf: pointer to start of key */ +{ + int i; + u_char *l_d_mask; + u_char *lineend, *pos; + const u_char *endkey; + u_char *keypos; + struct coldesc *clpos; + int col = 1; + struct field *ftpos; + + l_d_mask = d_mask; + pos = line_data - 1; + lineend = line_data + line_size-1; + /* don't include rec_delimiter */ + + for (i = 0; i < ncols; i++) { + clpos = clist + i; + for (; (col < clpos->num) && (pos < lineend); col++) { + NEXTCOL(pos); + } + if (pos >= lineend) + break; + clpos->start = SEP_FLAG ? pos + 1 : pos; + NEXTCOL(pos); + clpos->end = pos; + col++; + if (pos >= lineend) { + clpos->end = lineend; + i++; + break; + } + } + for (; i <= ncols; i++) + clist[i].start = clist[i].end = lineend; + if (clist[0].start < line_data) + clist[0].start++; + + /* + * We write the sort keys (concatenated) followed by the + * original line data (for output) as the 'keybuf' data. + * keybuf->length is the number of key bytes + data bytes. + * keybuf->offset is the number of key bytes. + * We add a record separator weight after the key in case + * (as is usual) we need to preserve the order of equal lines, + * and for 'sort -u'. + * The key itself will have had the correct weight applied. + */ + keypos = keybuf->data; + endkey = keybuf_end - line_size - 1; + if (endkey <= keypos) + /* No room for any key bytes */ + return 1; + + for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++) { + if ((keypos = enterfield(keypos, endkey, ftpos, + fieldtable->flags)) == NULL) + return (1); + } + + keybuf->offset = keypos - keybuf->data; + keybuf->length = keybuf->offset + line_size; + + /* + * Posix requires that equal keys be further sorted by the + * entire original record. + * NetBSD has (at least for some time) kept equal keys in + * their original order. + * For 'sort -u' posix_sort is unset. + */ + keybuf->keylen = posix_sort ? 
keybuf->length : keybuf->offset; + + memcpy(keypos, line_data, line_size); + return (0); +} + +/* + * constructs a field (as defined by -k) within a key + */ +static u_char * +enterfield(u_char *tablepos, const u_char *endkey, struct field *cur_fld, + int gflags) +{ + u_char *start, *end, *lineend, *mask, *lweight; + struct column icol, tcol; + u_int flags; + + icol = cur_fld->icol; + tcol = cur_fld->tcol; + flags = cur_fld->flags; + start = icol.p->start; + lineend = clist[ncols].end; + if (flags & BI) + SKIP_BLANKS(start); + start += icol.indent; + start = min(start, lineend); + + if (!tcol.num) + end = lineend; + else { + if (tcol.indent) { + end = tcol.p->start; + if (flags & BT) + SKIP_BLANKS(end); + end += tcol.indent; + end = min(end, lineend); + } else + end = tcol.p->end; + } + + if (flags & L) + return length(tablepos, endkey, start, end, flags); + if (flags & N) + return number(tablepos, endkey, start, end, flags); + + /* Bound check space - assuming nothing is skipped */ + if (tablepos + (end - start) + 1 >= endkey) + return NULL; + + mask = cur_fld->mask; + lweight = cur_fld->weights; + for (; start < end; start++) { + if (!mask || mask[*start]) { + *tablepos++ = lweight[*start]; + } + } + /* Add extra byte (absent from lweight) to sort short keys correctly */ + *tablepos++ = lweight[REC_D]; + return tablepos; +} + +/* + * Numbers are converted to a floating point format (exponent & mantissa) + * so that they compare correctly as sequence of unsigned bytes. + * Bytes 0x00 and 0xff are used to terminate positive and negative numbers + * to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12. + * + * The first byte contain the overall sign, exponent sign and some of the + * exponent. These have to be ordered (-ve value, decreasing exponent), + * zero, (+ve value, increasing exponent). + * + * The first byte is 0x80 for zero, 0xc0 for +ve with exponent 0. + * -ve values are the 1's compliments (so 0x7f isn't used!). 
+ * + * This only leaves 63 byte values for +ve exponents - which isn't enough. + * The largest 4 exponent values are used to hold a byte count of the + * number of following bytes that contain 8 exponent bits per byte, + * This lets us sort exponents from -2^31 to +2^31. + * + * The mantissa is stored 2 digits per byte offset by 0x40, for negative + * numbers the order must be reversed (they are bit inverted). + * + * Reverse sorts are done by inverting the sign of the number. + */ +#define MAX_EXP_ENC ((int)sizeof(int)) + +static u_char * +number(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend, + int reverse) +{ + int exponent = -1; + int had_dp = 0; + u_char *tline; + char ch; + unsigned int val; + u_char *last_nz_pos; + u_char negate; + + if (reverse & R) + negate = 0xff; + else + negate = 0; + + /* Give ourselves space for the key terminator */ + bufend--; + + /* Ensure we have enough space for the exponent */ + if (pos + 1 + MAX_EXP_ENC > bufend) + return (NULL); + + SKIP_BLANKS(line); + if (*line == '-') { /* set the sign */ + negate ^= 0xff; + line++; + } + /* eat initial zeroes */ + for (; *line == '0' && line < lineend; line++) + continue; + + /* calculate exponents */ + if (*line == DECIMAL_POINT) { + /* Decimal fraction */ + had_dp = 1; + while (*++line == '0' && line < lineend) + exponent--; + } else { + /* Large (absolute) value, count digits */ + for (tline = line; *tline >= '0' && + *tline <= '9' && tline < lineend; tline++) + exponent++; + } + + /* If the first/next character isn't a digit, value is zero */ + if (*line < '1' || *line > '9' || line >= lineend) { + /* This may be "0", "0.00", "000" or "fubar" but sorts as 0 */ + /* XXX what about NaN, NAN, inf and INF */ + *pos++ = 0x80; + return pos; + } + + /* Maybe here we should allow for e+12 (etc) */ + + if (exponent < 0x40 - MAX_EXP_ENC && -exponent < 0x40 - MAX_EXP_ENC) { + /* Value ok for simple encoding */ + /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */ + 
exponent += 0xc0; + *pos++ = negate ^ exponent; + } else { + /* Out or range for a single byte */ + int c, t; + t = exponent > 0 ? exponent : -exponent; + /* Count how many 8-bit bytes are needed */ + for (c = 0; ; c++) { + t >>= 8; + if (t == 0) + break; + } + /* 'c' better be 0..3 here - but probably 0..1 */ + /* Offset just outside valid range */ + t = c + 0x40 - MAX_EXP_ENC; + if (exponent < 0) + t = -t; + *pos++ = negate ^ (t + 0xc0); + /* now add each byte, most significant first */ + for (; c >= 0; c--) + *pos++ = negate ^ (exponent >> (c * 8)); + } + + /* Finally add mantissa, 2 digits per byte */ + for (last_nz_pos = pos; line < lineend; ) { + if (pos >= bufend) + return NULL; + ch = *line++; + val = (ch - '0') * 10; + if (val > 90) { + if (ch == DECIMAL_POINT && !had_dp) { + had_dp = 1; + continue; + } + break; + } + while (line < lineend) { + ch = *line++; + if (ch == DECIMAL_POINT && !had_dp) { + had_dp = 1; + continue; + } + if (ch < '0' || ch > '9') + line = lineend; + else + val += ch - '0'; + break; + } + *pos++ = negate ^ (val + 0x40); + if (val != 0) + last_nz_pos = pos; + } + + /* Add key terminator, deleting any trailing "00" */ + *last_nz_pos++ = negate; + + return (last_nz_pos); +} + +static u_char * +length(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend, + int flag) +{ + u_char buf[32]; + int l; + SKIP_BLANKS(line); + l = snprintf((char *)buf, sizeof(buf), "%td", lineend - line); + return number(pos, bufend, buf, buf + l, flag); +} diff --git a/usr.bin/sort/files.c b/usr.bin/sort/files.c new file mode 100644 index 000000000..b36f6945a --- /dev/null +++ b/usr.bin/sort/files.c @@ -0,0 +1,276 @@ +/* $NetBSD: files.c,v 1.41 2009/11/06 18:34:22 joerg Exp $ */ + +/*- + * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Ben Harris and Jaromir Dolecek. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "sort.h" +#include "fsort.h" + +__RCSID("$NetBSD: files.c,v 1.41 2009/11/06 18:34:22 joerg Exp $"); + +#include + +/* Align records in temporary files to avoid misaligned copies */ +#define REC_ROUNDUP(n) (((n) + sizeof (long) - 1) & ~(sizeof (long) - 1)) + +static ssize_t seq(FILE *, u_char **); + +/* + * this is called when there is no special key. It's only called + * in the first fsort pass. 
+ */ + +static u_char *opos; +static size_t osz; + +void +makeline_copydown(RECHEADER *recbuf) +{ + memmove(recbuf->data, opos, osz); +} + +int +makeline(FILE *fp, RECHEADER *recbuf, u_char *bufend, struct field *dummy2) +{ + u_char *pos; + int c; + + pos = recbuf->data; + if (osz != 0) { + /* + * Buffer shortage is solved by either of two ways: + * o flush previous buffered data and start using the + * buffer from start. + * makeline_copydown() above must be called. + * o realloc buffer + * + * This code has relied on realloc changing 'bufend', + * but that isn't necessarily true. + */ + pos += osz; + osz = 0; + } + + while (pos < bufend) { + c = getc(fp); + if (c == EOF) { + if (pos == recbuf->data) { + FCLOSE(fp); + return EOF; + } + /* Add terminator to partial line */ + c = REC_D; + } + *pos++ = c; + if (c == REC_D) { + recbuf->offset = 0; + recbuf->length = pos - recbuf->data; + recbuf->keylen = recbuf->length - 1; + return (0); + } + } + + /* Ran out of buffer space... */ + if (recbuf->data < bufend) { + /* Remember where the partial record is */ + osz = pos - recbuf->data; + opos = recbuf->data; + } + return (BUFFEND); +} + +/* + * This generates keys. It's only called in the first fsort pass + */ +int +makekey(FILE *fp, RECHEADER *recbuf, u_char *bufend, struct field *ftbl) +{ + static u_char *line_data; + static ssize_t line_size; + static int overflow = 0; + + /* We get re-entered after returning BUFFEND - save old data */ + if (overflow) { + overflow = enterkey(recbuf, bufend, line_data, line_size, ftbl); + return overflow ? BUFFEND : 0; + } + + line_size = seq(fp, &line_data); + if (line_size == 0) { + FCLOSE(fp); + return EOF; + } + + if (line_size > bufend - recbuf->data) { + overflow = 1; + } else { + overflow = enterkey(recbuf, bufend, line_data, line_size, ftbl); + } + return overflow ? 
BUFFEND : 0; +} + +/* + * get a line of input from fp + */ +static ssize_t +seq(FILE *fp, u_char **line) +{ + static u_char *buf; + static size_t buf_size = DEFLLEN; + u_char *end, *pos; + int c; + u_char *new_buf; + + if (!buf) { + /* one-time initialization */ + buf = malloc(buf_size); + if (!buf) + err(2, "malloc of linebuf for %zu bytes failed", + buf_size); + } + + end = buf + buf_size; + pos = buf; + while ((c = getc(fp)) != EOF) { + *pos++ = c; + if (c == REC_D) { + *line = buf; + return pos - buf; + } + if (pos == end) { + /* Long line - double size of buffer */ + /* XXX: Check here for stupidly long lines */ + buf_size *= 2; + new_buf = realloc(buf, buf_size); + if (!new_buf) + err(2, "realloc of linebuf to %zu bytes failed", + buf_size); + + end = new_buf + buf_size; + pos = new_buf + (pos - buf); + buf = new_buf; + } + } + + if (pos != buf) { + /* EOF part way through line - add line terminator */ + *pos++ = REC_D; + *line = buf; + return pos - buf; + } + + return 0; +} + +/* + * write a key/line pair to a temporary file + */ +void +putrec(const RECHEADER *rec, FILE *fp) +{ + EWRITE(rec, 1, REC_ROUNDUP(offsetof(RECHEADER, data) + rec->length), fp); +} + +/* + * write a line to output + */ +void +putline(const RECHEADER *rec, FILE *fp) +{ + EWRITE(rec->data+rec->offset, 1, rec->length - rec->offset, fp); +} + +/* + * write dump of key to output (for -Dk) + */ +void +putkeydump(const RECHEADER *rec, FILE *fp) +{ + EWRITE(rec, 1, REC_ROUNDUP(offsetof(RECHEADER, data) + rec->offset), fp); +} + +/* + * get a record from a temporary file. (Used by merge sort.) + */ +int +geteasy(FILE *fp, RECHEADER *rec, u_char *end, struct field *dummy2) +{ + length_t file_len; + int i; + + (void)sizeof (char[offsetof(RECHEADER, length) == 0 ? 
1 : -1]); + + if ((u_char *)(rec + 1) > end) + return (BUFFEND); + if (!fread(&rec->length, 1, sizeof rec->length, fp)) { + fclose(fp); + return (EOF); + } + file_len = REC_ROUNDUP(offsetof(RECHEADER, data) + rec->length); + if (end - rec->data < (ptrdiff_t)file_len) { + for (i = sizeof rec->length - 1; i >= 0; i--) + ungetc(*((char *) rec + i), fp); + return (BUFFEND); + } + + fread(&rec->length + 1, file_len - sizeof rec->length, 1, fp); + return (0); +} diff --git a/usr.bin/sort/fsort.c b/usr.bin/sort/fsort.c new file mode 100644 index 000000000..a53ad3e71 --- /dev/null +++ b/usr.bin/sort/fsort.c @@ -0,0 +1,214 @@ +/* $NetBSD: fsort.c,v 1.47 2010/02/05 21:58:41 enami Exp $ */ + +/*- + * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Ben Harris and Jaromir Dolecek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Read in a block of records (until 'enough'). + * sort, write to temp file. + * Merge sort temp files into output file + * Small files miss out the temp file stage. + * Large files might get multiple merges. + */ +#include "sort.h" +#include "fsort.h" + +__RCSID("$NetBSD: fsort.c,v 1.47 2010/02/05 21:58:41 enami Exp $"); + +#include +#include + +#define SALIGN(n) ((n+sizeof(length_t)-1) & ~(sizeof(length_t)-1)) + +void +fsort(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl) +{ + RECHEADER **keylist; + RECHEADER **keypos, **keyp; + RECHEADER *buffer; + size_t bufsize = DEFBUFSIZE; + u_char *bufend; + int mfct = 0; + int c, nelem; + get_func_t get; + RECHEADER *crec; + RECHEADER *nbuffer; + FILE *fp, *tmp_fp; + int file_no; + int max_recs = DEBUG('m') ? 16 : MAXNUM; + + buffer = allocrec(NULL, bufsize); + bufend = (u_char *)buffer + bufsize; + /* Allocate double length keymap for radix_sort */ + keylist = malloc(2 * max_recs * sizeof(*keylist)); + if (buffer == NULL || keylist == NULL) + err(2, "failed to malloc initial buffer or keylist"); + + if (SINGL_FLD) + /* Key and data are one! 
*/ + get = makeline; + else + /* Key (merged key fields) added before data */ + get = makekey; + + file_no = 0; +#if defined(__minix) + /* LSC FIXME: Not very pretty, but reduce the diff */ +#include "pathnames.h" + if (!strcmp(filelist->names[0], _PATH_STDIN)) + fp = stdin; + else +#endif /* defined(__minix) */ + fp = fopen(filelist->names[0], "r"); + if (fp == NULL) + err(2, "%s", filelist->names[0]); + + /* Loop through reads of chunk of input files that get sorted + * and then merged together. */ + for (;;) { + keypos = keylist; + nelem = 0; + crec = buffer; + makeline_copydown(crec); + + /* Loop reading records */ + for (;;) { + c = get(fp, crec, bufend, ftbl); + /* 'c' is 0, EOF or BUFFEND */ + if (c == 0) { + /* Save start of key in input buffer */ + *keypos++ = crec; + if (++nelem == max_recs) { + c = BUFFEND; + break; + } + crec = (RECHEADER *)(crec->data + SALIGN(crec->length)); + continue; + } + if (c == EOF) { + /* try next file */ + if (++file_no >= nfiles) + /* no more files */ + break; +#if defined(__minix) + if (!strcmp(filelist->names[0], _PATH_STDIN)) + fp = stdin; + else +#endif /* defined(__minix) */ + fp = fopen(filelist->names[file_no], "r"); + if (fp == NULL) + err(2, "%s", filelist->names[file_no]); + continue; + } + if (nelem >= max_recs + || (bufsize >= MAXBUFSIZE && nelem > 8)) + /* Need to sort and save this lot of data */ + break; + + /* c == BUFFEND, and we can process more data */ + /* Allocate a larger buffer for this lot of data */ + bufsize *= 2; + nbuffer = allocrec(buffer, bufsize); + if (!nbuffer) { + err(2, "failed to realloc buffer to %zu bytes", + bufsize); + } + + /* patch up keylist[] */ + for (keyp = &keypos[-1]; keyp >= keylist; keyp--) + *keyp = nbuffer + (*keyp - buffer); + + crec = nbuffer + (crec - buffer); + buffer = nbuffer; + bufend = (u_char *)buffer + bufsize; + } + + /* Sort this set of records */ + radix_sort(keylist, keylist + max_recs, nelem); + + if (c == EOF && mfct == 0) { + /* all the data is (sorted) in 
the buffer */ + append(keylist, nelem, outfp, + DEBUG('k') ? putkeydump : putline); + break; + } + + /* Save current data to a temporary file for a later merge */ + if (nelem != 0) { + tmp_fp = ftmp(); + append(keylist, nelem, tmp_fp, putrec); + save_for_merge(tmp_fp, geteasy, ftbl); + } + mfct = 1; + + if (c == EOF) { + /* merge to output file */ + merge_sort(outfp, + DEBUG('k') ? putkeydump : putline, ftbl); + break; + } + } + + free(keylist); + keylist = NULL; + free(buffer); + buffer = NULL; +} diff --git a/usr.bin/sort/fsort.h b/usr.bin/sort/fsort.h new file mode 100644 index 000000000..3d8ef4c0f --- /dev/null +++ b/usr.bin/sort/fsort.h @@ -0,0 +1,78 @@ +/* $NetBSD: fsort.h,v 1.17 2009/09/26 21:16:55 dsl Exp $ */ + +/*- + * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Ben Harris and Jaromir Dolecek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)fsort.h 8.1 (Berkeley) 6/6/93 + */ + +#define BUFSIZE (1<<20) +#define MAXNUM 131072 /* low guess at average record count */ +#define BUFFEND (EOF-2) +#define MAXFCT 1000 +#define DEFLLEN 65536 + +/* + * Default (initial) and maximum size of record buffer for fsort(). + * Note that no more than MAXNUM records are stored in the buffer, + * even if the buffer is not full yet. + */ +#define DEFBUFSIZE (1 << 20) /* 1 MB */ +#define MAXBUFSIZE (8 << 20) /* 8 MB */ diff --git a/usr.bin/sort/init.c b/usr.bin/sort/init.c new file mode 100644 index 000000000..0ef34b339 --- /dev/null +++ b/usr.bin/sort/init.c @@ -0,0 +1,448 @@ +/* $NetBSD: init.c,v 1.28 2010/12/18 23:09:48 christos Exp $ */ + +/*- + * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Ben Harris and Jaromir Dolecek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "sort.h" + +__RCSID("$NetBSD: init.c,v 1.28 2010/12/18 23:09:48 christos Exp $"); + +#include +#include + +static void insertcol(struct field *); +static const char *setcolumn(const char *, struct field *); + +/* + * masks of ignored characters. + */ +static u_char dtable[NBINS], itable[NBINS]; + +/* + * parsed key options + */ +struct coldesc *clist = NULL; +int ncols = 0; + +/* + * clist (list of columns which correspond to one or more icol or tcol) + * is in increasing order of columns. + * Fields are kept in increasing order of fields. 
+ */ + +/* + * keep clist in order--inserts a column in a sorted array + */ +static void +insertcol(struct field *field) +{ + int i; + struct coldesc *p; + + /* Make space for new item */ + p = realloc(clist, (ncols + 2) * sizeof(*clist)); + if (!p) + err(1, "realloc"); + clist = p; + memset(&clist[ncols], 0, sizeof(clist[ncols])); + + for (i = 0; i < ncols; i++) + if (field->icol.num <= clist[i].num) + break; + if (field->icol.num != clist[i].num) { + memmove(clist+i+1, clist+i, sizeof(COLDESC)*(ncols-i)); + clist[i].num = field->icol.num; + ncols++; + } + if (field->tcol.num && field->tcol.num != field->icol.num) { + for (i = 0; i < ncols; i++) + if (field->tcol.num <= clist[i].num) + break; + if (field->tcol.num != clist[i].num) { + memmove(clist+i+1, clist+i,sizeof(COLDESC)*(ncols-i)); + clist[i].num = field->tcol.num; + ncols++; + } + } +} + +/* + * matches fields with the appropriate columns--n^2 but who cares? + */ +void +fldreset(struct field *fldtab) +{ + int i; + + fldtab[0].tcol.p = clist + ncols - 1; + for (++fldtab; fldtab->icol.num; ++fldtab) { + for (i = 0; fldtab->icol.num != clist[i].num; i++) + ; + fldtab->icol.p = clist + i; + if (!fldtab->tcol.num) + continue; + for (i = 0; fldtab->tcol.num != clist[i].num; i++) + ; + fldtab->tcol.p = clist + i; + } +} + +/* + * interprets a column in a -k field + */ +static const char * +setcolumn(const char *pos, struct field *cur_fld) +{ + struct column *col; + char *npos; + int tmp; + col = cur_fld->icol.num ? 
(&cur_fld->tcol) : (&cur_fld->icol); + col->num = (int) strtol(pos, &npos, 10); + pos = npos; + if (col->num <= 0 && !(col->num == 0 && col == &(cur_fld->tcol))) + errx(2, "field numbers must be positive"); + if (*pos == '.') { + if (!col->num) + errx(2, "cannot indent end of line"); + ++pos; + col->indent = (int) strtol(pos, &npos, 10); + pos = npos; + if (&cur_fld->icol == col) + col->indent--; + if (col->indent < 0) + errx(2, "illegal offset"); + } + for(; (tmp = optval(*pos, cur_fld->tcol.num)); pos++) + cur_fld->flags |= tmp; + if (cur_fld->icol.num == 0) + cur_fld->icol.num = 1; + return (pos); +} + +int +setfield(const char *pos, struct field *cur_fld, int gflag) +{ + cur_fld->mask = NULL; + + pos = setcolumn(pos, cur_fld); + if (*pos == '\0') /* key extends to EOL. */ + cur_fld->tcol.num = 0; + else { + if (*pos != ',') + errx(2, "illegal field descriptor"); + setcolumn((++pos), cur_fld); + } + if (!cur_fld->flags) + cur_fld->flags = gflag; + if (REVERSE) + /* A local 'r' doesn't invert the global one */ + cur_fld->flags &= ~R; + + /* Assign appropriate mask table and weight table. */ + cur_fld->weights = weight_tables[cur_fld->flags & (R | F)]; + if (cur_fld->flags & I) + cur_fld->mask = itable; + else if (cur_fld->flags & D) + cur_fld->mask = dtable; + + cur_fld->flags |= (gflag & (BI | BT)); + if (!cur_fld->tcol.indent) /* BT has no meaning at end of field */ + cur_fld->flags &= ~BT; + + if (cur_fld->tcol.num + && !(!(cur_fld->flags & BI) && cur_fld->flags & BT) + && (cur_fld->tcol.num <= cur_fld->icol.num + /* indent if 0 -> end of field, i.e. 
okay */ + && cur_fld->tcol.indent != 0 + && cur_fld->tcol.indent < cur_fld->icol.indent)) + errx(2, "fields out of order"); + + insertcol(cur_fld); + return (cur_fld->tcol.num); +} + +int +optval(int desc, int tcolflag) +{ + switch(desc) { + case 'b': + if (!tcolflag) + return BI; + else + return BT; + case 'd': return D; + case 'f': return F; + case 'i': return I; + case 'l': return L; + case 'n': return N; + case 'r': return R; + default: return 0; + } +} + +/* + * Return true if the options found in ARG, according to the getopt + * spec in OPTS, require an additional argv word as an option + * argument. + */ +static int +options_need_argument(const char *arg, const char *opts) +{ + size_t pos; + const char *s; + + /*assert(arg[0] == '-');*/ + + pos = 1; + while (arg[pos]) { + s = strchr(opts, arg[pos]); + if (s == NULL) { + /* invalid option */ + return 0; + } + if (s[1] == ':') { + /* option requires argument */ + if (arg[pos+1] == '\0') { + /* no argument in this arg */ + return 1; + } + else { + /* argument is in this arg; no more options */ + return 0; + } + } + pos++; + } + return 0; +} + +/* + * Replace historic +SPEC arguments with appropriate -kSPEC. + * + * The form can be either a single +SPEC or a pair +SPEC -SPEC. + * The following -SPEC is not recognized unless it follows + * immediately. + */ +void +fixit(int *argc, char **argv, const char *opts) +{ + int i, j, sawplus; + char *vpos, *tpos, spec[20]; + int col, indent; + size_t sz; + + sawplus = 0; + for (i = 1; i < *argc; i++) { + /* + * This loop must stop exactly where getopt will stop. + * Otherwise it turns e.g. "sort x +3" into "sort x + * -k4.1", which will croak if +3 was in fact really a + * file name. In order to do this reliably we need to + * be able to identify argv words that are option + * arguments. + */ + + if (!strcmp(argv[i], "--")) { + /* End of options; stop. 
*/ + break; + } + + if (argv[i][0] == '+') { + /* +POS argument */ + sawplus = 1; + } else if (argv[i][0] == '-' && sawplus && + isdigit((unsigned char)argv[i][1])) { + /* -POS argument */ + sawplus = 0; + } else if (argv[i][0] == '-') { + /* other option */ + sawplus = 0; + if (options_need_argument(argv[i], opts)) { + /* skip over the argument */ + i++; + } + continue; + } else { + /* not an option at all; stop */ + sawplus = 0; + break; + } + + /* + * At this point argv[i] is an old-style spec. The + * sawplus flag used by the above loop logic also + * tells us if it's a +SPEC or -SPEC. + */ + + /* parse spec */ + tpos = argv[i]+1; + col = (int)strtol(tpos, &tpos, 10); + if (*tpos == '.') { + ++tpos; + indent = (int) strtol(tpos, &tpos, 10); + } else + indent = 0; + /* tpos now points to the optional flags */ + + /* + * In the traditional form, x.0 means beginning of line; + * in the new form, x.0 means end of line. Adjust the + * value of INDENT accordingly. + */ + if (sawplus) { + /* +POS */ + col += 1; + indent += 1; + } else { + /* -POS */ + if (indent > 0) + col += 1; + } + + /* make the new style spec */ + sz = snprintf(spec, sizeof(spec), "%d.%d%s", col, indent, + tpos); + + if (sawplus) { + /* Replace the +POS argument with new-style -kSPEC */ + asprintf(&vpos, "-k%s", spec); + argv[i] = vpos; + } else { + /* + * Append the spec to the one from the + * preceding +POS argument, and remove the + * current argv element entirely. + */ + asprintf(&vpos, "%s,%s", argv[i-1], spec); + free(argv[i-1]); + argv[i-1] = vpos; + for (j=i; j < *argc; j++) + argv[j] = argv[j+1]; + *argc -= 1; + i--; + } + } +} + +/* + * ascii, Rascii, Ftable, and RFtable map + * + * Sorting 'weight' tables. + * Convert 'ascii' characters into their sort order. + * The 'F' variants fold lower case to upper equivalent + * The 'R' variants are for reverse sorting. + * + * The record separator (REC_D) never needs a weight, this frees one + * byte value as an 'end of key' marker. 
This must be 0 for normal + * weight tables, and 0xff for reverse weight tables - and is used + * to terminate keys so that short keys sort before (after if reverse) + * longer keys. + * + * The field separator has a normal weight - although it cannot occur + * within a key unless it is the default (space+tab). + * + * All other bytes map to the appropriate value for the sort order. + * Numeric sorts don't need any tables, they are reversed by negation. + * + * Global reverse sorts are done by writing the sorted keys in reverse + * order - the sort itself is stil forwards. + * This means that weights are only ever used when generating keys, any + * sort of the original data bytes is always forwards and unweighted. + * + * Note: this is only good for ASCII sorting. For different LC 's, + * all bets are off. + * + * itable[] and dtable[] are the masks for -i (ignore non-printables) + * and -d (only sort blank and alphanumerics). + */ +void +settables(void) +{ + int i; + int next_weight = 1; + int rev_weight = 254; + + ascii[REC_D] = 0; + Rascii[REC_D] = 255; + Ftable[REC_D] = 0; + RFtable[REC_D] = 255; + + for (i = 0; i < 256; i++) { + if (i == REC_D) + continue; + ascii[i] = next_weight; + Rascii[i] = rev_weight; + if (Ftable[i] == 0) { + Ftable[i] = next_weight; + RFtable[i] = rev_weight; + Ftable[tolower(i)] = next_weight; + RFtable[tolower(i)] = rev_weight; + } + next_weight++; + rev_weight--; + + if (i == '\n' || isprint(i)) + itable[i] = 1; + + if (i == '\n' || i == '\t' || i == ' ' || isalnum(i)) + dtable[i] = 1; + } +} diff --git a/usr.bin/sort/msort.c b/usr.bin/sort/msort.c new file mode 100644 index 000000000..4383c7d3d --- /dev/null +++ b/usr.bin/sort/msort.c @@ -0,0 +1,439 @@ +/* $NetBSD: msort.c,v 1.30 2010/02/05 21:58:42 enami Exp $ */ + +/*- + * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Ben Harris and Jaromir Dolecek. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "sort.h" +#include "fsort.h" + +__RCSID("$NetBSD: msort.c,v 1.30 2010/02/05 21:58:42 enami Exp $"); + +#include +#include +#include +#include + +/* Subroutines using comparisons: merge sort and check order */ +#define DELETE (1) + +/* One input of a merge pass together with the record currently held for it */ +typedef struct mfile { + FILE *fp; /* stream the records are read from */ + get_func_t get; /* reader, called as get(fp, rec, end, ftbl) */ + RECHEADER *rec; /* buffer holding the current record of this input */ + u_char *end; /* limit handed to get(): (u_char *)rec + buffer size */ +} MFILE; + +static int cmp(RECHEADER *, RECHEADER *); +static int insert(struct mfile **, struct mfile *, int, int); +static void merge_sort_fstack(FILE *, put_func_t, struct field *); + +/* + * Number of files merge() can merge in one pass.
+ */ +#define MERGE_FNUM 16 + +static struct mfile fstack[MERGE_FNUM]; +static struct mfile fstack_1[MERGE_FNUM]; +static struct mfile fstack_2[MERGE_FNUM]; +static int fstack_count, fstack_1_count, fstack_2_count; + +void +save_for_merge(FILE *fp, get_func_t get, struct field *ftbl) +{ + FILE *mfp, *mfp1, *mfp2; + + if (fstack_count == MERGE_FNUM) { + /* Must reduce the number of temporary files */ + mfp = ftmp(); + merge_sort_fstack(mfp, putrec, ftbl); + /* Save output in next layer */ + if (fstack_1_count == MERGE_FNUM) { + mfp1 = ftmp(); + memcpy(fstack, fstack_1, sizeof fstack); + merge_sort_fstack(mfp1, putrec, ftbl); + if (fstack_2_count == MERGE_FNUM) { + /* More than 4096 files! */ + mfp2 = ftmp(); + memcpy(fstack, fstack_2, sizeof fstack); + merge_sort_fstack(mfp2, putrec, ftbl); + fstack_2[0].fp = mfp2; + fstack_2_count = 1; + } + fstack_2[fstack_2_count].fp = mfp1; + fstack_2[fstack_2_count].get = geteasy; + fstack_2_count++; + fstack_1_count = 0; + } + fstack_1[fstack_1_count].fp = mfp; + fstack_1[fstack_1_count].get = geteasy; + fstack_1_count++; + fstack_count = 0; + } + + fstack[fstack_count].fp = fp; + fstack[fstack_count++].get = get; +} + +void +fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl) +{ + get_func_t get = SINGL_FLD ? 
makeline : makekey; + FILE *fp; + int i; + + for (i = 0; i < nfiles; i++) { +#if defined(__minix) + /* LSC FIXME: Not very pretty, but reduce the diff */ +#include "pathnames.h" + if (!strcmp(filelist->names[0], _PATH_STDIN)) + fp = stdin; + else +#endif /* defined(__minix) */ + fp = fopen(filelist->names[i], "r"); + if (fp == NULL) + err(2, "%s", filelist->names[i]); + save_for_merge(fp, get, ftbl); + } + + merge_sort(outfp, putline, ftbl); +} + +void +merge_sort(FILE *outfp, put_func_t put, struct field *ftbl) +{ + int count = fstack_1_count + fstack_2_count; + FILE *mfp; + int i; + + if (count == 0) { + /* All files in initial array */ + merge_sort_fstack(outfp, put, ftbl); + return; + } + + count += fstack_count; + + /* Too many files for one merge sort */ + for (;;) { + /* Sort latest 16 files */ + i = count; + if (i > MERGE_FNUM) + i = MERGE_FNUM; + while (fstack_count > 0) + fstack[--i] = fstack[--fstack_count]; + while (i > 0 && fstack_1_count > 0) + fstack[--i] = fstack_1[--fstack_1_count]; + while (i > 0) + fstack[--i] = fstack_2[--fstack_2_count]; + if (count <= MERGE_FNUM) { + /* Got all the data */ + fstack_count = count; + merge_sort_fstack(outfp, put, ftbl); + return; + } + mfp = ftmp(); + fstack_count = count > MERGE_FNUM ? 
MERGE_FNUM : count; + merge_sort_fstack(mfp, putrec, ftbl); + fstack[0].fp = mfp; + fstack[0].get = geteasy; + fstack_count = 1; + count -= MERGE_FNUM - 1; + } +} + +static void +merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl) +{ + struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile; + RECHEADER *new_rec; + u_char *new_end; + void *tmp; + int c, i, nfiles; + size_t sz; + + /* Read one record from each file (read again if a duplicate) */ + for (nfiles = i = 0; i < fstack_count; i++) { + cfile = &fstack[i]; + if (cfile->rec == NULL) { + cfile->rec = allocrec(NULL, DEFLLEN); + cfile->end = (u_char *)cfile->rec + DEFLLEN; + } + rewind(cfile->fp); + + for (;;) { + c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl); + if (c == EOF) + break; + + if (c == BUFFEND) { + /* Double buffer size */ + sz = (cfile->end - (u_char *)cfile->rec) * 2; + cfile->rec = allocrec(cfile->rec, sz); + cfile->end = (u_char *)cfile->rec + sz; + continue; + } + + if (nfiles != 0) { + if (insert(flist, cfile, nfiles, !DELETE)) + /* Duplicate removed */ + continue; + } else + flist[0] = cfile; + nfiles++; + break; + } + } + + if (nfiles == 0) + return; + + /* + * We now loop reading a new record from the file with the + * 'sorted first' existing record. + * As each record is added, the 'first' record is written to the + * output file - maintaining one record from each file in the sorted + * list. + */ + new_rec = allocrec(NULL, DEFLLEN); + new_end = (u_char *)new_rec + DEFLLEN; + for (;;) { + cfile = flist[0]; + c = cfile->get(cfile->fp, new_rec, new_end, ftbl); + if (c == EOF) { + /* Write out last record from now-empty input */ + put(cfile->rec, outfp); + if (--nfiles == 0) + break; + /* Replace from file with now-first sorted record. */ + /* (Moving base 'flist' saves copying everything!) 
*/ + flist++; + continue; + } + if (c == BUFFEND) { + /* Buffer not large enough - double in size */ + sz = (new_end - (u_char *)new_rec) * 2; + new_rec = allocrec(new_rec, sz); + new_end = (u_char *)new_rec +sz; + continue; + } + + /* Swap in new buffer, saving old */ + tmp = cfile->rec; + cfile->rec = new_rec; + new_rec = tmp; + tmp = cfile->end; + cfile->end = new_end; + new_end = tmp; + + /* Add into sort, removing the original first entry */ + c = insert(flist, cfile, nfiles, DELETE); + if (c != 0 || (UNIQUE && cfile == flist[0] + && cmp(new_rec, cfile->rec) == 0)) { + /* Was an unwanted duplicate, restore buffer */ + tmp = cfile->rec; + cfile->rec = new_rec; + new_rec = tmp; + tmp = cfile->end; + cfile->end = new_end; + new_end = tmp; + continue; + } + + /* Write out 'old' record */ + put(new_rec, outfp); + } + + free(new_rec); +} + +/* + * if delete: inserts rec in flist, deletes flist[0]; + * otherwise just inserts *rec in flist. + * Returns 1 if record is a duplicate to be ignored. + */ +static int +insert(struct mfile **flist, struct mfile *rec, int ttop, int delete) +{ + int mid, top = ttop, bot = 0, cmpv = 1; + + for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) { + cmpv = cmp(rec->rec, flist[mid]->rec); + if (cmpv == 0 ) { + if (UNIQUE) + /* Duplicate key, read another record */ + /* NB: This doesn't guarantee to keep any + * particular record. */ + return 1; + /* + * Apply sort by input file order. + * We could truncate the sort is the fileno are + * adjacent - but that is all too hard! + * The fileno cannot be equal, since we only have one + * record from each file (+ flist[0] which never + * comes here). + */ + cmpv = rec < flist[mid] ? 
-1 : 1; + if (REVERSE) + cmpv = -cmpv; + } + if (cmpv < 0) + top = mid; + else + bot = mid; + } + + /* At this point we haven't yet compared against flist[0] */ + + if (delete) { + /* flist[0] is ourselves, only the caller knows the old data */ + if (bot != 0) { + memmove(flist, flist + 1, bot * sizeof(MFILE *)); + flist[bot] = rec; + } + return 0; + } + + /* Inserting original set of records */ + + if (bot == 0 && cmpv != 0) { + /* Doesn't match flist[1], must compare with flist[0] */ + cmpv = cmp(rec->rec, flist[0]->rec); + if (cmpv == 0 && UNIQUE) + return 1; + /* Add matching keys in file order (ie new is later) */ + if (cmpv < 0) + bot = -1; + } + bot++; + memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *)); + flist[bot] = rec; + return 0; +} + +/* + * check order on one file + */ +void +order(struct filelist *filelist, struct field *ftbl) +{ + get_func_t get = SINGL_FLD ? makeline : makekey; + RECHEADER *crec, *prec, *trec; + u_char *crec_end, *prec_end, *trec_end; + FILE *fp; + int c; + +#if defined(__minix) + if (!strcmp(filelist->names[0], _PATH_STDIN)) + fp = stdin; + else +#endif /* defined(__minix) */ + fp = fopen(filelist->names[0], "r"); + if (fp == NULL) + err(2, "%s", filelist->names[0]); + + crec = malloc(offsetof(RECHEADER, data[DEFLLEN])); + crec_end = crec->data + DEFLLEN; + prec = malloc(offsetof(RECHEADER, data[DEFLLEN])); + prec_end = prec->data + DEFLLEN; + + /* XXX this does exit(0) for overlong lines */ + if (get(fp, prec, prec_end, ftbl) != 0) + exit(0); + while (get(fp, crec, crec_end, ftbl) == 0) { + if (0 < (c = cmp(prec, crec))) { + crec->data[crec->length-1] = 0; + errx(1, "found disorder: %s", crec->data+crec->offset); + } + if (UNIQUE && !c) { + crec->data[crec->length-1] = 0; + errx(1, "found non-uniqueness: %s", + crec->data+crec->offset); + } + /* + * Swap pointers so that this record is on place pointed + * to by prec and new record is read to place pointed to by + * crec. 
+ */ + trec = prec; + prec = crec; + crec = trec; + trec_end = prec_end; + prec_end = crec_end; + crec_end = trec_end; + } + exit(0); +} + +static int +cmp(RECHEADER *rec1, RECHEADER *rec2) +{ + int len; + int r; + + /* key is weights */ + len = min(rec1->keylen, rec2->keylen); + r = memcmp(rec1->data, rec2->data, len); + if (r == 0) + r = rec1->keylen - rec2->keylen; + if (REVERSE) + r = -r; + return r; +} diff --git a/usr.bin/sort/pathnames.h b/usr.bin/sort/pathnames.h new file mode 100644 index 000000000..534671e32 --- /dev/null +++ b/usr.bin/sort/pathnames.h @@ -0,0 +1,66 @@ +/* $NetBSD: pathnames.h,v 1.6 2008/04/28 20:24:15 martin Exp $ */ + +/*- + * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Ben Harris and Jaromir Dolecek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)pathnames.h 8.1 (Berkeley) 6/6/93 + */ + +#define _PATH_STDIN "/dev/stdin" diff --git a/usr.bin/sort/radix_sort.c b/usr.bin/sort/radix_sort.c new file mode 100644 index 000000000..4ec5ff89a --- /dev/null +++ b/usr.bin/sort/radix_sort.c @@ -0,0 +1,217 @@ +/* $NetBSD: radix_sort.c,v 1.4 2009/09/19 16:18:00 dsl Exp $ */ + +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy and by Dan Bernstein at New York University, + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#if defined(LIBC_SCCS) && !defined(lint) +#if 0 +static char sccsid[] = "@(#)radixsort.c 8.2 (Berkeley) 4/28/95"; +#else +__RCSID("$NetBSD: radix_sort.c,v 1.4 2009/09/19 16:18:00 dsl Exp $"); +#endif +#endif /* LIBC_SCCS and not lint */ + +/* + * 'stable' radix sort initially from libc/stdlib/radixsort.c + */ + +#include + +#include +#include +#include +#include "sort.h" + +typedef struct { + RECHEADER **sa; /* Base of saved area */ + int sn; /* Number of entries */ + int si; /* index into data for compare */ +} stack; + +static void simplesort(RECHEADER **, int, int); + +#define THRESHOLD 20 /* Divert to simplesort(). 
*/ + +#define empty(s) (s >= sp) +#define pop(a, n, i) a = (--sp)->sa, n = sp->sn, i = sp->si +#define push(a, n, i) sp->sa = a, sp->sn = n, (sp++)->si = i +#define swap(a, b, t) t = a, a = b, b = t + +void +radix_sort(RECHEADER **a, RECHEADER **ta, int n) +{ + u_int count[256], nc, bmin; + u_int c; + RECHEADER **ak, **tai, **lim; + RECHEADER *hdr; + int stack_size = 512; + stack *s, *sp, *sp0, *sp1, temp; + RECHEADER **top[256]; + u_int *cp, bigc; + int data_index = 0; + + if (n < THRESHOLD && !DEBUG('r')) { + simplesort(a, n, 0); + return; + } + + s = emalloc(stack_size * sizeof *s); + memset(&count, 0, sizeof count); + /* Technically 'top' doesn't need zeroing */ + memset(&top, 0, sizeof top); + + sp = s; + push(a, n, data_index); + while (!empty(s)) { + pop(a, n, data_index); + if (n < THRESHOLD && !DEBUG('r')) { + simplesort(a, n, data_index); + continue; + } + + /* Count number of times each 'next byte' occurs */ + nc = 0; + bmin = 255; + lim = a + n; + for (ak = a, tai = ta; ak < lim; ak++) { + hdr = *ak; + if (data_index >= hdr->keylen) { + /* Short key, copy to start of output */ + if (UNIQUE && a != sp->sa) + /* Stop duplicate being written out */ + hdr->keylen = -1; + *a++ = hdr; + n--; + continue; + } + /* Save in temp buffer for distribute */ + *tai++ = hdr; + c = hdr->data[data_index]; + if (++count[c] == 1) { + if (c < bmin) + bmin = c; + nc++; + } + } + /* + * We need save the bounds for each 'next byte' that + * occurs more so we can sort each block. 
+ */ + if (sp + nc > s + stack_size) { + stack_size *= 2; + sp1 = erealloc(s, stack_size * sizeof *s); + sp = sp1 + (sp - s); + s = sp1; + } + + /* Minor optimisation to do the largest set last */ + sp0 = sp1 = sp; + bigc = 2; + /* Convert 'counts' positions, saving bounds for later sorts */ + ak = a; + for (cp = count + bmin; nc > 0; cp++) { + while (*cp == 0) + cp++; + if ((c = *cp) > 1) { + if (c > bigc) { + bigc = c; + sp1 = sp; + } + push(ak, c, data_index+1); + } + ak += c; + top[cp-count] = ak; + *cp = 0; /* Reset count[]. */ + nc--; + } + swap(*sp0, *sp1, temp); + + for (ak = ta+n; --ak >= ta;) /* Deal to piles. */ + *--top[(*ak)->data[data_index]] = *ak; + } + + free(s); +} + +/* insertion sort, short records are sorted before long ones */ +static void +simplesort(RECHEADER **a, int n, int data_index) +{ + RECHEADER **ak, **ai; + RECHEADER *akh; + RECHEADER **lim = a + n; + const u_char *s, *t; + int s_len, t_len; + int i; + int r; + + if (n <= 1) + return; + + for (ak = a+1; ak < lim; ak++) { + akh = *ak; + s = akh->data; + s_len = akh->keylen; + for (ai = ak; ;) { + ai--; + t_len = (*ai)->keylen; + if (t_len != -1) { + t = (*ai)->data; + for (i = data_index; ; i++) { + if (i >= s_len || i >= t_len) { + r = s_len - t_len; + break; + } + r = s[i] - t[i]; + if (r != 0) + break; + } + if (r >= 0) { + if (r == 0 && UNIQUE) { + /* Put record below existing */ + ai[1] = ai[0]; + /* Mark as duplicate - ignore */ + akh->keylen = -1; + } else { + ai++; + } + break; + } + } + ai[1] = ai[0]; + if (ai == a) + break; + } + ai[0] = akh; + } +} diff --git a/usr.bin/sort/sort.1 b/usr.bin/sort/sort.1 new file mode 100644 index 000000000..8d0d46720 --- /dev/null +++ b/usr.bin/sort/sort.1 @@ -0,0 +1,462 @@ +.\" $NetBSD: sort.1,v 1.32 2010/12/18 23:36:23 wiz Exp $ +.\" +.\" Copyright (c) 2000-2003 The NetBSD Foundation, Inc. +.\" All rights reserved. +.\" +.\" This code is derived from software contributed to The NetBSD Foundation +.\" by Ben Harris and Jaromir Dolecek. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS +.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS +.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGE. +.\" +.\" Copyright (c) 1991, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" This code is derived from software contributed to Berkeley by +.\" the Institute of Electrical and Electronics Engineers, Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. 
Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)sort.1 8.1 (Berkeley) 6/6/93 +.\" +.Dd December 18, 2010 +.Dt SORT 1 +.Os +.Sh NAME +.Nm sort +.Nd sort or merge text files +.Sh SYNOPSIS +.Nm sort +.Op Fl bcdfHilmnrSsu +.Oo +.Fl k +.Ar field1 Ns Op Li \&, Ns Ar field2 +.Oc +.Op Fl o Ar output +.Op Fl R Ar char +.Op Fl T Ar dir +.Op Fl t Ar char +.Op Ar +.Sh DESCRIPTION +The +.Nm +utility sorts text files by lines. +Comparisons are based on one or more sort keys extracted +from each line of input, and are performed lexicographically. +By default, if keys are not given, +.Nm +regards each input line as a single field. +.Pp +The following options are available: +.Bl -tag -width Fl +.It Fl c +Check that the single input file is sorted. 
+If the file is not sorted, +.Nm +produces the appropriate error messages and exits with code 1; otherwise, +.Nm +returns 0. +.Nm +.Fl c +produces no output. +.It Fl H +Ignored for compatibility with earlier versions of +.Nm . +.It Fl m +Merge only; the input files are assumed to be pre-sorted. +.It Fl o Ar output +The argument given is the name of an +.Ar output +file to be used instead of the standard output. +This file can be the same as one of the input files. +.It Fl S +Don't use stable sort. +Default is to use stable sort. +.It Fl s +Use stable sort, keeps records with equal keys in their original order. +This is the default. +Provided for compatibility with other +.Nm +implementations only. +.It Fl T Ar dir +Use +.Ar dir +as the directory for temporary files. +The default is the value specified in the environment variable +.Ev TMPDIR No or +.Pa /tmp +if +.Ev TMPDIR +is not defined. +.It Fl u +Unique: suppress all but one in each set of lines having equal keys. +If used with the +.Fl c +option, check that there are no lines with duplicate keys. +.El +.Pp +The following options override the default ordering rules. +When ordering options appear independent of key field +specifications, the requested field ordering rules are +applied globally to all sort keys. +When attached to a specific key (see +.Fl k ) , +the ordering options override +all global ordering options for that key. +.Bl -tag -width Fl +.It Fl d +Only blank space and alphanumeric characters +.\" according +.\" to the current setting of LC_CTYPE +are used +in making comparisons. +.It Fl f +Considers all lowercase characters that have uppercase +equivalents to be the same for purposes of comparison. +.It Fl i +Ignore all non-printable characters. +.It Fl l +Sort by the string length of the field, not by the field itself.
+.It Fl n +An initial numeric string, consisting of optional blank space, optional +minus sign, and zero or more digits (including decimal point) +.\" with +.\" optional radix character and thousands +.\" separator +.\" (as defined in the current locale), +is sorted by arithmetic value. +(The +.Fl n +option no longer implies the +.Fl b +option.) +.It Fl r +Reverse the sense of comparisons. +.El +.Pp +The treatment of field separators can be altered using these options: +.Bl -tag -width Fl +.It Fl b +Ignores leading blank space when determining the start +and end of a restricted sort key. +A +.Fl b +option specified before the first +.Fl k +option applies globally to all +.Fl k +options. +Otherwise, the +.Fl b +option can be attached independently to each +.Ar field +argument of the +.Fl k +option (see below). +Note that the +.Fl b +option has no effect unless key fields are specified. +.It Fl t Ar char +.Ar char +is used as the field separator character. +The initial +.Ar char +is not considered to be part of a field when determining +key offsets (see below). +Each occurrence of +.Ar char +is significant (for example, +.Dq Ar charchar +delimits an empty field). +If +.Fl t +is not specified, the default field separator is a sequence of +blank-space characters, and consecutive blank spaces do +.Em not +delimit an empty field; further, the initial blank space +.Em is +considered part of a field when determining key offsets. +.It Fl R Ar char +.Ar char +is used as the record separator character. +This should be used with discretion; +.Fl R Ar \*[Lt]alphanumeric\*[Gt] +usually produces undesirable results. +The default record separator is newline. +.It Fl k Ar field1 Ns Op Li \&, Ns Ar field2 +Designates the starting position, +.Ar field1 , +and optional ending position, +.Ar field2 , +of a key field. +The +.Fl k +option replaces the obsolescent options +.Cm \(pl Ns Ar pos1 +and +.Fl Ns Ar pos2 . 
+.El +.Pp +The following operands are available: +.Bl -tag -width Ar +.It Ar file +The pathname of a file to be sorted, merged, or checked. +If no +.Ar file +operands are specified, or if +a +.Ar file +operand is +.Fl , +the standard input is used. +.El +.Pp +A field is defined as a minimal sequence of characters followed by a +field separator or a newline character. +By default, the first +blank space of a sequence of blank spaces acts as the field separator. +All blank spaces in a sequence of blank spaces are considered +as part of the next field; for example, all blank spaces at +the beginning of a line are considered to be part of the +first field. +.Pp +Fields are specified +by the +.Fl k +.Ar field1 Ns Op \&, Ns Ar field2 +argument. +A missing +.Ar field2 +argument defaults to the end of a line. +.Pp +The arguments +.Ar field1 +and +.Ar field2 +have the form +.Ar m Ns Li \&. Ns Ar n +and can be followed by one or more of the letters +.Cm b , d , f , i , +.Cm l , n , +and +.Cm r , +which correspond to the options discussed above. +A +.Ar field1 +position specified by +.Ar m Ns Li \&. Ns Ar n +.Pq Ar m , n No \*[Gt] 0 +is interpreted as the +.Ar n Ns th +character in the +.Ar m Ns th +field. +A missing +.Li \&. Ns Ar n +in +.Ar field1 +means +.Ql \&.1 , +indicating the first character of the +.Ar m Ns th +field; if the +.Fl b +option is in effect, +.Ar n +is counted from the first non-blank character in the +.Ar m Ns th +field; +.Ar m Ns Li \&.1b +refers to the first non-blank character in the +.Ar m Ns th +field. +.Pp +A +.Ar field2 +position specified by +.Ar m Ns Li \&. Ns Ar n +is interpreted as +the +.Ar n Ns th +character (including separators) of the +.Ar m Ns th +field. +A missing +.Li \&. Ns Ar n +indicates the last character of the +.Ar m Ns th +field; +.Ar m += \&0 +designates the end of a line. +Thus the option +.Fl k +.Sm off +.Xo +.Ar v Li \&. Ar x Li \&, +.Ar w Li \&. 
Ar y +.Xc +.Sm on +is synonymous with the obsolescent option +.Sm off +.Cm \(pl Ar v-\&1 Li \&. Ar x-\&1 +.Fl Ar w-\&1 Li \&. Ar y ; +.Sm on +when +.Ar y +is omitted, +.Fl k +.Sm off +.Ar v Li \&. Ar x Li \&, Ar w +.Sm on +is synonymous with +.Sm off +.Cm \(pl Ar v-\&1 Li \&. Ar x-\&1 +.Fl Ar w+1 Li \&.0 . +.Sm on +The obsolescent +.Cm \(pl Ns Ar pos1 +.Fl Ns Ar pos2 +option is still supported, except for +.Fl Ns Ar w Ns Li \&.0b , +which has no +.Fl k +equivalent. +.Sh ENVIRONMENT +If the following environment variable exists, it is used by +.Nm . +.Bl -tag -width Ev +.It Ev TMPDIR +.Nm +uses the contents of the +.Ev TMPDIR +environment variable as the path in which to store +temporary files. +.El +.Sh FILES +.Bl -tag -width outputNUMBER+some -compact +.It Pa /tmp/sort.* +Default temporary files. +.It Ar output Ns NUMBER +Temporary file which is used for output if +.Ar output +already exists. +Once sorting is finished, this file replaces +.Ar output +(via +.Xr link 2 +and +.Xr unlink 2 ) . +.El +.Sh EXIT STATUS +Sort exits with one of the following values: +.Bl -tag -width flag -compact +.It 0 +Normal behavior. +.It 1 +On disorder (or non-uniqueness) with the +.Fl c +option +.It 2 +An error occurred. +.El +.Sh SEE ALSO +.Xr comm 1 , +.Xr join 1 , +.Xr uniq 1 , +.Xr qsort 3 , +.Xr radixsort 3 +.Sh HISTORY +A +.Nm +command appeared in +.At v5 . +This +.Nm +implementation appeared in +.Bx 4.4 +and has been used since +.Nx 1.6 . +.Sh BUGS +POSIX requires the locale's thousands separator be ignored in numbers. +It may be faster to sort very large files in pieces and then explicitly +merge them. +.Sh NOTES +This +.Nm +has no limits on input line length (other than imposed by available +memory) or any restrictions on bytes allowed within lines. +.Pp +To protect data +.Nm +.Fl o +calls +.Xr link 2 +and +.Xr unlink 2 , +and thus fails on protected directories. +.Pp +Input files should be text files.
+If a file doesn't end with a record separator (which is typically a newline), the +.Nm +utility silently supplies one. +.Pp +The current +.Nm +uses lexicographic radix sorting, which requires +that sort keys be kept in memory (as opposed to previous versions which used quick +and merge sorts and did not.) +Thus performance depends highly on efficient choice of sort keys, and the +.Fl b +option and the +.Ar field2 +argument of the +.Fl k +option should be used whenever possible. +Similarly, +.Nm +.Fl k1f +is equivalent to +.Nm +.Fl f +and may take twice as long. diff --git a/usr.bin/sort/sort.c b/usr.bin/sort/sort.c new file mode 100644 index 000000000..a0d4f5f20 --- /dev/null +++ b/usr.bin/sort/sort.c @@ -0,0 +1,419 @@ +/* $NetBSD: sort.c,v 1.61 2011/09/16 15:39:29 joerg Exp $ */ + +/*- + * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Ben Harris and Jaromir Dolecek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* Sort sorts a file using an optional user-defined key. + * Sort uses radix sort for internal sorting, and allows + * a choice of merge sort and radix sort for external sorting. + */ + +#include +#include "sort.h" +#include "fsort.h" +#include "pathnames.h" + +#ifndef lint +__COPYRIGHT("@(#) Copyright (c) 1993\ + The Regents of the University of California. All rights reserved."); +#endif /* not lint */ + +__RCSID("$NetBSD: sort.c,v 1.61 2011/09/16 15:39:29 joerg Exp $"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +int REC_D = '\n'; +u_char d_mask[NBINS]; /* flags for rec_d, field_d, */ + +/* + * weight tables. Gweights is one of ascii, Rascii.. 
+ * modified to weight rec_d = 0 (or 255) + */ +u_char *const weight_tables[4] = { ascii, Rascii, Ftable, RFtable }; +u_char ascii[NBINS], Rascii[NBINS], RFtable[NBINS], Ftable[NBINS]; + +int SINGL_FLD = 0, SEP_FLAG = 0, UNIQUE = 0; +int REVERSE = 0; +int posix_sort; + +unsigned int debug_flags = 0; + +static char toutpath[MAXPATHLEN]; + +const char *tmpdir; /* where temporary files should be put */ + +static void cleanup(void); +static void onsignal(int); +__dead static void usage(const char *); + +int +main(int argc, char *argv[]) +{ + int ch, i, stdinflag = 0; + char cflag = 0, mflag = 0; + char *outfile, *outpath = 0; + struct field *fldtab; + size_t fldtab_sz, fld_cnt; + struct filelist filelist; + int num_input_files; + FILE *outfp = NULL; +#if !defined(__minix) + struct rlimit rl; +#endif /* !defined(__minix) */ + struct stat st; + + setlocale(LC_ALL, ""); + +#if !defined(__minix) + /* bump RLIMIT_NOFILE to maximum our hard limit allows */ + if (getrlimit(RLIMIT_NOFILE, &rl) < 0) + err(2, "getrlimit"); + rl.rlim_cur = rl.rlim_max; + if (setrlimit(RLIMIT_NOFILE, &rl) < 0) + err(2, "setrlimit"); +#endif /* !defined(__minix) */ + + d_mask[REC_D = '\n'] = REC_D_F; + d_mask['\t'] = d_mask[' '] = BLANK | FLD_D; + + /* fldtab[0] is the global options. 
*/ + fldtab_sz = 3; + fld_cnt = 0; + fldtab = emalloc(fldtab_sz * sizeof(*fldtab)); + memset(fldtab, 0, fldtab_sz * sizeof(*fldtab)); + +#define SORT_OPTS "bcdD:fHik:lmno:rR:sSt:T:ux" + + /* Convert "+field" args to -f format */ + fixit(&argc, argv, SORT_OPTS); + + if (!(tmpdir = getenv("TMPDIR"))) + tmpdir = _PATH_TMP; + + while ((ch = getopt(argc, argv, SORT_OPTS)) != -1) { + switch (ch) { + case 'b': + fldtab[0].flags |= BI | BT; + break; + case 'c': + cflag = 1; + break; + case 'D': /* Debug flags */ + for (i = 0; optarg[i]; i++) + debug_flags |= 1 << (optarg[i] & 31); + break; + case 'd': case 'f': case 'i': case 'n': case 'l': + fldtab[0].flags |= optval(ch, 0); + break; + case 'H': + /* -H was ; use merge sort for blocks of large files' */ + /* That is now the default. */ + break; + case 'k': + fldtab = erealloc(fldtab, (fldtab_sz + 1) * sizeof(*fldtab)); + memset(&fldtab[fldtab_sz], 0, sizeof(fldtab[0])); + fldtab_sz++; + + setfield(optarg, &fldtab[++fld_cnt], fldtab[0].flags); + break; + case 'm': + mflag = 1; + break; + case 'o': + outpath = optarg; + break; + case 'r': + REVERSE = 1; + break; + case 's': + /* + * Nominally 'stable sort', keep lines with equal keys + * in input file order. (Default for NetBSD) + * (-s for GNU sort compatibility.) + */ + posix_sort = 0; + break; + case 'S': + /* + * Reverse of -s! + * This needs to enforce a POSIX sort where records + * with equal keys are then sorted by the raw data. + * Currently not implemented! + * (using libc radixsort() v sradixsort() doesn't + * have the desired effect.) 
+ */ + posix_sort = 1; + break; + case 't': + if (SEP_FLAG) + usage("multiple field delimiters"); + SEP_FLAG = 1; + d_mask[' '] &= ~FLD_D; + d_mask['\t'] &= ~FLD_D; + d_mask[(u_char)*optarg] |= FLD_D; + if (d_mask[(u_char)*optarg] & REC_D_F) + errx(2, "record/field delimiter clash"); + break; + case 'R': + if (REC_D != '\n') + usage("multiple record delimiters"); + REC_D = *optarg; + if (REC_D == '\n') + break; + if (optarg[1] != '\0') { + char *ep; + int t = 0; + if (optarg[0] == '\\') + optarg++, t = 8; + REC_D = (int)strtol(optarg, &ep, t); + if (*ep != '\0' || REC_D < 0 || + REC_D >= (int)__arraycount(d_mask)) + errx(2, "invalid record delimiter %s", + optarg); + } + d_mask['\n'] = d_mask[' ']; + d_mask[REC_D] = REC_D_F; + break; + case 'T': + /* -T tmpdir */ + tmpdir = optarg; + break; + case 'u': + UNIQUE = 1; + break; + case '?': + default: + usage(NULL); + } + } + + if (UNIQUE) + /* Don't sort on raw record if keys match */ + posix_sort = 0; + + if (cflag && argc > optind+1) + errx(2, "too many input files for -c option"); + if (argc - 2 > optind && !strcmp(argv[argc-2], "-o")) { + outpath = argv[argc-1]; + argc -= 2; + } + if (mflag && argc - optind > (MAXFCT - (16+1))*16) + errx(2, "too many input files for -m option"); + + for (i = optind; i < argc; i++) { + /* allow one occurrence of /dev/stdin */ + if (!strcmp(argv[i], "-") || !strcmp(argv[i], _PATH_STDIN)) { + if (stdinflag) + warnx("ignoring extra \"%s\" in file list", + argv[i]); + else + stdinflag = 1; + + /* change to /dev/stdin if '-' */ + if (argv[i][0] == '-') { + static char path_stdin[] = _PATH_STDIN; + argv[i] = path_stdin; + } + + } else if ((ch = access(argv[i], R_OK))) + err(2, "%s", argv[i]); + } + + if (fldtab[1].icol.num == 0) { + /* No sort key specified */ + if (fldtab[0].flags & (I|D|F|N|L)) { + /* Modified - generate a key that covers the line */ + fldtab[0].flags &= ~(BI|BT); + setfield("1", &fldtab[++fld_cnt], fldtab->flags); + fldreset(fldtab); + } else { + /* Unmodified, just 
compare the line */ + SINGL_FLD = 1; + fldtab[0].icol.num = 1; + } + } else { + fldreset(fldtab); + } + + settables(); + + if (optind == argc) { + static const char * const names[] = { _PATH_STDIN, NULL }; + filelist.names = names; + num_input_files = 1; + } else { + filelist.names = (const char * const *) &argv[optind]; + num_input_files = argc - optind; + } + + if (cflag) { + order(&filelist, fldtab); + /* NOT REACHED */ + } + + if (!outpath) { + toutpath[0] = '\0'; /* path not used in this case */ + outfile = outpath = toutpath; + outfp = stdout; + } else if (lstat(outpath, &st) == 0 + && !S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) { + /* output file exists and isn't character or block device */ + struct sigaction act; + static const int sigtable[] = {SIGHUP, SIGINT, SIGPIPE, +#if defined(__minix) + SIGVTALRM, SIGPROF, 0}; +#else + SIGXCPU, SIGXFSZ, SIGVTALRM, SIGPROF, 0}; +#endif /* defined(__minix) */ + int outfd; + errno = 0; + if (access(outpath, W_OK)) + err(2, "%s", outpath); + (void)snprintf(toutpath, sizeof(toutpath), "%sXXXXXX", + outpath); + if ((outfd = mkstemp(toutpath)) == -1) + err(2, "Cannot create temporary file `%s'", toutpath); + (void)atexit(cleanup); + act.sa_handler = onsignal; + (void) sigemptyset(&act.sa_mask); + act.sa_flags = SA_RESTART | SA_RESETHAND; + for (i = 0; sigtable[i]; ++i) /* always unlink toutpath */ + sigaction(sigtable[i], &act, 0); + outfile = toutpath; + if ((outfp = fdopen(outfd, "w")) == NULL) + err(2, "Cannot open temporary file `%s'", toutpath); + } else { + outfile = outpath; + + if ((outfp = fopen(outfile, "w")) == NULL) + err(2, "output file %s", outfile); + } + + if (mflag) + fmerge(&filelist, num_input_files, outfp, fldtab); + else + fsort(&filelist, num_input_files, outfp, fldtab); + + if (outfile != outpath) { + if (access(outfile, F_OK)) + err(2, "%s", outfile); + + /* + * Copy file permissions bits of the original file. + * st is initialized above, when we create the + * temporary spool file. 
+		 */
+		if (lchmod(outfile, st.st_mode & ALLPERMS) != 0) {
+			err(2, "cannot chmod %s: output left in %s",
+			    outpath, outfile);
+		}
+
+		(void)unlink(outpath);
+		if (link(outfile, outpath))
+			err(2, "cannot link %s: output left in %s",
+			    outpath, outfile);
+		(void)unlink(outfile);
+		toutpath[0] = 0;
+	}
+	exit(0);
+}
+
+/*
+ * Signal handler installed by main() while output is spooled to a temporary
+ * file: remove the temporary file and terminate immediately with status 2.
+ *
+ * The handler is installed with SA_RESTART, so merely returning would let
+ * the interrupted sort carry on writing into a spool file that has just been
+ * unlinked; the failure would only surface at the very end of main().
+ * Only unlink() and _exit() are called here; both are async-signal-safe.
+ */
+static void
+onsignal(int sig)
+{
+	(void)sig;
+	cleanup();
+	_exit(2);
+}
+
+/*
+ * Remove the temporary output file, if one is recorded in toutpath.
+ * Registered with atexit() and also called from onsignal().
+ */
+static void
+cleanup(void)
+{
+	if (toutpath[0])
+		(void)unlink(toutpath);
+}
+
+/*
+ * Print "progname: msg" (when msg is not NULL) followed by the usage
+ * synopsis on stderr, then exit with status 2.  Does not return.
+ */
+static void
+usage(const char *msg)
+{
+	if (msg != NULL)
+		(void)fprintf(stderr, "%s: %s\n", getprogname(), msg);
+	(void)fprintf(stderr,
+	    "usage: %s [-bcdfHilmnrSsu] [-k field1[,field2]] [-o output]"
+	    " [-R char] [-T dir]", getprogname());
+	(void)fprintf(stderr,
+	    " [-t char] [file ...]\n");
+	exit(2);
+}
+
+/*
+ * Resize the record buffer rec through erealloc(), requesting
+ * sizeof(long) - 1 bytes more than size.
+ * NOTE(review): the extra bytes look like alignment slack for the record
+ * headers stored in the buffer -- confirm against the callers.
+ */
+RECHEADER *
+allocrec(RECHEADER *rec, size_t size)
+{
+
+	return (erealloc(rec, size + sizeof(long) - 1));
+}
diff --git a/usr.bin/sort/sort.h b/usr.bin/sort/sort.h
new file mode 100644
index 000000000..677fd4522
--- /dev/null
+++ b/usr.bin/sort/sort.h
@@ -0,0 +1,201 @@
+/* $NetBSD: sort.h,v 1.34 2011/09/16 15:39:29 joerg Exp $ */
+
+/*-
+ * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Ben Harris and Jaromir Dolecek.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC.
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)sort.h	8.1 (Berkeley) 6/6/93
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define NBINS		256
+
+/* values for masks, weights, and other flags. */
+/* R and F get used to index weight_tables[] */
+#define R	0x01	/* Field is reversed */
+#define F	0x02	/* weight lower and upper case the same */
+#define I	0x04	/* mask out non-printable characters */
+#define D	0x08	/* sort alphanumeric characters only */
+#define N	0x10	/* Field is a number */
+#define BI	0x20	/* ignore blanks in icol */
+#define BT	0x40	/* ignore blanks in tcol */
+#define L	0x80	/* Sort by field length */
+
+/* masks for delimiters: blanks, fields, and termination. */
+#define BLANK	1	/* ' ', '\t'; '\n' if -R is invoked */
+#define FLD_D	2	/* ' ', '\t' default; from -t otherwise */
+#define REC_D_F	4	/* '\n' default; from -R otherwise */
+
+/*
+ * NOTE: both macros evaluate their arguments twice; do not pass
+ * expressions with side effects.
+ */
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define max(a, b) ((a) > (b) ? (a) : (b))
+
+/*
+ * Close a stream; a failing fclose() is fatal (exit status 2).
+ * The argument is evaluated once, and is cast for "%p", which requires
+ * a void pointer.
+ */
+#define	FCLOSE(file) {							\
+	FILE *fclose_fp_ = (file);					\
+	if (EOF == fclose(fclose_fp_))					\
+		err(2, "%p", (void *)fclose_fp_);			\
+}
+
+/*
+ * Write n items of size bytes each; anything short of n items is fatal
+ * (exit status 2).  fwrite() signals an error by returning fewer items
+ * than requested, so the result is compared against n, not against zero.
+ * n is evaluated once.
+ */
+#define	EWRITE(ptr, size, n, f) {					\
+	size_t ewrite_n_ = (n);						\
+	if (fwrite((ptr), (size), ewrite_n_, (f)) != ewrite_n_)	\
+		err(2, NULL);						\
+}
+
+/* Records are limited to MAXBUFSIZE (8MB) and less if you want to sort
+ * in a sane way.
+ * Anyone who wants to sort data records longer than 2GB definitely needs a
+ * different program! */
+typedef unsigned int length_t;
+
+/* A record is a key/line pair starting at rec.data. It has a total length
+ * and an offset to the start of the line half of the pair.
+ */
+typedef struct recheader {
+	length_t length;	/* total length of key and line */
+	length_t offset;	/* to line */
+	int keylen;		/* length of key */
+	u_char data[];		/* key then line */
+} RECHEADER;
+
+/* This is the column as seen by struct field. It is used by enterfield.
+ * They are matched with corresponding coldescs during initialization.
+ */
+struct column {
+	struct coldesc *p;
+	int num;
+	int indent;
+};
+
+/* a coldesc has a number and pointers to the beginning and end of the
+ * corresponding column in the current line. This is determined in enterkey.
+ */
+typedef struct coldesc {
+	u_char *start;
+	u_char *end;
+	int num;
+} COLDESC;
+
+/* A field has an initial and final column; an omitted final column
+ * implies the end of the line. Flags regulate omission of blanks and
+ * numerical sorts; mask determines which characters are ignored (from -i, -d);
+ * weights determines the sort weights of a character (from -f, -r).
+ *
+ * The first field contains the global flags etc.
+ * The list terminates when icol = 0.
+ */
+struct field {
+	struct column icol;
+	struct column tcol;
+	u_int flags;
+	u_char *mask;
+	u_char *weights;
+};
+
+struct filelist {
+	const char * const * names;
+};
+
+typedef int (*get_func_t)(FILE *, RECHEADER *, u_char *, struct field *);
+typedef void (*put_func_t)(const RECHEADER *, FILE *);
+
+extern u_char ascii[NBINS], Rascii[NBINS], Ftable[NBINS], RFtable[NBINS];
+extern u_char *const weight_tables[4];	/* ascii, Rascii, Ftable, RFtable */
+extern u_char d_mask[NBINS];
+extern int SINGL_FLD, SEP_FLAG, UNIQUE, REVERSE;
+extern int posix_sort;
+extern int REC_D;
+extern const char *tmpdir;
+extern struct coldesc *clist;
+extern int ncols;
+
+/*
+ * Non-zero when the debug flag selected by the low five bits of ch is set.
+ * The shifted constant is unsigned so that selecting bit 31 is well defined.
+ */
+#define DEBUG(ch) (debug_flags & (1U << ((ch) & 31)))
+extern unsigned int debug_flags;
+
+RECHEADER *allocrec(RECHEADER *, size_t);
+void	 append(RECHEADER **, int, FILE *, void (*)(const RECHEADER *, FILE *));
+void	 concat(FILE *, FILE *);
+length_t enterkey(RECHEADER *, const u_char *, u_char *, size_t, struct field *);
+void	 fixit(int *, char **, const char *);
+void	 fldreset(struct field *);
+FILE	*ftmp(void);
+void	 fmerge(struct filelist *, int, FILE *, struct field *);
+void	 save_for_merge(FILE *, get_func_t, struct field *);
+void	 merge_sort(FILE *, put_func_t, struct field *);
+void	 fsort(struct filelist *, int, FILE *, struct field *);
+int	 geteasy(FILE *, RECHEADER *, u_char *, struct field *);
+int	 makekey(FILE *, RECHEADER *, u_char *, struct field *);
+int	 makeline(FILE *, RECHEADER *, u_char *, struct field *);
+void	 makeline_copydown(RECHEADER *);
+int	 optval(int, int);
+__dead void	 order(struct filelist *, struct field *);
+void	 putline(const RECHEADER *, FILE *);
+void	 putrec(const RECHEADER *, FILE *);
+void	 putkeydump(const RECHEADER *, FILE *);
+void	 rd_append(int, int, int, FILE *, u_char *, u_char *);
+void	 radix_sort(RECHEADER **, RECHEADER **, int);
+int	 setfield(const char *, struct field *, int);
+void	 settables(void);
diff --git a/usr.bin/sort/tmp.c b/usr.bin/sort/tmp.c
new file mode
100644 index 000000000..155b3f5ac --- /dev/null +++ b/usr.bin/sort/tmp.c @@ -0,0 +1,106 @@ +/* $NetBSD: tmp.c,v 1.16 2009/11/06 18:34:22 joerg Exp $ */ + +/*- + * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Ben Harris and Jaromir Dolecek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+
+__RCSID("$NetBSD: tmp.c,v 1.16 2009/11/06 18:34:22 joerg Exp $");
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "sort.h"
+#include "pathnames.h"
+
+#define _NAME_TMP "sort.XXXXXXXX"
+
+/*
+ * Create a uniquely named temporary file under tmpdir and return it as a
+ * stream opened for update ("w+").
+ *
+ * Unless the 't' debug flag is set, the file is unlinked straight away, so
+ * only the returned stream refers to it.  All signals are blocked from just
+ * before the file is created until after it has been unlinked, presumably
+ * so that a signal cannot leave a named temporary file behind.
+ *
+ * An empty tmpdir gives a name relative to the current directory.
+ * Every failure is fatal: the process exits with status 2.
+ */
+FILE *
+ftmp(void)
+{
+	sigset_t set, oset;
+	FILE *fp;
+	int fd, len;
+	size_t dirlen;
+	char path[MAXPATHLEN];
+
+	/*
+	 * Look at the last byte of tmpdir only when there is one; with an
+	 * empty string the unguarded test would read tmpdir[-1].
+	 */
+	dirlen = strlen(tmpdir);
+	len = snprintf(path, sizeof(path), "%s%s%s", tmpdir,
+	    (dirlen > 0 && tmpdir[dirlen - 1] != '/') ? "/" : "", _NAME_TMP);
+	/* Never hand mkstemp() a silently truncated template. */
+	if (len < 0 || (size_t)len >= sizeof(path))
+		errx(2, "ftmp: temporary file path too long");
+
+	sigfillset(&set);
+	(void)sigprocmask(SIG_BLOCK, &set, &oset);
+	if ((fd = mkstemp(path)) < 0)
+		err(2, "ftmp: mkstemp(\"%s\")", path);
+	if (!(fp = fdopen(fd, "w+")))
+		err(2, "ftmp: fdopen(\"%s\")", path);
+	if (!DEBUG('t'))
+		(void)unlink(path);
+
+	(void)sigprocmask(SIG_SETMASK, &oset, NULL);
+	return (fp);
+}