How can I sort things that are too large to bring into memory ?
A sorting program that sorts items that are on secondary storage (disk or tape) rather than primary storage (memory) is called an external sort. Exactly how to sort large data depends on what is meant by “too large to fit in memory.” If the items to be sorted are themselves too large to fit in memory (such as images), but there aren’t many items, you can keep in memory only the sort key and a value indicating the data’s location on disk. After the key/value pairs are sorted, the data is rearranged on disk into the correct order.
If “too large to fit in memory” means that there are too many items to fit into memory at one time, the data can be sorted in groups that will fit into memory, and then the resulting files can be merged. A sort such as a radix sort can also be used as an external sort, by making each bucket in the sort a file.
Even the quick sort can be an external sort. The data can be partitioned by writing it to two smaller files. When the partitions are small enough to fit, they are sorted in memory and concatenated to form the sorted file
The example in Listing III.3 is an external sort. It sorts data in groups of 10,000 strings and writes them to files, which are then merged. If you compare this listing to the listing of the merge sort (Listing III.2b), you will notice many similarities.
Any of the four sort programs introduced so far in this chapter can be used as the in-memory sort algorithm (the makefile given at the end of the chapter specifies using qsort() as shown in Listing III.1). The functions myfgets() and myfputs() simply handle inserting and removing the newline (‘\n’) characters at the ends of lines. The openFile() function handles error conditions during the opening of files, and fileName() generates temporary filenames.
The function split() reads in up to 10,000 lines from the input file on lines 69–74, sorts them in memory on line 76, and writes them to a temporary file on lines 77–80. The function merge() takes two files that are already sorted and merges them into a third file in exactly the same way that the merge() routine in Listing III.2b merged two lists. The function mergePairs() goes through all the temporary files and calls merge() to combine pairs of files into single files, just as mergePairs() in Listing III.2b combines lists. Finally, main() invokes split() on the original file, then calls mergePairs() until all the files are combined into one big file. It then replaces the original unsorted file with the new, sorted file.
An example of an external sorting algorithm.
1: #include <stdlib.h>
2: #include <string.h>
3: #include <stdio.h>
4: #include <stdio.h>
5:
6: #define LINES_PER_FILE 10000
7:
8: /* Just like fgets(), but removes trailing ‘\n’. */
9: char*
10: myfgets(char *buf, size_t size, FILE *fp)
11: {
12: char *s = fgets(buf, size, fp);
13: if (s)
14: s[strlen(s) - 1] = ‘\0’;
15: return s;
16: }
17:
18: /* Just like fputs(), but adds trailing ‘\n’. */
19: void
20: myfputs(char *s, FILE *fp)
21: {
22: int n = strlen(s);
23: s[n] = ‘\n’;
24: fwrite(s, 1, n + 1, fp);
25: s[n] = ‘\0’;
26: }
27:
28: /* Just like fopen(), but prints message and dies if error. */
29: FILE*
30: openFile(const char *name, const char *mode)
31: {
32: FILE *fp = fopen(name, mode);
33:
34: if (fp == NULL)
35: {
36: perror(name);
37: exit(1);
38: }
39: return fp;
40: }
41:
42: /* Takes a number and generates a filename from it. */
43: const char*
44: fileName(int n)
45: {
46: static char name[16];
47:
48: sprintf(name, “temp%d”, n);
49: return name;
50: }
51:
52: /*
53: * Splits input file into sorted files with no more
54: * than LINES_PER_FILE lines each.
55: */
56: int
57: split(FILE *infp)
58: {
59: int nfiles = 0;
60: int line;
61:
62: for (line = LINES_PER_FILE; line == LINES_PER_FILE; )
63: {
64: char *array[LINES_PER_FILE + 1];
65: char buf[1024];
66: int i;
67: FILE *fp;
68:
69: for (line = 0; line < LINES_PER_FILE; line++)
70: {
71: if (!myfgets(buf, sizeof(buf), infp))
72: break;
72: array[line] = strdup(buf);
74: }
75: array[line] = NULL;
76: sortStrings(array);
77: fp = openFile(fileName(nfiles++), “w”);
78: for (i = 0; i < line; i++)
79: myfputs(array[i], fp);
80: fclose(fp);
81: }
82: return nfiles;
83: }
84:
85: /*
86: * Merges two sorted input files into
87: * one sorted output file.
88: */
89: void
90: merge(FILE *outfp, FILE *fp1, FILE *fp2)
36: perror(name);
37: exit(1);
38: }
39: return fp;
40: }
41:
42: /* Takes a number and generates a filename from it. */
43: const char*
44: fileName(int n)
45: {
46: static char name[16];
47:
48: sprintf(name, “temp%d”, n);
49: return name;
50: }
51:
52: /*
53: * Splits input file into sorted files with no more
54: * than LINES_PER_FILE lines each.
55: */
56: int
57: split(FILE *infp)
58: {
59: int nfiles = 0;
60: int line;
61:
62: for (line = LINES_PER_FILE; line == LINES_PER_FILE; )
63: {
64: char *array[LINES_PER_FILE + 1];
65: char buf[1024];
66: int i;
67: FILE *fp;
68:
69: for (line = 0; line < LINES_PER_FILE; line++)
70: {
71: if (!myfgets(buf, sizeof(buf), infp))
72: break;
72: array[line] = strdup(buf);
74: }
75: array[line] = NULL;
76: sortStrings(array);
77: fp = openFile(fileName(nfiles++), “w”);
78: for (i = 0; i < line; i++)
79: myfputs(array[i], fp);
80: fclose(fp);
81: }
82: return nfiles;
83: }
84:
85: /*
86: * Merges two sorted input files into
87: * one sorted output file.
88: */
89: void
90: merge(FILE *outfp, FILE *fp1, FILE *fp2)
91: {
92: char buf1[1024];
93: char buf2[1024];
94: char *first;
95: char *second;
96:
97: first = myfgets(buf1, sizeof(buf1), fp1);
98: second = myfgets(buf2, sizeof(buf2), fp2);
99: while (first && second)
100: {
101: if (strcmp(first, second) > 0)
102: {
103: myfputs(second, outfp);
104: second = myfgets(buf2, sizeof(buf2),
105: fp2);
106: }
107: else
108: {
109: myfputs(first, outfp);
110: first = myfgets(buf1, sizeof(buf1),
111: fp1);
112: }
113: }
114: while (first)
115: {
116: myfputs(first, outfp);
117: first = myfgets(buf1, sizeof(buf1), fp1);
118: }
119: while (second)
120: {
121: myfputs(second, outfp);
122: second = myfgets(buf2, sizeof(buf2), fp2);
123: }
124: }
125:
126: /*
127: * Takes nfiles files and merges pairs of them.
128: * Returns new number of files.
129: */
130: int
131: mergePairs(int nfiles)
132: {
133: int i;
134: int out = 0;
135:
136: for (i = 0; i < nfiles - 1; i += 2)
137: {
138: FILE *temp;
139: FILE *fp1;
140: FILE *fp2;
141: const char *first;
142: const char *second;
143:
144: temp = openFile(“temp”, “w”);
145: fp1 = openFile(fileName(i), “r”);
146: fp2 = openFile(fileName(i + 1), “r”);
147: merge(temp, fp1, fp2);
148: fclose(fp1);
92: char buf1[1024];
93: char buf2[1024];
94: char *first;
95: char *second;
96:
97: first = myfgets(buf1, sizeof(buf1), fp1);
98: second = myfgets(buf2, sizeof(buf2), fp2);
99: while (first && second)
100: {
101: if (strcmp(first, second) > 0)
102: {
103: myfputs(second, outfp);
104: second = myfgets(buf2, sizeof(buf2),
105: fp2);
106: }
107: else
108: {
109: myfputs(first, outfp);
110: first = myfgets(buf1, sizeof(buf1),
111: fp1);
112: }
113: }
114: while (first)
115: {
116: myfputs(first, outfp);
117: first = myfgets(buf1, sizeof(buf1), fp1);
118: }
119: while (second)
120: {
121: myfputs(second, outfp);
122: second = myfgets(buf2, sizeof(buf2), fp2);
123: }
124: }
125:
126: /*
127: * Takes nfiles files and merges pairs of them.
128: * Returns new number of files.
129: */
130: int
131: mergePairs(int nfiles)
132: {
133: int i;
134: int out = 0;
135:
136: for (i = 0; i < nfiles - 1; i += 2)
137: {
138: FILE *temp;
139: FILE *fp1;
140: FILE *fp2;
141: const char *first;
142: const char *second;
143:
144: temp = openFile(“temp”, “w”);
145: fp1 = openFile(fileName(i), “r”);
146: fp2 = openFile(fileName(i + 1), “r”);
147: merge(temp, fp1, fp2);
148: fclose(fp1);
149: fclose(fp2);
150: fclose(temp);
151: unlink(fileName(i));
152: unlink(fileName(i + 1));
153: rename(“temp”, fileName(out++));
154: }
155: if (i < nfiles)
156: {
157: char *tmp = strdup(fileName(i));
158: rename(tmp, fileName(out++));
159: free(tmp);
160: }
161: return out;
162: }
163:
164: int
165: main(int argc, char **argv)
166: {
167: char buf2[1024];
168: int nfiles;
169: int line;
170: int in;
171: int out;
172: FILE *infp;
173:
174: if (argc != 2)
175: {
176: fprintf(stderr, “usage: %s file\n”, argv[0]);
177: exit(1);
178: }
179: infp = openFile(argv[1], “r”);
180: nfiles = split(infp);
181: fclose(infp);
182: while (nfiles > 1)
183: nfiles = mergePairs(nfiles);
184: rename(fileName(0), argv[1]);
185: return 0;
186: }
150: fclose(temp);
151: unlink(fileName(i));
152: unlink(fileName(i + 1));
153: rename(“temp”, fileName(out++));
154: }
155: if (i < nfiles)
156: {
157: char *tmp = strdup(fileName(i));
158: rename(tmp, fileName(out++));
159: free(tmp);
160: }
161: return out;
162: }
163:
164: int
165: main(int argc, char **argv)
166: {
167: char buf2[1024];
168: int nfiles;
169: int line;
170: int in;
171: int out;
172: FILE *infp;
173:
174: if (argc != 2)
175: {
176: fprintf(stderr, “usage: %s file\n”, argv[0]);
177: exit(1);
178: }
179: infp = openFile(argv[1], “r”);
180: nfiles = split(infp);
181: fclose(infp);
182: while (nfiles > 1)
183: nfiles = mergePairs(nfiles);
184: rename(fileName(0), argv[1]);
185: return 0;
186: }
Cross Reference:
III.1: What is the easiest sorting method to use?
III.2: What is the quickest sorting method to use?
III.7: How can I sort a linked list?
No comments:
Post a Comment