/* Source: busybox, path root/coreutils/wc.c (blob de3c895bd90ef4422f66b395aff0d19607a2e597) */
/* vi: set sw=4 ts=4: */
/*
 * wc implementation for busybox
 *
 * Copyright (C) 2003  Manuel Novoa III  <mjn3@codepoet.org>
 *
 * Licensed under GPLv2 or later, see file LICENSE in this tarball for details.
 */

/* BB_AUDIT SUSv3 _NOT_ compliant -- option -m is not currently supported. */
/* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */

/* Mar 16, 2003      Manuel Novoa III   (mjn3@codepoet.org)
 *
 * Rewritten to fix a number of problems and do some size optimizations.
 * Problems in the previous busybox implementation (besides bloat) included:
 *  1) broken 'wc -c' optimization (read note below)
 *  2) broken handling of '-' args
 *  3) no checking of ferror on EOF returns
 *  4) isprint() wasn't considered when word counting.
 *
 * TODO:
 *
 * When locale support is enabled, count multibyte chars in the '-m' case.
 *
 * NOTES:
 *
 * The previous busybox wc attempted an optimization using stat for the
 * case of counting chars only.  I omitted that because it was broken.
 * It didn't take into account the possibility of input coming from a
 * pipe, or input from a file with file pointer not at the beginning.
 *
 * To implement such a speed optimization correctly, not only do you
 * need the size, but also the file position.  Note also that the
 * file position may be past the end of file.  Consider the example
 * (adapted from example in gnu wc.c)
 *
 *      echo hello > /tmp/testfile &&
 *      (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile
 *
 * for which 'wc -c' should output '0'.
 */

#include "libbb.h"

/*
 * Character classification for the counting loop.
 *
 * Without locale support the ctype macros are replaced by fixed
 * range checks.  Casting the difference to unsigned lets a single
 * comparison reject values below the range as well as above it
 * (a negative difference wraps to a huge unsigned value).
 */
#if !ENABLE_LOCALE_SUPPORT
# undef isprint
# undef isspace
/* Printable: 0x20 through 0x7e inclusive. */
# define isprint(c) ((unsigned)((c) - 0x20) <= (0x7eu - 0x20u))
/* Whitespace: the space character, or any code from 9 through 13. */
# define isspace(c) ((c) == ' ' || (unsigned)((c) - 9) <= (13u - 9u))
/* Once isprint(c) holds, codes 9..13 are excluded, so only ' ' remains. */
# define isspace_given_isprint(c) ((c) == ' ')
#else
/* With locale support, defer to the library's isspace(). */
# define isspace_given_isprint(c) isspace(c)
#endif

/*
 * Counter type and the printf conversion that matches it.
 * COUNT_FMT deliberately omits the '%' so callers can splice in a
 * field width, e.g. " %9"COUNT_FMT.
 */
#if !ENABLE_FEATURE_WC_LARGE
# define COUNT_FMT "u"
# define COUNT_T unsigned
#else
# define COUNT_FMT "llu"
# define COUNT_T unsigned long long
#endif

/*
 * Slot numbers for the counts[] and totals[] arrays in wc_main().
 * The same numbers are used there as bit positions in print_type.
 */
enum {
	WC_LINES,	/* 0: incremented for each '\n' read */
	WC_WORDS,	/* 1: incremented each time a word ends */
	WC_CHARS,	/* 2: incremented for each character read */
	WC_LENGTH	/* 3: largest line position reached */
};

/*
 * wc applet entry point.
 *
 * Parses options with getopt32(argv, "lwcL"); bit N of the result selects
 * counts[N] for printing (presumably the bits follow the order of the
 * option string, i.e. -l, -w, -c, -L -- confirm against getopt32()).
 * With no option bits set, lines, words and chars are printed.
 *
 * Every remaining argument is opened with fopen_or_warn_stdin(), read one
 * character at a time, and reported as one output line.  When there are no
 * arguments a single pseudo-argument (bb_msg_standard_input) is processed
 * and no name is printed.  If more than one argument was processed, a
 * final "total" line follows.
 *
 * argc is unused.  The function finishes by calling
 * fflush_stdout_and_exit(status); status is EXIT_FAILURE if any open
 * failed or any stream reported ferror(), otherwise EXIT_SUCCESS.
 */
int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
int wc_main(int argc ATTRIBUTE_UNUSED, char **argv)
{
	FILE *fp;
	const char *s, *arg;
	/* The "+ 1" skips the literal's leading space, so the first column
	 * is printed with "%9..." and no separator in front of it. */
	const char *start_fmt = " %9"COUNT_FMT + 1;
	const char *fname_fmt = " %s\n";
	/* Array printed at OUTPUT: counts[] per file, totals[] for the last line. */
	COUNT_T *pcounts;
	COUNT_T counts[4];
	COUNT_T totals[4];
	/* Current column on the line being read; feeds counts[WC_LENGTH]. */
	unsigned linepos;
	unsigned u;
	int num_files = 0;
	int c;
	smallint status = EXIT_SUCCESS;
	/* 1 while inside a word, 0 otherwise; added directly to the word count. */
	smallint in_word;
	unsigned print_type;

	print_type = getopt32(argv, "lwcL");

	/* No selection given: default to lines, words and chars. */
	if (print_type == 0) {
		print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_CHARS);
	}

	argv += optind;
	if (!argv[0]) {
		/* No operands: overwrite the argv slot just before the NULL so
		 * the loop below sees exactly one argument. */
		*--argv = (char *) bb_msg_standard_input;
		/* Suppress the name column. */
		fname_fmt = "\n";
		/* A single set bit means a single number is printed: drop the
		 * field width so it comes out unpadded. */
		if (!((print_type-1) & print_type)) /* exactly one option? */
			start_fmt = "%"COUNT_FMT;
	}

	memset(totals, 0, sizeof(totals));

	pcounts = counts;

	while ((arg = *argv++) != 0) {
		++num_files;
		fp = fopen_or_warn_stdin(arg);
		if (!fp) {
			/* Open failed: record the failure and move on to the next
			 * argument without printing a line for this one. */
			status = EXIT_FAILURE;
			continue;
		}

		memset(counts, 0, sizeof(counts));
		linepos = 0;
		in_word = 0;

		do {
			/* Our -w doesn't match GNU wc exactly... oh well */

			/* Counted before the read; undone below if the read hits EOF. */
			++counts[WC_CHARS];
			c = getc(fp);
			if (isprint(c)) {
				++linepos;
				if (!isspace_given_isprint(c)) {
					/* Printable non-space: we are inside a word. */
					in_word = 1;
					continue;
				}
				/* Printable space: fall through to end the current word. */
			} else if (((unsigned int)(c - 9)) <= 4) {
				/* \t  9
				 * \n 10
				 * \v 11
				 * \f 12
				 * \r 13
				 */
				if (c == '\t') {
					/* Advance to the next multiple of 8. */
					linepos = (linepos | 7) + 1;
				} else {			/* '\n', '\r', '\f', or '\v' */
				DO_EOF:
					/* Also entered by goto from the EOF branch below. */
					if (linepos > counts[WC_LENGTH]) {
						counts[WC_LENGTH] = linepos;
					}
					if (c == '\n') {
						++counts[WC_LINES];
					}
					/* '\v' keeps the current column; everything else
					 * (including EOF) resets it. */
					if (c != '\v') {
						linepos = 0;
					}
				}
			} else if (c == EOF) {
				if (ferror(fp)) {
					bb_simple_perror_msg(arg);
					status = EXIT_FAILURE;
				}
				/* EOF is not a character: take back the increment made
				 * before getc(). */
				--counts[WC_CHARS];
				goto DO_EOF;		/* Treat an EOF as '\r'. */
			} else {
				/* Any other value is counted as a character only; it
				 * changes neither the column nor the word state. */
				continue;
			}

			/* Reached only after whitespace or EOF: close the word, if any. */
			counts[WC_WORDS] += in_word;
			in_word = 0;
			if (c == EOF) {
				break;
			}
		} while (1);

		/* totals[WC_LENGTH] must end up as a maximum, not a sum.  Raise it
		 * to the new maximum, then subtract this file's length so that the
		 * unconditional "totals[u] += pcounts[u]" in the output loop below
		 * brings it back to that maximum. */
		if (totals[WC_LENGTH] < counts[WC_LENGTH]) {
			totals[WC_LENGTH] = counts[WC_LENGTH];
		}
		totals[WC_LENGTH] -= counts[WC_LENGTH];

		fclose_if_not_stdin(fp);

	OUTPUT:
		/* Also entered by goto from after the loop, to print the totals. */
		/* coreutils wc tries hard to print pretty columns
		 * (saves results for all files, find max col len etc...)
		 * we won't try that hard, it will bloat us too much */
		s = start_fmt;
		u = 0;
		do {
			if (print_type & (1 << u)) {
				printf(s, pcounts[u]);
				s = " %9"COUNT_FMT; /* Ok... restore the leading space. */
			}
			/* Accumulate every counter, whether printed or not. */
			totals[u] += pcounts[u];
		} while (++u < 4);
		printf(fname_fmt, arg);
	}

	/* If more than one file was processed, we want the totals.  To save some
	 * space, we set the pcounts ptr to the totals array.  This has the side
	 * effect of trashing the totals array after outputting it, but that's
	 * irrelevant since we no longer need it. */
	if (num_files > 1) {
		num_files = 0;				/* Make sure we don't get here again. */
		arg = "total";
		pcounts = totals;
		/* The loop's final "*argv++" stepped past the NULL terminator; step
		 * back so the while condition reads that NULL again and exits
		 * right after the totals line is printed. */
		--argv;
		goto OUTPUT;
	}

	fflush_stdout_and_exit(status);
}