/* striptags.c
 * $Id: striptags.c,v 1.3 2004/05/03 02:57:15 perette Exp $
 * This program strips the tags from an HTML file.  It's not intelligent
 * about it in the least, so if there's mismatched < and >'s then it
 * will screw up.  If the HTML is okay, then this will work.
 */

#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

/* Capacity, in characters, of the name buffers (terminating NUL excluded). */
#define MAXTAGLEN 30
/* The string terminator, typed as char. */
#define NULLCH ((char)0)
/* Non-zero when s1 and s2 are equal, ignoring letter case. */
#define STREQ(s1,s2) (strcasecmp((s1),(s2)) == 0)

/*
 * parsechar - handle the text that follows an '&' already read from stdin.
 *
 * A well-formed reference ("name;") is consumed through its ';'.  The
 * names lt, gt, amp and nbsp (compared without regard to case) are
 * written to stdout as '<', '>', '&' and ' '; any other well-formed
 * reference is dropped.
 *
 * If the text cannot be a reference -- a character other than a letter,
 * a digit or '#' turns up before the ';', or the name grows past
 * ENTNAME_MAX characters -- the '&' was just an ordinary ampersand.  The
 * offending character is pushed back onto stdin, and the '&' plus
 * whatever was consumed is written out unchanged, so no text is lost.
 *
 * Returns 1 when the input was handled, 0 when EOF arrived before the
 * reference was terminated.
 */
int parsechar(void) {
	static const struct {
		const char *name;
		char        ch;
	} entities[] = {
		{ "lt",   '<' },
		{ "gt",   '>' },
		{ "amp",  '&' },
		{ "nbsp", ' ' },
	};
	enum { ENTNAME_MAX = 30 };	/* longest reference name accepted */
	char name[ENTNAME_MAX + 1];
	size_t len = 0;
	size_t i;
	int ch;

	while (((ch = getchar()) != EOF) && (ch != ';')) {
		if (len >= ENTNAME_MAX || !(isalnum(ch) || ch == '#')) {
			/* Not a reference after all: give back the character
			   we stopped on (it may be a '<' the caller needs to
			   see) and pass the consumed text through verbatim. */
			ungetc(ch, stdin);
			putchar('&');
			fwrite(name, 1, len, stdout);
			return (1);
		}
		name[len++] = (char)ch;
	}
	if (ch == EOF)
		return (0);
	name[len] = '\0';

	for (i = 0; i < sizeof entities / sizeof entities[0]; i++) {
		if (strcasecmp(name, entities[i].name) == 0) {
			putchar(entities[i].ch);
			break;
		}
	}
	return (1);
}


/*
 * skipcomment - consume the rest of an HTML comment from stdin.
 *
 * Called once "<!--" has been read.  Reads up to and including the
 * closing "-->".  Returns 1 if the comment was closed, 0 on EOF.
 */
static int skipcomment(void) {
	/* The two dashes of the opener count, so "<!-->" and "<!--->"
	   close the comment immediately. */
	int dashes = 2;
	int ch;

	while ((ch = getchar()) != EOF) {
		if (ch == '-')
			dashes++;
		else if (ch == '>' && dashes >= 2)
			return (1);
		else
			dashes = 0;
	}
	return (0);
}

/*
 * parsetag - consume one tag from stdin; the '<' has already been read.
 *
 * An ordinary tag runs to the next '>'.  A comment ("<!-- ... -->") runs
 * to its closing "-->", so any '>' or markup inside the comment does not
 * end it early and leak comment text into the output.
 *
 * Every tag or comment is replaced by a single space on stdout; a tag
 * whose name is "p" (any case) is additionally preceded by two newlines.
 * Only the first MAXTAGLEN characters of the name are kept, and the name
 * ends at the first whitespace character.
 *
 * Returns 1 if the tag or comment was properly closed, 0 if EOF was hit
 * first.
 */
int parsetag(void) {
	char tagname [MAXTAGLEN + 1];
	int pos = 0;
	int eot = 0;		/* set once the tag name has ended */
	int closed;
	int ch;

	/* HTML spec says tag must immediately follow <, so we
	   don't need to skip whitespace */
	while (((ch = getchar()) != EOF) && (ch != '>')) {
		if (isspace(ch))
			eot = 1;
		if ((!eot) && (pos < MAXTAGLEN)) {
			tagname[pos++] = (char)ch;
			if (pos == 3 && memcmp(tagname, "!--", 3) == 0) {
				closed = skipcomment();
				putchar(' ');
				return (closed);
			}
		}
	}
	tagname[pos] = NULLCH;
	if (STREQ(tagname, "P")) {
		putchar('\n');
		putchar('\n');
	}
	putchar(' ');
	return (ch == '>');
}

/*
 * strip - filter stdin to stdout until EOF.
 *
 * A '<' hands control to parsetag() and an '&' to parsechar(); every
 * other character is copied through unchanged.  When either helper
 * reports a fault, a message prefixed with `file` is written to stderr
 * and processing continues.
 *
 * file: name used only to label the diagnostics.
 * Returns 1 if no fault was reported, 0 otherwise.
 */
int strip (const char *file) {
	int ok = 1;
	int c;

	for (;;) {
		c = getchar();
		if (c == EOF)
			break;

		switch (c) {
		case '<':
			if (parsetag() == 0) {
				fprintf(stderr, "%s: Faulty tag found.\n", file);
				ok = 0;
			}
			break;
		case '&':
			if (parsechar() == 0) {
				fprintf(stderr, "%s: Faulty &char; found.\n", file);
				ok = 0;
			}
			break;
		default:
			putchar(c);
			break;
		}
	}
	return ok;
}



/*
 * main - run strip() over each file named on the command line, or over
 * standard input when no file is named.
 *
 * Each named file is attached to stdin with freopen(); a file that
 * cannot be opened is reported with perror() and skipped.
 *
 * Exit status is 0 when every input was opened and stripped without a
 * fault, 1 otherwise.
 */
int main (int argc, const char *argv[], const char *env[]) {
	int rc = 0;
	const char **arg;

	for (arg = argv + 1; *arg != NULL; arg++) {
		if (freopen(*arg, "r", stdin) == NULL) {
			perror(*arg);
			rc = 1;
			continue;
		}
		if (!strip(*arg))
			rc = 1;
	}

	/* No file arguments at all: filter whatever stdin already is. */
	if (argc == 1 && !strip("stdin"))
		rc = 1;

	return rc;
}




