/* striptags.c
 * $Id$
 * This program strips the tags from an HTML file.  It's not intelligent
 * about it in the least, so if there's mismatched < and >'s then it
 * will screw up.  If the HTML is okay, then this will work.  This only
 * reads from standard in.  (It's really stupid.)
 */

#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

#define MAXTAGLEN 30
#define NULLCH ((char)0)
#define	NULLP(type) ((type *)0)
#define STREQ(s1,s2) (strcasecmp((s1),(s2)) == 0)

/*
 * parsechar - translate one character entity reference.
 *
 * Called after the '&' has already been consumed from standard input.
 * Reads the entity name and, when it is a recognized reference, writes
 * the character it stands for to standard output.  Names are compared
 * without regard to case.  Recognized names are lt, gt, amp, quot and
 * nbsp (written as an ordinary space).  A reference that is properly
 * terminated by ';' but has an unknown name produces no output.
 *
 * A recognized name that is cut short by end of input is still
 * translated.  Anything else that is not a terminated reference (a
 * stray ampersand as in "AT&T" or "a & b", or a name longer than
 * MAXTAGLEN) is written out literally: the '&' and the characters read
 * so far are emitted and the character that stopped the scan is pushed
 * back, so no input is swallowed while hunting for a ';'.
 */
void parsechar(void) {
	static const struct {
		const char *name;
		char value;
	} entities[] = {
		{ "lt",   '<' },
		{ "gt",   '>' },
		{ "amp",  '&' },
		{ "quot", '"' },
		{ "nbsp", ' ' }
	};
	char name[MAXTAGLEN + 1];
	size_t len = 0;
	size_t i;
	int ch;

	/* Collect name characters; stop at ';', EOF, a full buffer, or the
	   first character that cannot be part of an entity name. */
	while (((ch = getchar()) != EOF) && (ch != ';')) {
		if ((len == MAXTAGLEN) || !(isalnum(ch) || ch == '#'))
			break;
		name[len++] = (char)ch;
	}
	name[len] = NULLCH;

	if ((ch == ';') || (ch == EOF)) {
		for (i = 0; i < sizeof entities / sizeof entities[0]; i++) {
			if (STREQ(name, entities[i].name)) {
				putchar(entities[i].value);
				return;
			}
		}
		/* Well-formed but unknown reference: drop it. */
		if (ch == ';')
			return;
	}

	/* Not a reference after all: pass the text through unchanged and
	   hand the stopping character back to the caller's read loop. */
	putchar('&');
	fputs(name, stdout);
	if (ch != EOF)
		ungetc(ch, stdin);
}

/*
 * parsetag - consume one HTML tag and write its replacement text.
 *
 * Called after the '<' has already been consumed from standard input.
 * Reads through the closing '>' (or end of input), extracts the tag
 * name, and writes the replacement string for that tag to standard
 * output.  Tags that are not in the table produce no output, which is
 * how they get stripped.  Names are compared without regard to case.
 *
 * The tag name ends at the first whitespace character or '/', so
 * attributes are ignored and self-closing forms such as "<br/>" are
 * recognized.  At most MAXTAGLEN characters of the name are kept; a
 * longer name is truncated and therefore matches nothing.
 *
 * Each table entry carries the text written for the opening tag and
 * the text written for the closing ("</name>") tag.  The control bytes
 * in these strings are presumably formatting codes understood by
 * whatever program reads this output; their meaning is not defined in
 * this file.
 */
void parsetag(void) {
	static const struct {
		const char *name;
		const char *open;
		const char *close;
	} tags[] = {
		{ "P",       "\n\n", "\n\n" },
		{ "LI",      "\n\n", "\n\n" },
		{ "BR",      "\n",   "\n"   },
		{ "B",       "\002", "\016" },
		{ "I",       "\005", "\016" },
		{ "SUB",     "\021", "\016" },
		{ "SUP",     "\023", "\016" },
		{ "U",       "\026", "\016" },
		{ "EM",      "\005", "\016" },
		{ "STRONG",  "\002", "\016" },
		{ "TT",      "\024", "\016" },
		{ "CODE",    "\024", "\016" },
		{ "BIG",     "\001", "\016" },
		{ "SMALL",   "\017SIZE 0.7\016", "\017ENDS\016" },
		{ "PRE",     "\n\017BEGIN VERBATIM\016",
		             "\n\017END VERBATIM\016" },
		{ "ADDRESS", "\n\017BEGIN ADDRESS\016",
		             "\n\017END ADDRESS\016" },
		{ "UL",      "\n\017BEGIN BULLETS\016",
		             "\n\017END BULLETS\016" },
		{ "OL",      "\n\017BEGIN NUMBERED\016",
		             "\n\017END NUMBERED\016" },
		{ "IMG",     "\n ## IMAGE GOES HERE ## \n",
		             "\n ## IMAGE GOES HERE ## \n" }
	};
	char tagname[MAXTAGLEN + 1];
	size_t len = 0;
	size_t i;
	int in_name = 1;
	int endtag = 0;
	int ch;

	/* The tag name must immediately follow '<', so there is no
	   whitespace to skip; only a leading '/' needs to be checked. */
	ch = getc(stdin);
	if (ch == '/')
		endtag = 1;
	else if (ch != EOF)
		ungetc(ch, stdin);

	/* Read to the end of the tag, keeping only the name part. */
	while (((ch = getc(stdin)) != EOF) && (ch != '>')) {
		if (isspace(ch) || (ch == '/'))
			in_name = 0;
		if (in_name && (len < MAXTAGLEN))
			tagname[len++] = (char)ch;
	}
	tagname[len] = NULLCH;

	for (i = 0; i < sizeof tags / sizeof tags[0]; i++) {
		if (STREQ(tagname, tags[i].name)) {
			fputs(endtag ? tags[i].close : tags[i].open, stdout);
			return;
		}
	}
}

/*
 * main - filter standard input to standard output.
 *
 * Copies input to output one character at a time.  A '<' hands control
 * to parsetag(), a '&' hands control to parsechar(), and each newline
 * is replaced by a single space.  Everything else is copied unchanged.
 * The command-line arguments and environment are not used.
 *
 * Returns 0 on success, or 1 if writing to standard output failed.
 */
int main (int argc, char *argv[], char *env[]) {
	int ch;

	(void)argc;
	(void)argv;
	(void)env;

	while ((ch = getchar()) != EOF) {
		switch (ch) {
		case '<':
			parsetag();
			break;
		case '&':
			parsechar();
			break;
		case '\n':
			putchar(' ');
			break;
		default:
			putchar(ch);
			break;
		}
	}

	/* Flush explicitly so a failed write shows up in the exit status
	   instead of being lost when the stream is closed at exit. */
	if ((fflush(stdout) == EOF) || ferror(stdout))
		return 1;
	return 0;
}

