blob: e6596adc2079e70861ffdf14418a423e267b4cb0 [file] [log] [blame]
/*
* HTML Entity & Encoding normalization.
*
* Copyright (C) 2006 Török Edvin <edwin@clamav.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
*/
#ifndef _ENTITIES_H
#define _ENTITIES_H
#include "cltypes.h"
#include "hashtab.h"
/*
 * Encoding-name string constants, each typed as const unsigned char*.
 *
 * Every expansion is wrapped in parentheses so that the cast applies to the
 * whole string literal wherever the macro is used.  Without them a postfix
 * use such as UTF8[0] parses as (const unsigned char*)("UTF-8"[0]), i.e. a
 * character converted to a pointer, because [] binds tighter than a cast.
 *
 * NOTE(review): the numeric suffixes (1234, 4321, 2143, 3412) presumably
 * name byte orders -- confirm against the detection code.
 */
#define UCS4_1234 ((const unsigned char*)"UCS-4LE")
#define UCS4_4321 ((const unsigned char*)"UCS-4BE")
#define UCS4_2143 ((const unsigned char*)"UCS4")
#define UCS4_3412 ((const unsigned char*)"UCS-4")
#define UTF16_BE ((const unsigned char*)"UTF-16BE")
#define UTF16_LE ((const unsigned char*)"UTF-16LE")
#define UTF8 ((const unsigned char*)"UTF-8")
/* The UNDECIDED_* names are aliases for the constants defined above. */
#define UNDECIDED_32_1234 UCS4_1234
#define UNDECIDED_32_4321 UCS4_4321
#define UNDECIDED_32_2143 UCS4_2143
#define UNDECIDED_32_3412 UCS4_3412
#define UNDECIDED_16_BE UTF16_BE
#define UNDECIDED_16_LE UTF16_LE
#define UNDECIDED_8 ((const unsigned char*)"ISO-8859-1")
#define EBCDIC ((const unsigned char*)"EBCDIC-US")
/* UNKNOWN is a name whose first byte is NUL, so it reads as an empty string. */
#define UNKNOWN ((const unsigned char*)"\0")
#define OTHER ((const unsigned char*)"OTHER")
/*
 * Identifies where an encoding declaration came from.
 * Enumerators take the values 0..5 in the order listed.
 * NOTE(review): presumably a later enumerator outranks an earlier one --
 * confirm against the process_encoding_set() implementation.
 */
enum encoding_priority {
	NOPRIO,
	CONTENT_TYPE,
	BOM,
	NOBOM_AUTODETECT,
	XML_CHARSET,
	META
};
/*
 * Numeric identifiers for encodings; enumerators take the values 0..10 in
 * the order listed.
 * NOTE(review): E_UCS4_2134 is spelled "2134" while the matching string
 * macro in this header is UCS4_2143 -- possibly a typo; the name is kept
 * because other files may reference it.
 */
enum encodings {
	E_UCS4,
	E_UTF16,
	E_UCS4_1234,
	E_UCS4_4321,
	E_UCS4_2134,
	E_UCS4_3412,
	E_UTF16_BE,
	E_UTF16_LE,
	E_UTF8,
	E_UNKNOWN,
	E_OTHER
};
/* Size bound for entity text; entity_buff below reserves MAX_ENTITY_SIZE+2 bytes. */
#define MAX_ENTITY_SIZE 22

/*
 * State carried by the entity/encoding converter between calls.
 * NOTE(review): the per-field notes marked "presumably" are inferred from
 * names only -- confirm against the implementation.
 */
struct entity_conv {
	/* Encoding selection. */
	unsigned char *encoding;
	const unsigned char *autodetected;
	enum encoding_priority priority;
	unsigned short encoding_specific; /* sub-encoding, used for ISO */
	const struct hashtable *ht;

	/* Byte-level bookkeeping; presumably BOM detection and partial characters. */
	uint8_t has_bom;
	uint8_t enc_bytes;
	uint8_t bytes_read;
	uint8_t bom_cnt;
	uint32_t partial;
	unsigned char bom[4];

	/* Compiled out: these two members are not part of the struct. */
#if 0
	char *buffer;
	char *buffer2;
#endif

	size_t buffer_size;
	size_t buffer_cnt;

	/* Entity accumulation buffer and its fill count. */
	uint8_t entity_buffcnt;
	char entity_buff[MAX_ENTITY_SIZE + 2];

	/* Work areas (m_area_t is declared outside this header). */
	m_area_t tmp_area;
	m_area_t out_area;
	m_area_t norm_area;

	int msg_zero_shown;
};
/*
 * Entry points of the entity/encoding converter.
 *
 * NOTE(review): only the declarations are visible in this header; the
 * return-value conventions and the ownership of returned buffers must be
 * checked against the implementation.  FILE and size_t are expected to be
 * provided by the headers included at the top of this file -- confirm.
 */

/* Sets up `conv` for the given encoding name and buffer size. */
int init_entity_converter(struct entity_conv *conv,
                          const unsigned char *encoding,
                          size_t buffer_size);

/* Offers an encoding name to `conv` together with the priority of its source. */
void process_encoding_set(struct entity_conv *conv,
                          const unsigned char *encoding,
                          enum encoding_priority priority);

/* Counterpart of init_entity_converter(); presumably releases converter state. */
int entity_norm_done(struct entity_conv *conv);

/* Reads from `stream_in` and/or `in_m_area`, bounded by `maxlen`. */
unsigned char *encoding_norm_readline(struct entity_conv *conv,
                                      FILE *stream_in,
                                      m_area_t *in_m_area,
                                      const size_t maxlen);

/* Normalizes a single entity name using the state in `conv`. */
unsigned char *entity_norm(const struct entity_conv *conv,
                           const unsigned char *entity);

/* One-time module initialisation; takes no arguments. */
int entitynorm_init(void);
#endif