欢迎您访问程序员文章站,本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页

libxml2中文支持

程序员文章站 2022-04-09 21:42:50
...

  LibXML2自身已经支持中文编码,只是它的所有API处理的数据都是UTF-8编码的,所以只要在读入和写入数据时进行相应的转换即可!

代码1是使用Linux下C API进行编码转换;

代码2由于libxml2已经集成了iconv,直接使用libxml2自带的函数进行编码转换。

/*
compile: gcc -I/usr/include/libxml2 iconv.c -lxml2
input:
    test.xml
        <?xml version="1.0" encoding="gb2312"?>
        <parent>测试</parent>
output:
    测试
*/
1) iconv
#include <stdio.h>
#include <string.h>

#include <arpa/inet.h>
#include <iconv.h>

#include <libxml/parser.h>
#include <libxml/xmlmemory.h>

/**
 * Convert a NUL-terminated string from one character encoding to another
 * using iconv.
 *
 * @encFrom: name of the source encoding, as passed to iconv_open()
 * @encTo:   name of the target encoding, as passed to iconv_open()
 * @in:      NUL-terminated input string in the @encFrom encoding
 *
 * Returns a pointer to an internal static 1024-byte buffer holding the
 * converted string (including its terminator), or NULL if any argument is
 * NULL, the conversion descriptor cannot be opened, or iconv() reports an
 * error (for example when the result does not fit into the buffer).
 *
 * The returned buffer is overwritten by the next call, so the function is
 * not reentrant and the caller must not free the result.
 */
char *Convert(char *encFrom, char *encTo, const char *in)
{
    static char bufout[1024];
    char *src;
    char *dst = bufout;
    /* iconv() takes size_t counters; using int here and casting the address
     * would make iconv read/write past the variables on LP64 platforms. */
    size_t src_left;
    size_t dst_left = sizeof bufout;
    size_t rc;
    iconv_t cd;

    if (encFrom == NULL || encTo == NULL || in == NULL) {
        return NULL;
    }

    cd = iconv_open(encTo, encFrom);
    if (cd == (iconv_t)-1) {
        printf("iconv_open false: %s ==> %s\n", encFrom, encTo);
        return NULL;
    }

    /* Put the descriptor into its initial conversion state. */
    iconv(cd, NULL, NULL, NULL, NULL);

    /* iconv's prototype wants a non-const char **; the cast only satisfies
     * the signature. The terminator is converted too so that the output
     * ends up terminated as well. */
    src = (char *)in;
    src_left = strlen(in) + 1;

    rc = iconv(cd, &src, &src_left, &dst, &dst_left);

    /* Release the descriptor on every path, not only on success. */
    iconv_close(cd);

    if (rc == (size_t)-1) {
        return NULL;
    }
    return bufout;
}

int main(void) {
    xmlDocPtr doc = NULL;
    xmlNodePtr cur = NULL;

    doc = xmlParseFile("test.xml");
    cur = xmlDocGetRootElement(doc);
    printf("%s\n", (char *)xmlNodeGetContent(cur));
    //printf(%s\n", Convert("utf-8", "gb2312", (char*)xmlNodeGetContent(cur)));
}

2) xmlFindCharEncodingHandler

使用数据类型: xmlCharEncodingHandlerPtr

/*******************************************
* compile: gcc -I/usr/include/libxml2/ convert.c -lxml2 -o convert
* usage: convert utf-8 string or null
* input: ./convert 测试
* output:
    [[email protected] test]$ ./convert 测试
    ISO-8859-1: 
        虏芒脢脭
<?xml version="1.0" encoding="ISO-8859-1"?>
<root>测试</root>
**************************************/

#include <libxml/encoding.h>
/**
 * function name: ConvertInput
 * input:
 *   @in: NUL-terminated string in a given encoding
 *   @encoding: name of the encoding @in is expressed in
 * description: Converts @in into UTF-8 for processing with libxml2 APIs.
 *   The output buffer is sized at two bytes per input byte plus a
 *   terminator; a conversion that does not consume all of @in within that
 *   space is reported as a failure.
 * return: the converted, NUL-terminated UTF-8 string allocated with
 *   xmlMalloc (the caller releases it with xmlFree), or NULL in case of
 *   error.
 **/
unsigned char *ConvertInput(const char *in, const char *encoding)
{
    unsigned char *out = NULL;
    unsigned char *shrunk;
    xmlCharEncodingHandlerPtr handler;
    int in_len;
    int consumed;
    int capacity;
    int out_size;
    int ret;

    if (in == NULL) {
        return NULL;
    }

    handler = xmlFindCharEncodingHandler(encoding);
    if (handler == NULL) {
        printf("ConvertInput: no encoding handler found for '%s'\n",
            encoding ? encoding : "");
        return NULL;
    }

    /* Not every handler provides a direct input callback; calling a NULL
     * function pointer would crash. */
    if (handler->input == NULL) {
        printf("ConvertInput: handler for '%s' has no input function\n",
            encoding ? encoding : "");
        goto done;
    }

    in_len = (int) strlen(in);
    capacity = in_len * 2 + 1;
    out = xmlMalloc((size_t) capacity);
    if (out == NULL) {
        printf("ConvertInput: no mem\n");
        goto done;
    }

    /* Keep one byte back for the terminator so that writing it below is
     * always in bounds, whether or not the later shrink succeeds. */
    out_size = capacity - 1;
    consumed = in_len;
    ret = handler->input(out, &out_size, (const unsigned char *)in, &consumed);
    if (ret < 0) {
        printf("ConvertInput: conversion wasn't successful.\n");
        goto fail;
    }
    if (consumed != in_len) {
        /* Only part of the input was converted. */
        printf("ConvertInput: conversion wasn't successful. converted: %i octets.\n", consumed);
        goto fail;
    }

    out[out_size] = 0; /* null terminating out */

    /* Trim the allocation to the converted length; if the shrink fails the
     * original, larger buffer is still valid and is returned as is. */
    shrunk = xmlRealloc(out, (size_t) out_size + 1);
    if (shrunk != NULL) {
        out = shrunk;
    }
    goto done;

fail:
    xmlFree(out);
    out = NULL;
done:
    /* Release the handler obtained from xmlFindCharEncodingHandler. */
    xmlCharEncCloseFunc(handler);
    return out;
}

/**
 * Convert the first command-line argument from ISO-8859-1 to UTF-8 with
 * ConvertInput, print "<encoding>:<converted text>", then build a document
 * whose <root> element holds the converted text and write it to standard
 * output in that encoding.
 *
 * Returns 0 on success or when only the usage line is printed, 1 if the
 * conversion, document construction or serialization fails.
 */
int main(int argc, char **argv)
{
    const char *encoding = "ISO-8859-1"; /* utf-8, ISO-8859-1 */
    unsigned char *out;
    xmlDocPtr doc = NULL;
    xmlNodePtr rootnode;
    int status = 1;

    if (argc <= 1) {
        printf("Usage: %s content\n", argc > 0 ? argv[0] : "convert");
        return 0;
    }

    out = ConvertInput(argv[1], encoding);
    if (out == NULL) {
        /* ConvertInput has already printed the reason. */
        return 1;
    }
    printf("%s:%s\n", encoding, out);

    doc = xmlNewDoc((const xmlChar *)"1.0");
    if (doc == NULL) {
        goto cleanup;
    }

    rootnode = xmlNewDocNode(doc, NULL, (const xmlChar *)"root", out);
    if (rootnode == NULL) {
        goto cleanup;
    }
    xmlDocSetRootElement(doc, rootnode);

    /* "-" asks libxml2 to write to standard output; a negative result
     * signals a write failure. */
    if (xmlSaveFormatFileEnc("-", doc, encoding, 1) >= 0) {
        status = 0;
    }

cleanup:
    if (doc != NULL) {
        xmlFreeDoc(doc);
    }
    xmlFree(out);
    return status;
}

 

相关标签: linux