Micro patch to download utf16/utf32 with wget
This may be helpful: some pages are stored in wide character encodings (UTF-16 or UTF-32) that confuse wget.
Since I don't have enough time to extend wget properly, I provide a minimal patch that simply drops every zero byte from the page before it is parsed. It worked for the pages I was trying to obtain, and may also work for you on simple web pages that use UTF-16 or UTF-32.
I provide the patch with no warranty, under the GNU GPL v3.
The patch can be applied to wget_1.12.orig.tar.gz from the debian packages at:
http://ftp.de.debian.org/debian/pool/main/w/wget/wget_1.12.orig.tar.gz
Save the patch as long_chars.patch
download wget_1.12.orig.tar.gz
tar xfz wget_1.12.orig.tar.gz
cd wget-1.12
patch -p1 < long_chars.patch
./configure
make
(remember to uninstall any existing wget before installing)
make install
#The patch starts below:
--- wget-1.12/src/html-parse.c 2009-09-04 12:31:54.000000000 -0400
+++ wget-1.12-patched_clean/src/html-parse.c 2013-01-10 17:08:19.000000000 -0500
@@ -811,6 +811,34 @@
static int tag_backout_count;
#endif
+/* Return how many of the SIZE bytes starting at PTR are zero (0 when SIZE <= 0). */
+int count_zeroes( const char *ptr, int size){
+  int i, n = 0;
+  for (i = 0; i < size; i++){
+    if (ptr[i] == 0) n++;
+  }
+  return n;
+}
+
+/* Copy the non-zero bytes among the first SIZE bytes of ORIGIN to DESTINATION,
+   packing them from its start; DESTINATION needs room for each byte copied. */
+void copy_skip_zeroes( const char *origin, char* destination, int size){
+  for (; size > 0; size--, origin++){
+    if (*origin){
+      *destination++ = *origin;
+    }
+  }
+}
+
+/* Return a malloc'd copy (caller frees) of the SIZE bytes at PTR with zero bytes removed. */
+const char * suppress_zeros(const char *ptr, int size, int *converted_size){
+  int kept = size - count_zeroes(ptr, size);   /* bytes that survive */
+  char *suppressed = (char *)malloc(kept + 1); /* +1: never a 0-byte request */
+  if (suppressed) copy_skip_zeroes( ptr, suppressed, size); /* scan ALL input bytes */
+  *converted_size = suppressed ? kept : 0;     /* NULL result => length 0 */
+  return suppressed;
+}
+
/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
MAPFUN will be called with two arguments: pointer to an initialized
struct taginfo, and MAPARG.
@@ -836,9 +864,10 @@
too little, POOL_APPEND allocates more with malloc. */
char pool_initial_storage[256];
struct pool pool;
-
- const char *p = text;
- const char *end = text + size;
+ int converted_size = size;
+ char * converted_text = suppress_zeros(text, size, &converted_size);
+ const char *p = converted_text;
+ const char *end = converted_text + converted_size; /* buffer holds converted_size bytes, not size */
struct attr_pair attr_pair_initial_storage[8];
int attr_pair_size = countof (attr_pair_initial_storage);
0 Comments:
Post a Comment
<< Home