Thursday, January 10, 2013

Micro patch to download utf16/utf32 with wget


This may be helpful to others. Some pages are stored in wide character encodings (UTF-16 or UTF-32) that confuse wget.
Since I don't have enough time to extend wget properly, I provide a minimalistic patch that worked for the pages I was trying to obtain, and may also work for you with simple web pages that use UTF-16 or UTF-32. It simply strips the zero bytes from the page before it is parsed, so expect it to help only when the content is essentially ASCII.
I provide the patch without warranty, under the GNU GPL v3.

The patch applies to wget_1.12.orig.tar.gz from the Debian package pool at:
http://ftp.de.debian.org/debian/pool/main/w/wget/wget_1.12.orig.tar.gz

Save the patch as long_chars.patch
Download wget_1.12.orig.tar.gz from the URL above
tar xfz wget_1.12.orig.tar.gz
cd wget-1.12
patch -p1 < long_chars.patch
./configure
make
(remember to uninstall any previously installed wget before running the install step)
make install

#The patch starts below:


--- wget-1.12/src/html-parse.c  2009-09-04 12:31:54.000000000 -0400
+++ wget-1.12-patched_clean/src/html-parse.c    2013-01-10 17:08:19.000000000 -0500
@@ -811,6 +811,34 @@
 static int tag_backout_count;
 #endif

+int count_zeroes( const char *ptr, int size){ /* number of zero bytes in ptr[0..size) */
+  int i, n=0;
+  for (i = 0; i < size; i++){
+    if (ptr[i] == 0) /* index each byte; testing *ptr alone never moves past byte 0 */
+      n++;
+  }
+  return n;
+}
+
+void copy_skip_zeroes( const char *origin, char* destination, int size){
+  /* Pack the non-zero bytes of origin[0..size) into destination, in order. */
+  int src, dst = 0;
+  for (src = 0; src < size; src++){
+    if (origin[src] != 0)
+      destination[dst++] = origin[src];
+  }
+  /* No terminator is appended; destination must hold every non-zero byte. */
+}
+
+const char * suppress_zeros(const char *ptr, int size, int *converted_size)
+{ /* malloc'd copy of ptr[0..size) minus its zero bytes (caller owns it); NULL on failure */
+  int n = count_zeroes(ptr,size);
+  char *suppressed = malloc(size > n ? size - n : 1); /* never request 0 bytes */
+  *converted_size = suppressed ? size - n : 0;
+  if (suppressed) copy_skip_zeroes( ptr, suppressed, size); /* scan ALL size input bytes */
+  return suppressed;
+}
+
 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
    MAPFUN will be called with two arguments: pointer to an initialized
    struct taginfo, and MAPARG.
@@ -836,9 +864,10 @@
      too little, POOL_APPEND allocates more with malloc. */
   char pool_initial_storage[256];
   struct pool pool;
-
-  const char *p = text;
-  const char *end = text + size;
+  int converted_size = size; /* overwritten with the length of the zero-stripped copy */
+  char * converted_text = (char *) suppress_zeros(text, size, &converted_size); /* heap copy */
+  const char *p = converted_text; /* NOTE(review): no free() of converted_text is visible here */
+  const char *end = converted_text + converted_size; /* the copy is shorter than SIZE */

   struct attr_pair attr_pair_initial_storage[8];
   int attr_pair_size = countof (attr_pair_initial_storage);