List Info

Thread: RE: faster Unicode.xs decode_xs




RE: faster Unicode.xs decode_xs
user name
2007-10-17 11:39:40
 
Here's a revised patch against "Unicode.xs,v 2.3".
It also
includes indentation fixes to return it back to that of
<=2.2.


# %<

--- Unicode.xs-orig	2007-10-10 12:07:35.294849100 +0100
+++ Unicode.xs	2007-10-17 16:47:01.058716500 +0100
 -24,28
+24,28 
     U8 *s = *sp;
     UV v = 0;
     if (s+size > e) {
-    croak("Partial character %c",(char) endian);
+	croak("Partial character %c",(char) endian);
     }
     switch(endian) {
     case 'N':
-    v = *s++;
-    v = (v << 8) | *s++;
+	v = *s++;
+	v = (v << 8) | *s++;
     case 'n':
-    v = (v << 8) | *s++;
-    v = (v << 8) | *s++;
-    break;
+	v = (v << 8) | *s++;
+	v = (v << 8) | *s++;
+	break;
     case 'V':
     case 'v':
-    v |= *s++;
-    v |= (*s++ << 8);
-    if (endian == 'v')
-        break;
-    v |= (*s++ << 16);
-    v |= (*s++ << 24);
-    break;
+	v |= *s++;
+	v |= (*s++ << 8);
+	if (endian == 'v')
+	    break;
+	v |= (*s++ << 16);
+	v |= (*s++ << 24);
+	break;
     default:
-    croak("Unknown endian %c",(char) endian);
-    break;
+	croak("Unknown endian %c",(char) endian);
+	break;
     }
     *sp = s;
     return v;
 -58,25
+58,25 
     switch(endian) {
     case 'v':
     case 'V':
-    d += SvCUR(result);
-    SvCUR_set(result,SvCUR(result)+size);
-    while (size--) {
-        *d++ = (U8)(value & 0xFF);
-        value >>= 8;
-    }
-    break;
+	d += SvCUR(result);
+	SvCUR_set(result,SvCUR(result)+size);
+	while (size--) {
+	    *d++ = (U8)(value & 0xFF);
+	    value >>= 8;
+	}
+	break;
     case 'n':
     case 'N':
-    SvCUR_set(result,SvCUR(result)+size);
-    d += SvCUR(result);
-    while (size--) {
-        *--d = (U8)(value & 0xFF);
-        value >>= 8;
-    }
-    break;
+	SvCUR_set(result,SvCUR(result)+size);
+	d += SvCUR(result);
+	while (size--) {
+	    *--d = (U8)(value & 0xFF);
+	    value >>= 8;
+	}
+	break;
     default:
-    croak("Unknown endian %c",(char) endian);
-    break;
+	croak("Unknown endian %c",(char) endian);
+	break;
     }
 }
 
 -95,123
+95,172 
 CODE:
 {
     U8 endian   = *((U8
*)SvPV_nolen(attr("endian", 6)));
-    int size    =   SvIV(attr("size",   4));
-    int ucs2    = SvTRUE(attr("ucs2",   4));
-    int renewed = SvTRUE(attr("renewed",  7));
+    int size    = SvIV(attr("size", 4));
+    int ucs2    = -1; /* only needed in the event of
surrogate pairs */
     SV *result  = newSVpvn("",0);
+    STRLEN usize = (size > 0 ? size : 1); /* protect
against rogue
size<=0 */
     STRLEN ulen;
+    STRLEN resultbuflen;
+    U8 *resultbuf;
     U8 *s = (U8 *)SvPVbyte(str,ulen);
     U8 *e = (U8 *)SvEND(str);
+    /* Optimise for the common case of being called from
PerlIOEncode_fill()
+       with a 1024 byte buffer (this value comes from
PerlIOEncode_get_base()).
+       In this case the result SV's buffer is only used
temporarily, so
we can
+       afford to allocate the maximum needed and not care
about unused
space.
+     */
+    bool temp_result = (ulen == 1024);
+
     ST(0) = sv_2mortal(result);
     SvUTF8_on(result);
 
     if (!endian && s+size <= e) {
-    UV bom;
-    endian = (size == 4) ? 'N' : 'n';
-    bom = enc_unpack(aTHX_ &s,e,size,endian);
+	UV bom;
+	endian = (size == 4) ? 'N' : 'n';
+	bom = enc_unpack(aTHX_ &s,e,size,endian);
         if (bom != BOM_BE) {
-        if (bom == BOM16LE) {
-        endian = 'v';
-        }
-        else if (bom == BOM32LE) {
-        endian = 'V';
-        }
-        else {
-        croak("%"SVf":Unrecognised BOM
%"UVxf,
+	    if (bom == BOM16LE) {
+		endian = 'v';
+	    }
+	    else if (bom == BOM32LE) {
+		endian = 'V';
+	    }
+	    else {
+		croak("%"SVf":Unrecognised BOM
%"UVxf,
                       *hv_fetch((HV
*)SvRV(obj),"Name",4,0),
-              bom);
-        }
-    }
+		      bom);
+	    }
+	}
 #if 1
-    /* Update endian for next sequence */
-    if (renewed) {
-        hv_store((HV
*)SvRV(obj),"endian",6,newSVpv((char
*)&endian,1),0);
-    }
+	/* Update endian for next sequence */
+	if (SvTRUE(attr("renewed", 7))) {
+	    hv_store((HV
*)SvRV(obj),"endian",6,newSVpv((char
*)&endian,1),0);
+	}
 #endif
     }
+
+    if (temp_result) {
+	resultbuflen = 1 + ulen/usize * UTF8_MAXLEN; /* <14Kb
*/
+    } else {
+	/* Preallocate the buffer assuming 1 utf8 byte per
unicode
character. */
+	resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
+    }
+    resultbuf = (U8 *) SvGROW(result, resultbuflen);
+
     while (s < e && s+size <= e) {
-    UV ord = enc_unpack(aTHX_ &s,e,size,endian);
-    U8 *d;
-    if (issurrogate(ord)) {
-        if (ucs2 || size == 4) {
-        if (check) {
-            croak("%"SVf":no surrogates
allowed %"UVxf,
-              *hv_fetch((HV
*)SvRV(obj),"Name",4,0),
-              ord);
-        }
-        if (s+size <= e) {
+	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
+	U8 *d;
+	if (issurrogate(ord)) {
+	    if (ucs2 == -1) {
+		ucs2 = SvTRUE(attr("ucs2", 4));
+	    }
+	    if (ucs2 || size == 4) {
+		if (check) {
+		    croak("%"SVf":no surrogates allowed
%"UVxf,
+			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
+			  ord);
+		}
+		if (s+size <= e) {
                     /* skip the next one as well */
-            enc_unpack(aTHX_ &s,e,size,endian);
-        }
-        ord = FBCHAR;
-        }
-        else {
-        UV lo;
-        if (!isHiSurrogate(ord)) {
-            if (check) {
-            croak("%"SVf":Malformed HI
surrogate %"UVxf,
-                  *hv_fetch((HV
*)SvRV(obj),"Name",4,0),
-                  ord);
-            }
-            else {
-            ord = FBCHAR;
-            }
-        }
-            else {
-            if (s+size > e) {
-            /* Partial character */
-            s -= size;   /* back up to 1st half */
-            break;       /* And exit loop */
-            }
-            lo = enc_unpack(aTHX_ &s,e,size,endian);
-            if (!isLoSurrogate(lo)){
-            if (check) {
-                croak("%"SVf":Malformed LO
surrogate %"UVxf,
-                  *hv_fetch((HV
*)SvRV(obj),"Name",4,0),
-                  ord);
-            }
-            else {
-                ord = FBCHAR;
-            }
-            }
-            else {
-            ord = 0x10000 + ((ord - 0xD800) << 10) +
(lo - 0xDC00);
-            }
-        }
-        }
-    }
-
-    if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0
&& ord <= 0xFDEF)) {
-        if (check) {
-        croak("%"SVf":Unicode character
%"UVxf" is illegal",
-              *hv_fetch((HV
*)SvRV(obj),"Name",4,0),
-              ord);
-        } else {
-        ord = FBCHAR;
-        }
-    }
-
-    d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
-    d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
-    SvCUR_set(result,d - (U8 *)SvPVX(result));
+		    enc_unpack(aTHX_ &s,e,size,endian);
+		}
+		ord = FBCHAR;
+	    }
+	    else {
+		UV lo;
+		if (!isHiSurrogate(ord)) {
+		    if (check) {
+			croak("%"SVf":Malformed HI surrogate
%"UVxf,
+			      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
+			      ord);
+		    }
+		    else {
+			ord = FBCHAR;
+		    }
+		}
+	        else {
+		    if (s+size > e) {
+			/* Partial character */
+			s -= size;   /* back up to 1st half */
+			break;       /* And exit loop */
+		    }
+		    lo = enc_unpack(aTHX_ &s,e,size,endian);
+		    if (!isLoSurrogate(lo)){
+			if (check) {
+			    croak("%"SVf":Malformed LO surrogate
%"UVxf,
+				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
+				  ord);
+			}
+			else {
+			    ord = FBCHAR;
+			}
+		    }
+		    else {
+			ord = 0x10000 + ((ord - 0xD800) << 10) + (lo -
0xDC00);
+		    }
+		}
+	    }
+	}
+
+	if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0
&& ord <=
0xFDEF)) {
+	    if (check) {
+		croak("%"SVf":Unicode character
%"UVxf" is illegal",
+		      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
+		      ord);
+	    } else {
+		ord = FBCHAR;
+	    }
+	}
+
+	if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
+	    /* Do not allocate >8Mb more than the minimum
needed.
+	       This prevents allocating too much in the rogue case
of a
large
+	       input consisting initially of long sequence
uft8-byte
unicode
+	       chars followed by single utf8-byte chars.
+	     */
+	    STRLEN remaining = (e - s)/usize;
+	    STRLEN max_alloc = remaining + (8*1024*1024);
+	    STRLEN est_alloc = remaining * UTF8_MAXLEN;
+	    STRLEN newlen = SvLEN(result) + /* min(max_alloc,
est_alloc)
*/
+		(est_alloc > max_alloc ? max_alloc : est_alloc);
+	    resultbuf = (U8 *) SvGROW(result, newlen);
+	    resultbuflen = SvLEN(result);
+	}
+
+	d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 0);
+	SvCUR_set(result, d - (U8 *)SvPVX(result));
     }
+
     if (s < e) {
-    /* unlikely to happen because it's fixed-length --
dankogai */
-    if (check & ENCODE_WARN_ON_ERR){
-        Perl_warner(aTHX_
packWARN(WARN_UTF8),"%"SVf":Partial
character",
-            *hv_fetch((HV
*)SvRV(obj),"Name",4,0));
-    }
+	/* unlikely to happen because it's fixed-length --
dankogai */
+	if (check & ENCODE_WARN_ON_ERR){
+	    Perl_warner(aTHX_
packWARN(WARN_UTF8),"%"SVf":Partial
character",
+			*hv_fetch((HV *)SvRV(obj),"Name",4,0));
+	}
     }
     if (check && !(check & ENCODE_LEAVE_SRC)){
-    if (s < e) {
-        Move(s,SvPVX(str),e-s,U8);
-        SvCUR_set(str,(e-s));
-    }
-    else {
-        SvCUR_set(str,0);
+	if (s < e) {
+	    Move(s,SvPVX(str),e-s,U8);
+	    SvCUR_set(str,(e-s));
+	}
+	else {
+	    SvCUR_set(str,0);
+	}
+	*SvEND(str) = '';
     }
-    *SvEND(str) = '';
+
+    /* Avoid wasting too much space in the result buffer
*/
+    if (!temp_result && (SvLEN(result) > 42 +
SvCUR(result))) {
+	char *buf;
+	STRLEN datalen = 1 + SvCUR(result); /* include the NUL
byte */
+	STRLEN buflen = PERL_STRLEN_ROUNDUP(datalen);
+	Newx(buf, buflen, char);
+	Copy(SvPVX(result), buf, datalen, char);
+	Safefree(SvPVX(result));
+	SvPV_set(result, buf);
+	SvLEN_set(result, buflen);
     }
+
     XSRETURN(1);
 }
 
 -232,65
+281,64 
     U8 *e = (U8 *)SvEND(utf8);
     ST(0) = sv_2mortal(result);
     if (!endian) {
-    endian = (size == 4) ? 'N' : 'n';
-    enc_pack(aTHX_ result,size,endian,BOM_BE);
+	endian = (size == 4) ? 'N' : 'n';
+	enc_pack(aTHX_ result,size,endian,BOM_BE);
 #if 1
-    /* Update endian for next sequence */
-    if (renewed){
-        hv_store((HV
*)SvRV(obj),"endian",6,newSVpv((char
*)&endian,1),0);
-    }
+	/* Update endian for next sequence */
+	if (renewed){
+	    hv_store((HV
*)SvRV(obj),"endian",6,newSVpv((char
*)&endian,1),0);
+	}
 #endif
     }
     while (s < e && s+UTF8SKIP(s) <= e) {
-    STRLEN len;
-    UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
+	STRLEN len;
+	UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
         s += len;
-    if (size != 4 && invalid_ucs2(ord)) {
-        if (!issurrogate(ord)){
-        if (ucs2) {
-            if (check) {
-            croak("%"SVf":code point
"\x{%"UVxf"}" too high",
-                  *hv_fetch((HV
*)SvRV(obj),"Name",4,0),ord);
-            }
-            enc_pack(aTHX_ result,size,endian,FBCHAR);
-        }else{
-            UV hi = ((ord - 0x10000) >> 10)   +
0xD800;
-            UV lo = ((ord - 0x10000) & 0x3FF) +
0xDC00;
-            enc_pack(aTHX_ result,size,endian,hi);
-            enc_pack(aTHX_ result,size,endian,lo);
-        }
-        }
-        else {
-        /* not supposed to happen */
-        enc_pack(aTHX_ result,size,endian,FBCHAR);
-        }
-    }
-    else {
-        enc_pack(aTHX_ result,size,endian,ord);
-    }
+	if (size != 4 && invalid_ucs2(ord)) {
+	    if (!issurrogate(ord)){
+		if (ucs2) {
+		    if (check) {
+			croak("%"SVf":code point
"\x{%"UVxf"}" too
high",
+				  *hv_fetch((HV
*)SvRV(obj),"Name",4,0),ord);
+		    }
+		    enc_pack(aTHX_ result,size,endian,FBCHAR);
+		}else{
+		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
+		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
+		    enc_pack(aTHX_ result,size,endian,hi);
+		    enc_pack(aTHX_ result,size,endian,lo);
+		}
+	    }
+	    else {
+		/* not supposed to happen */
+		enc_pack(aTHX_ result,size,endian,FBCHAR);
+	    }
+	}
+	else {
+	    enc_pack(aTHX_ result,size,endian,ord);
+	}
     }
     if (s < e) {
-    /* UTF-8 partial char happens often on PerlIO.
-       Since this is okay and normal, we do not warn.
-       But this is critical when you choose to LEAVE_SRC
-       in which case we die */
-    if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){
-        Perl_croak(aTHX_ "%"SVf":partial
character is not allowed "
-               "when CHECK = 0x%" UVuf,
-               *hv_fetch((HV
*)SvRV(obj),"Name",4,0), check);
-    }
-    
+	/* UTF-8 partial char happens often on PerlIO.
+	   Since this is okay and normal, we do not warn.
+	   But this is critical when you choose to LEAVE_SRC
+	   in which case we die */
+	if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){
+	    Perl_croak(aTHX_ "%"SVf":partial
character is not allowed "
+		       "when CHECK = 0x%" UVuf,
+		       *hv_fetch((HV *)SvRV(obj),"Name",4,0),
check);
+	}
+
     }
     if (check && !(check & ENCODE_LEAVE_SRC)){
-    if (s < e) {
-        Move(s,SvPVX(utf8),e-s,U8);
-        SvCUR_set(utf8,(e-s));
-    }
-    else {
-        SvCUR_set(utf8,0);
+	if (s < e) {
+	    Move(s,SvPVX(utf8),e-s,U8);
+	    SvCUR_set(utf8,(e-s));
+	}
+	else {
+	    SvCUR_set(utf8,0);
+	}
+	*SvEND(utf8) = '';
     }
-    *SvEND(utf8) = '';
-    } 
     XSRETURN(1);
 }
-

# %>



Also, here's a timing test script to play with:

# %<
use warnings;
use strict;
use Time::HiRes;
use Encode;
$| =1;
my $f = 'utf16le_data.txt';
unless (-f $f) {
	# create some UTF16
	open F, ">:raw:perlio:encoding(utf16le)", $f
		or die "cannot write $f: $!n";
	my $line = ("a" x 80) . "n";
	for (1..10_000) {	
		print F $line;
	}
	close F;
}
# Time how long to read UTF16 via encoding layer.
open F, "<:raw:perlio:encoding(utf16le)", $f
	or die "cannot open $f: $!n";
my $start = Time::HiRes::time;
for (1..10) {
	seek F, 0, 0 or die "cannot seek: $!n";
	while (<F>) {}
}
print "Took: ", (Time::HiRes::time - $start),
"n";
close F;
# Read UTF16 into buffer to decode
open F, "<", $f
	or die "cannot open $f: $!n";
binmode F;
read F, my $utf16_data, 10_000_000
	or die "can't read: $!n";
close F;
# Time how long to go thro' explicit call to decode()
$start = Time::HiRes::time;
for (1..10) {
	my $utf8 = Encode::decode("utf16le",
$utf16_data);
}
print "Took: ", (Time::HiRes::time - $start),
"n";

# >%


Cheers, alex.

[1]

about | contact  Other archives ( Real Estate discussion Medical topics )