root/releases/0.6rc2/lib/kses.php

Revision 269, 18.9 kB (checked in by ben, 3 years ago)

--

  • Property svn:eol-style set to native
Line 
1 <?php
2
3 # kses 0.2.2 - HTML/XHTML filter that only allows some elements and attributes
4 # Copyright (C) 2002, 2003, 2005  Ulf Harnhammar
5 #
6 # This program is free software and open source software; you can redistribute
7 # it and/or modify it under the terms of the GNU General Public License as
8 # published by the Free Software Foundation; either version 2 of the License,
9 # or (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful, but WITHOUT
12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14 # more details.
15 #
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA  or visit
19 # http://www.gnu.org/licenses/gpl.html
20 #
21 # *** CONTACT INFORMATION ***
22 #
23 # E-mail:      metaur at users dot sourceforge dot net
24 # Web page:    http://sourceforge.net/projects/kses
25 # Paper mail:  Ulf Harnhammar
26 #              Ymergatan 17 C
27 #              753 25  Uppsala
28 #              SWEDEN
29 #
30 # [kses strips evil scripts!]
31
32
function kses($string, $allowed_html, $allowed_protocols =
               array('http', 'https', 'ftp', 'news', 'nntp', 'telnet',
                     'gopher', 'mailto'))
###############################################################################
# Main entry point of the filter.
#
# $string            - the HTML/XHTML text to filter. Slashes added by PHP's
#                      magic quotes must already have been removed.
# $allowed_html      - array of element name => array of attribute name =>
#                      checks (or a non-array value for "no checks").
# $allowed_protocols - URL protocols that may start an attribute value.
#
# Returns the filtered text as produced by kses_split().
###############################################################################
{
  # Clean-up passes; the innermost call runs first: strip NULs, strip
  # Netscape 4 JavaScript entities, normalize entities, then run the hook.
  $cleaned = kses_hook(
               kses_normalize_entities(
                 kses_js_entities(
                   kses_no_null($string))));

  # Element and attribute names are looked up in lower case later on.
  $whitelist = kses_array_lc($allowed_html);

  return kses_split($cleaned, $whitelist, $allowed_protocols);
} # function kses
50
51
function kses_hook($string)
###############################################################################
# Extension point: kses() passes the text through this function after entity
# normalization and before tags are split. The stock implementation hands the
# text back untouched; put custom filtering here.
###############################################################################
{
  $hooked = $string;

  return $hooked;
} # function kses_hook
59
60
function kses_version()
###############################################################################
# Reports the version of kses as a string.
###############################################################################
{
  $version = '0.2.2';

  return $version;
} # function kses_version
68
69
function kses_split($string, $allowed_html, $allowed_protocols)
###############################################################################
# This function searches for HTML tags, no matter how malformed. It also
# matches stray ">" characters. Every match is handed to kses_split2() and
# replaced by whatever that function returns.
#
# $string            - text to scan.
# $allowed_html      - element/attribute whitelist with lower-case keys.
# $allowed_protocols - allowed URL protocols, passed through unchanged.
#
# The original implementation used preg_replace() with the /e modifier, which
# evaluates the replacement as PHP code. That modifier no longer exists in
# PHP 7 and later, so a callback is used instead.
###############################################################################
{
  $tag_pattern = '%(<'.   # EITHER: <
                 '[^>]*'. # things that aren't >
                 '(>|$)'. # > or end of string
                 '|>)%';  # OR: just a >

  return preg_replace_callback(
           $tag_pattern,
           function ($match) use ($allowed_html, $allowed_protocols)
           {
             # kses_split2() starts by turning \" back into " because the /e
             # modifier used to add that backslash. Add it here as well, so
             # that a literal \" in the input survives the round trip.
             $quoted = str_replace('"', '\\"', $match[1]);

             return kses_split2($quoted, $allowed_html, $allowed_protocols);
           },
           $string);
} # function kses_split
84
85
function kses_split2($string, $allowed_html, $allowed_protocols)
###############################################################################
# Handles one match found by kses_split(): either a tag (possibly unclosed)
# or a lone ">".
#
# Returns:
#   "&gt;"  for a lone ">" character,
#   ""      for something too malformed to be a tag (<:::> for instance) or
#           for an element that is not a key of $allowed_html,
#   "</x>"  for an allowed closing tag, with any attributes dropped,
#   otherwise whatever kses_attr() builds from the element and its
#   attribute list.
###############################################################################
{
  $tag = kses_stripslashes($string);

  # Anything not starting with "<" is the stray ">" alternative
  if (substr($tag, 0, 1) != '<')
  {
    return '&gt;';
  }

  # Pieces: optional closing slash, element name, rest of the tag
  $is_tag = preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?$%', $tag, $parts);

  if (!$is_tag)
  {
    return '';
  }

  $closing = trim($parts[1]);
  $name = $parts[2];
  $attributes = $parts[3];

  if (!isset($allowed_html[strtolower($name)]))
  {
    return '';
  }

  # Closing elements never carry attributes
  if ($closing != '')
  {
    return "<$closing$name>";
  }

  return kses_attr($name, $attributes, $allowed_html, $allowed_protocols);
} # function kses_split2
119
120
function kses_attr($element, $attr, $allowed_html, $allowed_protocols)
###############################################################################
# Rebuilds an opening tag so that it only carries whitelisted attributes.
#
# $element           - element name as written in the input.
# $attr              - raw attribute list following the element name.
# $allowed_html      - element/attribute whitelist with lower-case keys.
# $allowed_protocols - allowed URL protocols, handed on to kses_hair().
#
# If the whitelist entry for the element is empty, all attributes are dropped.
# Otherwise kses_hair() parses the list and each attribute is kept when it is
# whitelisted and passes every check configured for it. Any "<" or ">" left in
# the kept attributes is removed, and a trailing XHTML slash is preserved.
###############################################################################
{
  $elem_key = strtolower($element);

  # A slash preceded by whitespace at the very end marks an XHTML empty tag
  $xhtml_slash = preg_match('%\s/\s*$%', $attr) ? ' /' : '';

  # Nothing is allowed for this element: emit the bare tag
  if (@count($allowed_html[$elem_key]) == 0)
  {
    return "<$element$xhtml_slash>";
  }

  $kept = '';

  foreach (kses_hair($attr, $allowed_protocols) as $parsed)
  {
    $attr_key = strtolower($parsed['name']);

    if (!isset($allowed_html[$elem_key][$attr_key]))
    {
      continue; # not a whitelisted attribute
    }

    $checks = $allowed_html[$elem_key][$attr_key];

    # A non-array entry means "allowed without checks"
    $passes = true;

    if (is_array($checks))
    {
      foreach ($checks as $check_name => $check_value)
      {
        if (!kses_check_attr_val($parsed['value'], $parsed['vless'],
                                 $check_name, $check_value))
        {
          $passes = false;
          break;
        }
      }
    }

    if ($passes)
    {
      $kept .= ' '.$parsed['whole'];
    }
  } # foreach

  # No angle brackets may survive inside the rebuilt attribute list
  $kept = preg_replace('/[<>]/', '', $kept);

  return "<$element$kept$xhtml_slash>";
} # function kses_attr
184
185
function kses_hair($attr, $allowed_protocols)
###############################################################################
# Parses an attribute list into an array of attribute records, coping with
# malformed input as well as it can.
#
# Each record is an array with the keys
#   'name'  - attribute name as written,
#   'value' - value after kses_bad_protocol() has been applied ('' when the
#             attribute has no value),
#   'whole' - the attribute rebuilt as text; unquoted values get double
#             quotes added,
#   'vless' - 'y' for a valueless attribute such as "selected", else 'n'.
#
# The parser is a small state machine:
#   state 0 expects an attribute name,
#   state 1 expects "=" or whitespace (whitespace means valueless),
#   state 2 expects a value.
# When a state cannot consume anything, kses_html_error() discards the
# offending piece and parsing restarts in state 0.
###############################################################################
{
  # Value syntaxes tried in state 2, in this order. Each row holds the pattern
  # capturing the value, the pattern that removes it from the input, and the
  # quote character used when the attribute is rebuilt.
  $value_forms = array(
    array('/^"([^"]*)"(\s+|$)/',   '/^"[^"]*"(\s+|$)/',   '"'),  # "value"
    array("/^'([^']*)'(\s+|$)/",   "/^'[^']*'(\s+|$)/",   "'"),  # 'value'
    array("%^([^\s\"']+)(\s+|$)%", "%^[^\s\"']+(\s+|$)%", '"')); # value

  $found = array();
  $state = 0;
  $name = '';

  while (strlen($attr) != 0)
  {
    $progress = false; # did this pass consume anything?

    if ($state == 0)
    {
      # attribute name, href for instance
      if (preg_match('/^([-a-zA-Z]+)/', $attr, $hit))
      {
        $name = $hit[1];
        $state = 1;
        $progress = true;
        $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
      }
    }
    elseif ($state == 1)
    {
      if (preg_match('/^\s*=\s*/', $attr))
      {
        # equals sign: a value follows
        $state = 2;
        $progress = true;
        $attr = preg_replace('/^\s*=\s*/', '', $attr);
      }
      elseif (preg_match('/^\s+/', $attr))
      {
        # whitespace: the attribute was valueless
        $state = 0;
        $progress = true;
        $found[] = array('name'  => $name,
                         'value' => '',
                         'whole' => $name,
                         'vless' => 'y');
        $attr = preg_replace('/^\s+/', '', $attr);
      }
    }
    elseif ($state == 2)
    {
      # attribute value, a URL after href= for instance
      foreach ($value_forms as $form)
      {
        if (!preg_match($form[0], $attr, $hit))
        {
          continue;
        }

        $quote = $form[2];
        $value = kses_bad_protocol($hit[1], $allowed_protocols);

        $found[] = array('name'  => $name,
                         'value' => $value,
                         'whole' => "$name=$quote$value$quote",
                         'vless' => 'n');
        $state = 0;
        $progress = true;
        $attr = preg_replace($form[1], '', $attr);
        break;
      }
    }

    if (!$progress)
    {
      # not well formed: throw the bad piece away and start over
      $attr = kses_html_error($attr);
      $state = 0;
    }
  } # while

  # The list ended right after a name, i.e. with a valueless attribute
  if ($state == 1)
  {
    $found[] = array('name'  => $name,
                     'value' => '',
                     'whole' => $name,
                     'vless' => 'y');
  }

  return $found;
} # function kses_hair
309
310
function kses_check_attr_val($value, $vless, $checkname, $checkvalue)
###############################################################################
# Runs one named check against an attribute and reports whether it passed.
#
# $value      - attribute value.
# $vless      - 'y' if the attribute had no value, 'n' otherwise.
# $checkname  - name of the check (case is ignored).
# $checkvalue - the limit or setting the check compares against.
#
# Checks:
#   maxlen    - the value is not longer than $checkvalue.
#   minlen    - the value is not shorter than $checkvalue.
#   maxval    - the value is 1 to 6 digits with at most 6 whitespace
#               characters on each side, and is not greater than $checkvalue.
#   minval    - same format as maxval, and not smaller than $checkvalue.
#   valueless - $checkvalue ("y"/"n" in any case) must agree with $vless.
#
# A check name that is not listed above always passes.
###############################################################################
{
  $check = strtolower($checkname);
  $small_int = '/^\s{0,6}[0-9]{1,6}\s{0,6}$/';

  if ($check === 'maxlen')
  {
    return !(strlen($value) > $checkvalue);
  }

  if ($check === 'minlen')
  {
    return !(strlen($value) < $checkvalue);
  }

  if ($check === 'maxval')
  {
    if (!preg_match($small_int, $value))
    {
      return false;
    }

    return !($value > $checkvalue);
  }

  if ($check === 'minval')
  {
    if (!preg_match($small_int, $value))
    {
      return false;
    }

    return !($value < $checkvalue);
  }

  if ($check === 'valueless')
  {
    return strtolower($checkvalue) == $vless;
  }

  return true;
} # function kses_check_attr_val
375
376
function kses_bad_protocol($string, $allowed_protocols)
###############################################################################
# Strips every non-allowed protocol from the start of $string.
#
# NUL characters and \xad bytes are removed first. Then
# kses_bad_protocol_once() is applied over and over until the text stops
# changing, so stacked protocols such as "javascript:javascript:alert(57)"
# are dealt with as well.
###############################################################################
{
  $string = kses_no_null($string);
  $string = preg_replace('/\xad+/', '', $string); # deals with Opera "feature"

  do
  {
    $before = $string;
    $string = kses_bad_protocol_once($before, $allowed_protocols);
  }
  while ($string != $before);

  return $string;
} # function kses_bad_protocol
397
398
function kses_no_null($string)
###############################################################################
# Removes NUL characters from $string, and also the two-character text
# backslash-zero. Returns the cleaned string.
###############################################################################
{
  # preg_replace() applies the patterns one after the other, in array order:
  # first runs of NUL characters, then runs of a backslash followed by "0".
  $null_patterns = array('/\0+/', '/(\\\\0)+/');

  return preg_replace($null_patterns, '', $string);
} # function kses_no_null
409
410
function kses_stripslashes($string)
###############################################################################
# Turns every backslash-doublequote pair into a plain doublequote and leaves
# all other backslashes as they are. It exists to undo the quoting that
# preg_replace() with the /e modifier applied to matched text.
###############################################################################
{
  return str_replace('\\"', '"', $string);
} # function kses_stripslashes
420
421
function kses_array_lc($inarray)
###############################################################################
# Returns a copy of a two-level array in which the keys of both levels have
# been converted to lower case. The values of the inner level are copied
# unchanged.
###############################################################################
{
  $lowered = array();

  foreach ($inarray as $element => $attributes)
  {
    $inner = array();

    foreach ($attributes as $attr_name => $checks)
    {
      $inner[strtolower($attr_name)] = $checks;
    }

    $lowered[strtolower($element)] = $inner;
  }

  return $lowered;
} # function kses_array_lc
443
444
function kses_js_entities($string)
###############################################################################
# Deletes Netscape 4 style JavaScript entities from $string: an ampersand,
# optional whitespace and a brace-enclosed body, up to the closing brace
# (plus optional whitespace and semicolon) or up to the end of the string
# when the brace is never closed.
###############################################################################
{
  $js_entity = '%&\s*\{[^}]*(\}\s*;?|$)%';

  return preg_replace($js_entity, '', $string);
} # function kses_js_entities
453
454
function kses_html_error($string)
###############################################################################
# Error recovery for kses_hair(). Cuts away the leading run of quoted
# strings (closed or running to the end of the text), apostrophe-quoted
# strings and non-whitespace characters, together with the whitespace that
# follows it, and returns what is left.
###############################################################################
{
  $bad_piece = '/^("[^"]*("|$)|\'[^\']*(\'|$)|\S)*\s*/';

  return preg_replace($bad_piece, '', $string);
} # function kses_html_error
464
465
function kses_bad_protocol_once($string, $allowed_protocols)
###############################################################################
# This function searches for a URL protocol at the beginning of $string, while
# handling whitespace and HTML entities. The protocol is everything before
# the first colon (written as ":", "&#58;" or "&#x3A;"); it is replaced by
# the return value of kses_bad_protocol_once2().
#
# $string            - attribute value to examine.
# $allowed_protocols - allowed URL protocols, passed on unchanged.
#
# The original implementation used preg_replace() with the /e modifier and
# placed the matched text inside a double-quoted string that was evaluated
# as PHP code. That modifier no longer exists in PHP 7 and later, and text
# such as "$" inside the match was open to interpolation. A callback receives
# the matched text as plain data instead.
###############################################################################
{
  $protocol_pattern = '/^((&[^;]*;|[\sA-Za-z0-9])*)'.
                      '(:|&#0*58;|&#[Xx]3[Aa];)\s*/';

  return preg_replace_callback(
           $protocol_pattern,
           function ($match) use ($allowed_protocols)
           {
             return kses_bad_protocol_once2($match[1], $allowed_protocols);
           },
           $string);
} # function kses_bad_protocol_once
477
478
function kses_bad_protocol_once2($string, $allowed_protocols)
###############################################################################
# Decides what to put in place of a protocol found by
# kses_bad_protocol_once().
#
# The candidate is normalized first: numeric entities are decoded, then
# whitespace, NUL characters and \xad bytes are removed and the result is
# lower-cased. If it equals one of $allowed_protocols (compared in lower
# case) the normalized protocol plus ":" is returned, otherwise an empty
# string.
###############################################################################
{
  $protocol = kses_decode_entities($string);
  $protocol = preg_replace('/\s/', '', $protocol);
  $protocol = kses_no_null($protocol);
  $protocol = preg_replace('/\xad+/', '', $protocol);
   # deals with Opera "feature"
  $protocol = strtolower($protocol);

  foreach ($allowed_protocols as $candidate)
  {
    if (strtolower($candidate) == $protocol)
    {
      return "$protocol:";
    }
  }

  return '';
} # function kses_bad_protocol_once2
505
506
function kses_normalize_entities($string)
###############################################################################
# This function normalizes HTML entities. It will convert "AT&T" to the correct
# "AT&amp;T", "&#00058;" to "&#58;", "&#XYZZY;" to "&amp;#XYZZY;" and so on.
#
# Every "&" is first turned into "&amp;", which disarms all entities. The
# forms that are accepted are then turned back:
#   - named entities: a letter followed by up to 19 letters or digits,
#   - decimal entities: up to 5 significant digits, accepted or rejected by
#     kses_normalize_entities2(),
#   - hexadecimal entities: 2 or 4 significant hex digits.
# Leading zeroes of numeric entities are dropped.
#
# The decimal step used preg_replace() with the /e modifier, which no longer
# exists in PHP 7 and later; it now uses a callback.
###############################################################################
{
# Disarm all entities by converting & to &amp;

  $string = str_replace('&', '&amp;', $string);

# Change back the allowed entities in our entity whitelist

  $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]{0,19});/',
                         '&\\1;', $string);
  $string = preg_replace_callback('/&amp;#0*([0-9]{1,5});/',
                                  function ($match)
                                  {
                                    return kses_normalize_entities2($match[1]);
                                  },
                                  $string);
  $string = preg_replace('/&amp;#([Xx])0*(([0-9A-Fa-f]{2}){1,2});/',
                         '&#\\1\\2;', $string);

  return $string;
} # function kses_normalize_entities


function kses_normalize_entities2($i)
###############################################################################
# This function helps kses_normalize_entities() to only accept 16 bit values
# and nothing more for &#number; entities.
#
# $i - the digits of a decimal entity, without leading zeroes.
#
# Returns "&#$i;" when $i is at most 65535, and the disarmed form
# "&amp;#$i;" when it is larger.
###############################################################################
{
  return (($i > 65535) ? "&amp;#$i;" : "&#$i;");
} # function kses_normalize_entities2
538
539
function kses_decode_entities($string)
###############################################################################
# This function decodes numeric HTML entities (&#65; and &#x41;). It doesn't
# do anything with other entities like &auml;, but we don't need them in the
# URL protocol whitelisting system anyway.
#
# Each entity becomes the single byte whose value is the entity number modulo
# 256, which is what chr() produced in the original code. Decimal entities
# are decoded first, hexadecimal ones second.
#
# The original implementation used preg_replace() with the /e modifier, which
# no longer exists in PHP 7 and later; callbacks are used instead.
###############################################################################
{
  $string = preg_replace_callback('/&#([0-9]+);/',
                                  function ($match)
                                  {
                                    return chr(((int) $match[1]) % 256);
                                  },
                                  $string);

  $string = preg_replace_callback('/&#[Xx]([0-9A-Fa-f]+);/',
                                  function ($match)
                                  {
                                    # The last two hex digits alone give the
                                    # value modulo 256.
                                    return chr(hexdec(substr($match[1], -2)));
                                  },
                                  $string);

  return $string;
} # function kses_decode_entities
553
554 ?>
555
Note: See TracBrowser for help on using the browser.