XMPP/JID validation

From StatusNet
Jump to: navigation, search

Copyright 2009 Patrick Georgi <patrick@georgi-clan.de> Licensed under ISC-L, which is compatible with everything else that keeps the copyright notice intact.

<?php
/* Sample JIDs fed to validatejid() by the loop at the end of this script. */
$jids = array(
        'a',          // server only
        'a@b',        // node and server
        'a@@b',       // two "@" separators
        'a.@b',       // node ending in a dot
        'a @b',       // space inside the node
        'a@b/c',      // node, server and resource
        'a/b',        // server and resource, no node
        'a@b/c/d',    // a second "/" after the first one
        'a@b/c@d',    // an "@" after the first "/"
        'a@b/c@d/e',  // both an "@" and a "/" after the first "/"
);

/* arrays must be sorted (ascending byte order) */
/* One-byte strings that may appear neither in the node nor in the resource.
   The section labels are the ones used for these ranges in the original table. */
$forbidden_for_node_and_resource = array_map('chr', array_merge(
        range(0x00, 0x1f),      // C.2.1
        array(0x20),            // C.1.1
        array(0x7f),            // C.2.1
        range(0x80, 0x9f),      // C.2.2
        array(0xa0)             // C.1.2
));

/* Additional bytes that are forbidden in the node only, in ascending order:
   0x22 ", 0x26 &, 0x27 ', 0x2f /, 0x3a :, 0x3c <, 0x3e >, 0x40 @ */
$forbidden_for_node = str_split('"&\'/:<>@');

/**
 * Check the structure of a JID of the form [node@]server[/resource] and scan
 * the node and the resource for forbidden bytes.
 *
 * The JID is split at the first "/": everything after it is the resource, so
 * a resource may itself contain "/" and "@". The part before the "/" may
 * contain at most one "@".
 *
 * A JID is rejected when
 *  - the server part is empty,
 *  - an "@" is present but the node is empty,
 *  - a "/" is present but the resource is empty,
 *  - the node contains a byte listed in $forbidden_for_node_and_resource or
 *    $forbidden_for_node,
 *  - the resource contains a byte listed in $forbidden_for_node_and_resource.
 *
 * The server part is only checked for being non-empty.
 *
 * NOTE(review): the scan is byte-wise. Multi-byte UTF-8 characters whose
 * continuation bytes fall into 0x80-0xa0 are therefore rejected as well;
 * validatejid2() works on code points instead.
 *
 * @param string $jid The JID to check.
 * @return bool True when the JID passes all checks, false otherwise.
 */
function validatejid($jid) {
        global $forbidden_for_node_and_resource, $forbidden_for_node;

        $parts=explode("/", (string)$jid, 2);
        $hasresource=(count($parts)==2);
        /* without a "/" there is no $parts[1]; treat that as "no resource" */
        $resource=$hasresource ? $parts[1] : "";
        if ($hasresource && $resource==="") return false;

        $bare=explode("@", $parts[0]);
        if (count($bare)>2) return false;
        if (count($bare)==1) {
                $server=$bare[0];
                $node="";
        } else {
                $server=$bare[1];
                $node=$bare[0];
                if ($node==="") return false;
        }
        if ($server==="") return false;

        if (validatejid_has_forbidden_byte($node, $forbidden_for_node_and_resource)) return false;
        if (validatejid_has_forbidden_byte($node, $forbidden_for_node)) return false;
        if (validatejid_has_forbidden_byte($resource, $forbidden_for_node_and_resource)) return false;
        return true;
}

/**
 * Helper for validatejid(): tell whether $part contains any of the one-byte
 * strings listed in $forbidden. Does not depend on the order of $forbidden.
 *
 * @param string $part      JID part to scan, byte by byte.
 * @param array  $forbidden List of one-byte strings.
 * @return bool True when at least one byte of $part is listed.
 */
function validatejid_has_forbidden_byte($part, $forbidden) {
        /* flip once so that every byte is a single hash lookup */
        $lookup=array_flip($forbidden);
        $len=strlen($part);
        for ($i=0; $i<$len; $i++) {
                if (isset($lookup[$part[$i]])) return true;
        }
        return false;
}
/* Demo driver: for every sample JID, emit the result of validatejid()
   (printed as "1" for true and as an empty string for false), a space,
   the JID itself and a newline. */
foreach ($jids as $jid) {
        echo validatejid($jid), " ", $jid, "\n";
}
?>

The same test using preg and the full Unicode tables:

/**
 * Check the structure of a JID of the form [node@]server[/resource] and test
 * the node and the resource against the stringprep "prohibited output" tables,
 * working on Unicode code points (the JID is expected to be UTF-8).
 *
 * The JID is split at the first "/": everything after it is the resource, so
 * a resource may itself contain "/" and "@". The part before the "/" may
 * contain at most one "@".
 *
 * A JID is rejected when
 *  - the server part is empty or not valid UTF-8,
 *  - an "@" is present but the node is empty,
 *  - a "/" is present but the resource is empty,
 *  - the node or the resource is not valid UTF-8,
 *  - the node contains a code point from the tables below or one of the
 *    extra nodeprep characters " & ' / : < > @,
 *  - the resource contains a code point from the tables below.
 *
 * Apart from the UTF-8 and non-empty checks the server part is not examined.
 *
 * @param string $jid The JID to check.
 * @return bool True when the JID passes all checks, false otherwise.
 */
function validatejid2($jid) {
        /* the following definitions come from stringprep, Appendix C,
           which is used in its entirety by nodeprep, Chapter 5, "Prohibited Output".
           The \x{...} escapes are meant for PCRE, hence the single quotes. */
        /* C1.1 ASCII space characters */
        $chars = '\x{20}';
        /* C1.2 Non-ASCII space characters */
        $chars .= '\x{a0}\x{1680}\x{2000}-\x{200b}\x{202f}\x{205f}\x{3000}';
        /* C2.1 ASCII control characters */
        $chars .= '\x{00}-\x{1f}\x{7f}';
        /* C2.2 Non-ASCII control characters */
        $chars .= '\x{80}-\x{9f}\x{6dd}\x{70f}\x{180e}\x{200c}\x{200d}\x{2028}\x{2029}\x{2060}-\x{2063}\x{206a}-\x{206f}\x{feff}\x{fff9}-\x{fffc}\x{1d173}-\x{1d17a}';
        /* C3 - Private Use */
        $chars .= '\x{e000}-\x{f8ff}\x{f0000}-\x{ffffd}\x{100000}-\x{10fffd}';
        /* C4 - Non-character code points */
        $chars .= '\x{fdd0}-\x{fdef}\x{fffe}\x{ffff}\x{1fffe}\x{1ffff}\x{2fffe}\x{2ffff}\x{3fffe}\x{3ffff}\x{4fffe}\x{4ffff}\x{5fffe}\x{5ffff}\x{6fffe}\x{6ffff}\x{7fffe}\x{7ffff}\x{8fffe}\x{8ffff}\x{9fffe}\x{9ffff}\x{afffe}\x{affff}\x{bfffe}\x{bffff}\x{cfffe}\x{cffff}\x{dfffe}\x{dffff}\x{efffe}\x{effff}\x{ffffe}\x{fffff}\x{10fffe}\x{10ffff}';
        /* C5 - Surrogate codes (U+D800-U+DFFF) are deliberately NOT listed:
           a /u pattern that names them does not compile, and they cannot occur
           in valid UTF-8 anyway. Malformed UTF-8 is rejected further down. */
        /* C6 - Inappropriate for plain text */
        $chars .= '\x{fff9}-\x{fffd}';
        /* C7 - Inappropriate for canonical representation */
        $chars .= '\x{2ff0}-\x{2ffb}';
        /* C8 - Change display properties or are deprecated */
        $chars .= '\x{340}\x{341}\x{200e}\x{200f}\x{202a}-\x{202e}\x{206a}-\x{206f}';
        /* C9 - Tagging characters */
        $chars .= '\x{e0001}\x{e0020}-\x{e007f}';

        /* Nodeprep forbids some more characters: " & ' / : < > @ */
        $nodeprepchars = $chars;
        $nodeprepchars .= '\x{22}\x{26}\x{27}\x{2f}\x{3a}\x{3c}\x{3e}\x{40}';

        $parts=explode('/', (string)$jid, 2);
        $hasresource=(count($parts)==2);
        /* without a "/" there is no $parts[1]; treat that as "no resource" */
        $resource=$hasresource ? $parts[1] : '';
        if ($hasresource && $resource==='') return false;

        $bare=explode('@', $parts[0]);
        if (count($bare)>2) return false;
        if (count($bare)==1) {
                $server=$bare[0];
                $node='';
        } else {
                $server=$bare[1];
                $node=$bare[0];
                if ($node==='') return false;
        }
        if ($server==='') return false;
        /* an empty /u pattern matches any well-formed UTF-8 string and fails on a malformed one */
        if (preg_match('//u', $server)!==1) return false;

        /* preg_match() yields 1 on a hit, 0 on no hit and false on error
           (e.g. malformed UTF-8 in the subject); only a clean 0 is acceptable */
        if (preg_match('/['.$nodeprepchars.']/u', $node)!==0) return false;
        if (preg_match('/['.$chars.']/u', $resource)!==0) return false;
        return true;
}
Personal tools
Namespaces
Variants
Actions
Navigation
Status.net
Toolbox