//
// urlparse.h - util::url - fairly usable http/ftp URL parser.
// Copyright (c) 2006 Roger Clark
//
#ifndef URLPARSE_H
#define URLPARSE_H

#include <cstdlib>
#include <cwchar>
#include <sstream>
#include <string>

namespace util
{
    // NOTE: replace these with something a little more robust.
    //
    // tointeger() converts the leading base-10 digits of a string to an int.
    // The pointer form is only declared here; it is defined through the
    // explicit specializations for char and wchar_t below, so instantiating
    // it for any other character type fails at link time.
    template<typename tChar>
    inline int tointeger(const tChar* str);

    // Narrow-character specialization. strtol is used rather than atoi
    // because atoi has undefined behavior when the value is out of range.
    template<>
    inline int tointeger<char>(const char* str)
    {
        return static_cast<int>(std::strtol(str, NULL, 10));
    }

    // Wide-character specialization. wcstol is the standard counterpart of
    // the Microsoft-specific _wtoi and is available on every platform.
    template<>
    inline int tointeger<wchar_t>(const wchar_t* str)
    {
        return static_cast<int>(std::wcstol(str, NULL, 10));
    }

    // Convenience overload: converts the contents of a basic_string by
    // forwarding its c_str() to the pointer form above.
    template<typename tChar>
    inline int tointeger(const std::basic_string<tChar>& str)
    {
        return tointeger(str.c_str());
    }

    // this class is pretty brittle as far as the parsing goes,
    // but it can be templated over the character type and is
    // useful for parsing a URL, modifying a component, and then
    // dumping back out.
    //
    // Components are stored verbatim: nothing is percent-decoded when
    // parsing and nothing is percent-encoded when the URL is rebuilt.
    template<typename tChar>
    class url
    {
    public:
        typedef tChar char_type;
        typedef std::basic_string<tChar> string_type;

        // Value of get_port() when the URL carries no explicit port.
        static const int invalid_port = -1;

        // Creates an empty URL: every component is empty and the port is
        // invalid_port.
        url()
            : port_(invalid_port)
        {
        }

        // Creates a URL by parsing text (see set_url). Deliberately left
        // non-explicit so existing callers that rely on the implicit
        // conversion from a string keep compiling.
        url(const string_type& text)
            : port_(invalid_port)
        {
            set_url(text);
        }

        // Rebuilds the textual URL from the stored components:
        //   protocol://[username[:password]@]hostname[:port][/path[?query]]
        // The password is only written when a username is present, and the
        // port only when it differs from invalid_port.
        string_type get_url() const
        {
            std::basic_stringstream<tChar> stream;
            stream << protocol_ << "://";

            if (!username_.empty())
            {
                stream << username_;
                if (!password_.empty())
                    stream << ":" << password_;
                stream << "@";
            }

            stream << hostname_;
            if (port_ != invalid_port)
                stream << ":" << port_;

            // Emit the path section when either the path or the query is
            // set; otherwise a URL such as "http://host/?q=1" (empty path,
            // non-empty query) would lose its query on the way back out.
            if (!path_.empty() || !query_.empty())
            {
                stream << "/" << path_;
                if (!query_.empty())
                    stream << "?" << query_;
            }

            return stream.str();
        }

        // Component accessors. Each returns a copy of the stored value.
        string_type get_protocol() const { return protocol_; }
        string_type get_username() const { return username_; }
        string_type get_password() const { return password_; }
        string_type get_hostname() const { return hostname_; }
        int get_port() const { return port_; }
        string_type get_path() const { return path_; }
        string_type get_query() const { return query_; }

        // Discards every stored component and re-parses text. If text does
        // not contain "://" all components are left empty and the port is
        // invalid_port.
        void set_url(const string_type& text)
        {
            protocol_.clear();
            username_.clear();
            password_.clear();
            hostname_.clear();
            port_ = invalid_port;
            path_.clear();
            query_.clear();

            parse_url(text);
        }

        // Component mutators. Values are stored as given, without
        // validation. The path is stored without its leading '/'.
        void set_protocol(const string_type& prot) { protocol_ = prot; }
        void set_username(const string_type& user) { username_ = user; }
        void set_password(const string_type& pass) { password_ = pass; }
        void set_hostname(const string_type& host) { hostname_ = host; }
        void set_port(int port) { port_ = port; }
        void set_path(const string_type& path) { path_ = path; }
        void set_query(const string_type& query) { query_ = query; }

        // Copies every component from operand. The parameter is a const
        // reference so that const objects and temporaries can be assigned.
        url& operator=(const url& operand)
        {
            protocol_ = operand.protocol_;
            username_ = operand.username_;
            password_ = operand.password_;
            hostname_ = operand.hostname_;
            port_ = operand.port_;
            path_ = operand.path_;
            query_ = operand.query_;
            return (*this);
        }

    private:
        typedef typename string_type::size_type size_type;

        // Splits text into the member components.
        void parse_url(const string_type& text)
        {
            // parse a url of the form:
            // protocol://[username[:password]@]hostname[:port][/path[?query]]

            // this still assumes that the character set is a superset of ascii, which is true for
            // ANSI (US OEM) and most MBCS sets, as well as UTF-8/16/32.
            const char_type protocol_separator_chars[4] = { ':', '/', '/', 0 };
            const char_type colon = ':';
            const char_type slash = '/';
            const char_type at_sign = '@';
            const char_type question_mark = '?';

            const size_type protocol_separator = text.find(protocol_separator_chars);
            if (protocol_separator == string_type::npos)
                return;

            protocol_ = text.substr(0, protocol_separator);

            // The authority ([user[:pass]@]host[:port]) runs from just after
            // "://" to the first '/' or to the end of the text. Credentials
            // and port are only looked for inside this range, so an '@' or
            // ':' in the path or query is never mistaken for a separator.
            const size_type authority_begin = protocol_separator + 3;
            size_type authority_end = text.find(slash, authority_begin);
            const bool has_path = (authority_end != string_type::npos);
            if (!has_path)
                authority_end = text.length();

            // Optional credentials, terminated by the first '@'.
            size_type host_begin = authority_begin;
            const size_type credentials_separator = text.find(at_sign, authority_begin);
            if ((credentials_separator != string_type::npos) && (credentials_separator < authority_end))
            {
                const size_type password_separator = text.find(colon, authority_begin);
                if ((password_separator != string_type::npos) && (password_separator < credentials_separator))
                {
                    username_ = text.substr(authority_begin, password_separator - authority_begin);
                    password_ = text.substr(password_separator + 1, credentials_separator - (password_separator + 1));
                }
                else
                {
                    username_ = text.substr(authority_begin, credentials_separator - authority_begin);
                }
                host_begin = credentials_separator + 1;
            }

            // Hostname, optionally followed by ":port".
            size_type host_end = authority_end;
            const size_type port_separator = text.find(colon, host_begin);
            if ((port_separator != string_type::npos) && (port_separator < authority_end))
            {
                host_end = port_separator;
                const string_type strport = text.substr(port_separator + 1, authority_end - (port_separator + 1));

                // "host:" with nothing after the colon means no port was
                // given; leave invalid_port instead of converting "" to 0.
                if (!strport.empty())
                    port_ = tointeger(strport);
            }
            hostname_ = text.substr(host_begin, host_end - host_begin);

            // Path and optional query; both are stored without their
            // leading separator character.
            if (has_path)
            {
                const size_type path_begin = authority_end + 1;
                const size_type query_separator = text.find(question_mark, path_begin);
                if (query_separator != string_type::npos)
                {
                    path_ = text.substr(path_begin, query_separator - path_begin);
                    query_ = text.substr(query_separator + 1);
                }
                else
                {
                    path_ = text.substr(path_begin);
                }
            }
        }

        string_type protocol_;  // text before "://"
        string_type username_;  // empty when the URL has no credentials
        string_type password_;  // empty when no ":password" was given
        string_type hostname_;
        int port_;              // invalid_port when no port was given
        string_type path_;      // without the leading '/'
        string_type query_;     // without the leading '?'
    };
} // namespace util

#endif