How to parse URLs with pure JavaScript

One of the standard tasks I often need is “parsing a URL into its components”. Here you will find my simple RegExp-driven solution to this problem.

advantages

  • small & fast
  • ready to use
  • pure JS: uses only built-in standard features
  • it works for more than 98% of the URLs you will encounter on the Web…

disadvantages / known flaws

These flaws are accepted in order to keep the code small & fast while still working with the majority of URLs:

  • no error handling (when it breaks at the last 2% of the URLs… it breaks)
  • a subdomain, the domain name and the top-level domain (e.g. “.COM”) cannot be separated without much more code, because multi-level TLDs like “.CO.UK” are possible on the one side and multiple subdomains on the other side
  • a URL without protocol of the form “//HOST/PATH” cannot be distinguished from a relative URL starting with two slashes instead of one (“//SECOND_PATH_ELEMENT/MORE_PATH”) -> in the second variant, SECOND_PATH_ELEMENT will be parsed as HOST
  • other protocols like mail, tel, app or news and other special cases are not implemented
  • the login mechanism via USER:PWD@HOST, which does not conform to the RFC, is not implemented
  1. function parse_url(url) {
  2.     var match = url.match(/^(http|https|ftp)?(?:[\:\/]*)([a-z0-9\.-]*)(?:\:([0-9]+))?(\/[^?#]*)?(?:\?([^#]*))?(?:#(.*))?$/i);
  3.     var ret   = new Object();
  4.  
  5.     ret['protocol'] = '';
  6.     ret['host']     = match[2];
  7.     ret['port']     = '';
  8.     ret['path']     = '';
  9.     ret['query']    = '';
  10.     ret['fragment'] = '';
  11.    
  12.     if(match[1]){
  13.         ret['protocol'] = match[1];
  14.     }
  15.  
  16.     if(match[3]){
  17.         ret['port']     = match[3];
  18.     }
  19.  
  20.     if(match[4]){
  21.         ret['path']     = match[4];
  22.     }
  23.  
  24.     if(match[5]){
  25.         ret['query']    = match[5];
  26.     }
  27.  
  28.     if(match[6]){
  29.         ret['fragment'] = match[6];
  30.     }
  31.  
  32.     return ret;
  33. }
  34.  
  35. var url_parts = parse_url(urls);
  36.  
  37. var protocol  = url_parts['protocol'];
  38. var host      = url_parts['host'];
  39. var port      = url_parts['port'];
  40. var path      = url_parts['path'];
  41. var query     = url_parts['query'];
  42. var fragment  = url_parts['fragment'];
/**
 * Splits a URL into its components with a single regular expression.
 *
 * Only the schemes http, https and ftp are recognized, and only when they are
 * directly followed by a colon. Without that requirement a scheme-less host
 * such as "ftp.example.com" or "httpbin.org" would be cut into a bogus
 * protocol ("ftp" / "http") and a truncated host (".example.com" / "bin.org").
 *
 * @param {string} url - The URL to parse; it is converted with String() first.
 * @returns {{protocol: string, host: string, port: string, path: string,
 *            query: string, fragment: string}}
 *          All components as strings. A component that is absent from the URL
 *          is returned as the empty string. The query is returned without the
 *          leading "?", the fragment without the leading "#", and the port
 *          without the leading ":".
 * @throws {TypeError} If url is null/undefined, or if it does not match the
 *         supported URL shape (for example "user@host").
 */
function parse_url(url) {
	if (url == null) {
		throw new TypeError('parse_url: url must be a string, got ' + url);
	}

	// Capture groups: 1 = scheme, 2 = host, 3 = port, 4 = path, 5 = query, 6 = fragment.
	// The lookahead (?=:) keeps the scheme optional but refuses to treat a
	// host-name prefix as a scheme.
	var pattern = /^(?:(https?|ftp)(?=:))?[:\/]*([a-z0-9.-]*)(?::([0-9]+))?(\/[^?#]*)?(?:\?([^#]*))?(?:#(.*))?$/i;
	var match = String(url).match(pattern);

	if (match === null) {
		// Previously this case crashed on match[2] with an unrelated message.
		throw new TypeError('parse_url: URL cannot be parsed: ' + url);
	}

	// Optional groups that did not participate are undefined; normalize to ''.
	return {
		protocol: match[1] || '',
		host:     match[2],
		port:     match[3] || '',
		path:     match[4] || '',
		query:    match[5] || '',
		fragment: match[6] || ''
	};
}

// Example usage: parse one URL and expose every component as its own variable.
// NOTE(review): `urls` is not defined in this snippet; presumably it is an
// input value supplied by the host environment - confirm against the caller.
var url_parts = parse_url(urls);

// Each component is a string; parse_url() yields '' for parts that are absent.
var protocol  = url_parts.protocol;
var host      = url_parts.host;
var port      = url_parts.port;
var path      = url_parts.path;
var query     = url_parts.query;
var fragment  = url_parts.fragment;

You are free to use this code if you respect the following simple license. This function was developed for use in Pentaho; you can download a test transformation with example URLs here.