What's the best way to clean up a URL with a title in it?
What's the best way to clean up a URL? I am looking for a URL like this
what_is_the_best_headache_medication
My current code
/// <summary>
/// Turns a free-text title into a URL fragment: strips punctuation, collapses
/// runs of spaces and joins the remaining words with underscores.
/// </summary>
/// <param name="str">The title to clean. Must not be null.</param>
/// <returns>The cleaned title, e.g. "what_is_the_best_headache_medication".</returns>
public string CleanURL(string str)
{
    // Punctuation that is removed outright. The double quote is included
    // because the original list repeated the backslash at the position where
    // the quote was presumably intended.
    char[] unwanted =
    {
        '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '+', '=',
        '{', '[', ']', '}', '|', '\\', ':', ';', '"', '\'', '<', '>', ',', '.',
        '`', '~', '/', '?'
    };

    foreach (char c in unwanted)
    {
        str = str.Replace(c.ToString(), "");
    }

    // Removing punctuation can leave several spaces in a row ("a & b" -> "a  b"),
    // so collapse them until only single spaces remain.
    while (str.Contains("  "))
    {
        str = str.Replace("  ", " ");
    }

    return str.Replace(" ", "_");
}
source to share
Regular expressions:
/// <summary>
/// Builds a URL fragment from a title using a whitelist: only ASCII letters,
/// digits and spaces survive, and every run of spaces becomes one underscore.
/// </summary>
/// <param name="str">The title to clean.</param>
/// <returns>The cleaned title with words joined by underscores.</returns>
public string CleanURL(string str)
{
    // Drop every character that is not an ASCII letter, a digit or a space.
    string lettersDigitsSpaces = Regex.Replace(str, "[^a-zA-Z0-9 ]", "");

    // One or more consecutive spaces collapse into a single underscore.
    return Regex.Replace(lettersDigitsSpaces, " +", "_");
}
(Not actually tested; written off the top of my head.)
Let me explain:
The first line removes anything that is not an alphanumeric character (upper or lower case) or space. The second line replaces any sequence of spaces (1 or more, sequentially) with a single underscore.
source to share
It is generally best to use a whitelist regular-expression approach rather than trying to remove every unwanted character, because with a blacklist you will almost certainly miss some of them.
The answers here are fine, but I personally didn't want to completely remove the umlauts and accented characters. So the final solution I came up with looks like this:
/// <summary>
/// Converts a title into a lower-case, hyphen-separated URL slug. German
/// umlauts and eszett are transliterated (ä -> ae, ö -> oe, ü -> ue, ß -> ss),
/// diacritics are stripped via <c>RemoveDiacritics</c>, and anything that is
/// not a-z, 0-9 or a word separator is dropped.
/// </summary>
/// <param name="value">The text to convert; may be null or empty.</param>
/// <returns>
/// The slug, or <paramref name="value"/> unchanged when it is null or empty.
/// </returns>
public static string CleanUrl(string value)
{
    if (string.IsNullOrEmpty(value))
        return value;

    // treat existing hyphens as word separators; lower-case independently of
    // the current culture so the result is the same on every machine
    value = value.Replace("-", " ").ToLowerInvariant();

    // replace umlauts and eszett with their equivalent
    value = value.Replace("ß", "ss");
    value = value.Replace("ä", "ae");
    value = value.Replace("ö", "oe");
    value = value.Replace("ü", "ue");

    // removes diacritic marks (often called accent marks) from characters
    value = RemoveDiacritics(value);

    // remove all remaining unwanted chars (white list); whitespace is kept
    // for now so that words separated only by a removed symbol ("a & b")
    // still end up with exactly one hyphen between them
    value = Regex.Replace(value, @"[^a-z0-9\s]", String.Empty);

    // trim, then replace each run of whitespace with a single hyphen; doing
    // this last guarantees no leading, trailing or doubled hyphens
    value = Regex.Replace(value.Trim(), @"\s+", "-");

    return value;
}
The RemoveDiacritics method used above
is based on the answer from Blair Conrad:
/// <summary>
/// Removes diacritic (accent) marks from a string, e.g. "café" becomes "cafe".
/// The text is decomposed (Unicode form D), non-spacing marks are dropped, and
/// the result is then converted through code page 850 and back.
/// </summary>
/// <param name="value">The text to process; may be null or empty.</param>
/// <returns>
/// The text without diacritics, or <paramref name="value"/> unchanged when it
/// is null or empty. If code page 850 is not available on the current runtime,
/// the mark-stripped text is returned without the code-page conversion.
/// </returns>
public static string RemoveDiacritics(string value)
{
    if (string.IsNullOrEmpty(value))
        return value;

    // Form D splits an accented letter into its base letter followed by
    // separate combining marks, which can then be filtered out.
    string decomposed = value.Normalize(NormalizationForm.FormD);
    StringBuilder stripped = new StringBuilder(decomposed.Length);
    foreach (char c in decomposed)
    {
        if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
            stripped.Append(c);
    }

    // Code page 850 is not built into every runtime (on .NET Core / .NET 5+
    // it requires a registered code-pages encoding provider). Fall back to the
    // mark-stripped text instead of throwing when it cannot be obtained.
    Encoding nonunicode;
    try
    {
        nonunicode = Encoding.GetEncoding(850);
    }
    catch (System.NotSupportedException)
    {
        return stripped.ToString();
    }
    catch (System.ArgumentException)
    {
        return stripped.ToString();
    }

    // Round-trip through the code page: characters it cannot represent are
    // substituted by the encoding during conversion.
    Encoding unicode = Encoding.Unicode;
    byte[] nonunicodeBytes = Encoding.Convert(unicode, nonunicode, unicode.GetBytes(stripped.ToString()));
    return nonunicode.GetString(nonunicodeBytes);
}
Hope this helps anyone who needs to build URL slugs while keeping umlauts and similar characters by replacing them with their URL-safe equivalents.
source to share
Or, a little more verbose, but this only allows alphanumeric and spaces (which are replaced with "-")
// Keep only ASCII letters, digits and spaces from Dirty; each kept space is
// written as "-". A StringBuilder avoids re-allocating the whole string on
// every appended character.
System.Text.StringBuilder kept = new System.Text.StringBuilder(Dirty.Length);
foreach (char c in Dirty)
{
    bool isLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    bool isDigit = c >= '0' && c <= '9';

    if (isLetter || isDigit)
        kept.Append(c);
    else if (c == ' ')
        kept.Append('-');
}
string Cleaned = kept.ToString();
source to share
The way Stack Overflow does it can be found here:
fooobar.com/questions/29019 / ...
It is optimized for speed ("This is the second version, deployed 5x more performance") and takes care of a lot of special characters.
source to share