What's the best way to clean up a URL with a title in it?

What's the best way to clean up a URL? I am looking for a URL like this

what_is_the_best_headache_medication

My current code

public string CleanURL(string str)
{
    str = str.Replace("!", "");
    str = str.Replace("@", "");
    str = str.Replace("#", "");
    str = str.Replace("$", "");
    str = str.Replace("%", "");
    str = str.Replace("^", "");
    str = str.Replace("&", "");
    str = str.Replace("*", "");
    str = str.Replace("(", "");
    str = str.Replace(")", "");
    str = str.Replace("-", "");
    str = str.Replace("_", "");
    str = str.Replace("+", "");
    str = str.Replace("=", "");
    str = str.Replace("{", "");
    str = str.Replace("[", "");
    str = str.Replace("]", "");
    str = str.Replace("}", "");
    str = str.Replace("|", "");
    str = str.Replace(@"\", "");
    str = str.Replace(":", "");
    str = str.Replace(";", "");
    str = str.Replace(@"\", "");
    str = str.Replace("'", "");
    str = str.Replace("<", "");
    str = str.Replace(">", "");
    str = str.Replace(",", "");
    str = str.Replace(".", "");
    str = str.Replace("`", "");
    str = str.Replace("~", "");
    str = str.Replace("/", "");
    str = str.Replace("?", "");
    str = str.Replace("  ", " ");
    str = str.Replace("   ", " ");
    str = str.Replace("    ", " ");
    str = str.Replace("     ", " ");
    str = str.Replace("      ", " ");
    str = str.Replace("       ", " ");
    str = str.Replace("        ", " ");
    str = str.Replace("         ", " ");
    str = str.Replace("          ", " ");
    str = str.Replace("           ", " ");
    str = str.Replace("            ", " ");
    str = str.Replace("             ", " ");
    str = str.Replace("              ", " ");
    str = str.Replace(" ", "_");
    return str;
}

      

+2


source to share


7 replies


Regular expressions:

public string CleanURL(string str)
{
    str = Regex.Replace(str, "[^a-zA-Z0-9 ]", "");
    str = Regex.Replace(str, " +", "_");
    return str;
}

      

(Not really verified, head-on.)



Let me explain:

The first line removes anything that is not an alphanumeric character (upper or lower case) or space. The second line replaces any sequence of spaces (1 or more, sequentially) with a single underscore.

+3


source


It is generally best to go with the regular whitelist expression approach instead of removing all unwanted characters, because you will definitely miss out on some of them.

The answers here are fine, but I personally didn't want to completely remove the umlauts and accented characters. So the final solution I came up with looks like this:

public static string CleanUrl(string value)
{
    if (value.IsNullOrEmpty())
        return value;

    // replace hyphens to spaces, remove all leading and trailing whitespace
    value = value.Replace("-", " ").Trim().ToLower();

    // replace multiple whitespace to one hyphen
    value = Regex.Replace(value, @"[\s]+", "-");

    // replace umlauts and eszett with their equivalent
    value = value.Replace("ß", "ss");
    value = value.Replace("ä", "ae");
    value = value.Replace("ö", "oe");
    value = value.Replace("ü", "ue");

    // removes diacritic marks (often called accent marks) from characters
    value = RemoveDiacritics(value);

    // remove all left unwanted chars (white list)
    value = Regex.Replace(value, @"[^a-z0-9\s-]", String.Empty);

    return value;
}

      



The method used is RemoveDiacritics

based on the answer from Blair Conrad :

public static string RemoveDiacritics(string value)
{
    if (value.IsNullOrEmpty())
        return value;

    string normalized = value.Normalize(NormalizationForm.FormD);
    StringBuilder sb = new StringBuilder();

    foreach (char c in normalized)
    {
        if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
            sb.Append(c);
    }

    Encoding nonunicode = Encoding.GetEncoding(850);
    Encoding unicode = Encoding.Unicode;

    byte[] nonunicodeBytes = Encoding.Convert(unicode, nonunicode, unicode.GetBytes(sb.ToString()));
    char[] nonunicodeChars = new char[nonunicode.GetCharCount(nonunicodeBytes, 0, nonunicodeBytes.Length)];
    nonunicode.GetChars(nonunicodeBytes, 0, nonunicodeBytes.Length, nonunicodeChars, 0);

    return new string(nonunicodeChars);
}

      

Hope this helps someone to challenge it by adding urls and storing umlauts and friends at the same time with their url equivalent.

+3


source


You should use a regular expression instead. This is much more efficient than what you are trying to do above.

Read more about regular expressions here .

+2


source


  • How do you define a "friendly" URL? I assume you want to remove _ etc.
  • I would look into regex here.

If you want to continue with the method above, I would suggest going to StringBuilder by line. This is because each of the replace operations creates a new row.

0


source


I can tighten one piece of this:

while (str.IndexOf("  ") > 0)
    str = str.Replace("  ", " ");

      

... instead of your infinite number of replacements " "

. But you almost certainly want to use a regular expression.

0


source


Or, a little more verbose, but this only allows alphanumeric and spaces (which are replaced with "-")

string Cleaned = String.Empty;
foreach (char c in Dirty)
    if (((c >= 'a') && (c <= 'z')) ||
         (c >= 'A') && (c <= 'Z') ||
         (c >= '0') && (c <= '9') ||
         (c == ' '))
           Cleaned += c;
Cleaned = Cleaned.Replace(" ", "-");

      

0


source


The way stackoverflow does it can be found here:

fooobar.com/questions/29019 / ...

optimized for speed ("This is the second version, deployed 5x more performance") and taking care of a lot of special characters.

0


source







All Articles