One problem I've had with slugification (new word!) is collisions. If I have a blog post, for instance, called "Stack-Overflow" and one called "Stack Overflow", the slugs of those two titles are the same. Therefore, my slug generator usually has to involve the database in some way. This might be why you don't see more generic solutions out there.
Here you find a way to generate url slug in c#. This function remove all accents(Marcel's answer), replace spaces, remove invalid chars, trim dashes from end and replace double occurences of "-" or "_"
Code:
public static string ToUrlSlug(string value){
//First to lower case
value = value.ToLowerInvariant();
//Remove all accents
var bytes = Encoding.GetEncoding("Cyrillic").GetBytes(value);
value = Encoding.ASCII.GetString(bytes);
//Replace spaces
value = Regex.Replace(value, @"\s", "-", RegexOptions.Compiled);
//Remove invalid chars
value = Regex.Replace(value, @"[^a-z0-9\s-_]", "",RegexOptions.Compiled);
//Trim dashes from end
value = value.Trim('-', '_');
//Replace double occurences of - or _
value = Regex.Replace(value, @"([-_]){2,}", "$1", RegexOptions.Compiled);
return value ;
}
Explicit Regex caching for modest speed improvements.
More word separators recognized and normalized to hyphens.
Here is the code:
public class UrlSlugger
{
// white space, em-dash, en-dash, underscore
static readonly Regex WordDelimiters = new Regex(@"[\s—–_]", RegexOptions.Compiled);
// characters that are not valid
static readonly Regex InvalidChars = new Regex(@"[^a-z0-9\-]", RegexOptions.Compiled);
// multiple hyphens
static readonly Regex MultipleHyphens = new Regex(@"-{2,}", RegexOptions.Compiled);
public static string ToUrlSlug(string value)
{
// convert to lower case
value = value.ToLowerInvariant();
// remove diacritics (accents)
value = RemoveDiacritics(value);
// ensure all word delimiters are hyphens
value = WordDelimiters.Replace(value, "-");
// strip out invalid characters
value = InvalidChars.Replace(value, "");
// replace multiple hyphens (-) with a single hyphen
value = MultipleHyphens.Replace(value, "-");
// trim hyphens (-) from ends
return value.Trim('-');
}
/// See: http://www.siao2.com/2007/05/14/2629747.aspx
private static string RemoveDiacritics(string stIn)
{
string stFormD = stIn.Normalize(NormalizationForm.FormD);
StringBuilder sb = new StringBuilder();
for (int ich = 0; ich < stFormD.Length; ich++)
{
UnicodeCategory uc = CharUnicodeInfo.GetUnicodeCategory(stFormD[ich]);
if (uc != UnicodeCategory.NonSpacingMark)
{
sb.Append(stFormD[ich]);
}
}
return (sb.ToString().Normalize(NormalizationForm.FormC));
}
}
This still does not solve the non-latin character issue. A completely alternative solution would be to use Uri.EscapeDataString to convert the the string its hex representation:
string original = "测试公司";
// %E6%B5%8B%E8%AF%95%E5%85%AC%E5%8F%B8
string converted = Uri.EscapeDataString(original);