注冊|登錄

聯系電話:024-31891684  13390130939
沈陽軟件公司--沈陽軟件定制

沈陽軟件開發_沈陽軟件公司_沈陽軟件定制/軟件/最新技術

Latest technology最新技術

辦公OA--正文提取中用到的正則表達式

瀏覽量:2592

CRM定制 辦公OA

#region 相關正則表達式

 
// NOTE(review): with RegexOptions.ExplicitCapture the unnamed parentheses in the first
// alternative do not capture, so the \1 backreference appears to resolve to the first named
// group (lj) instead of the bracket tag name. If so, that alternative can never match - confirm.
// NOTE(review): the lookahead written immediately before "<a" inspects the "<" itself and so
// always succeeds; a lookbehind on the preceding character may have been intended - confirm.
/// <summary>
/// Matches the markup fragments targeted when reducing a page to plain text
/// (original summary: "remove all html tags"). Presumably used with Replace by callers
/// outside this view.
/// </summary>
/// <remarks>
/// Alternatives, in order: a bracket-tag pair in the style of BBCode; <c>lj</c>, an anchor
/// element whose text is at least two characters long and is followed by a character that is
/// neither in \u4E00-\u9FA5 / \uFE30-\uFFA0 nor one of , . " ) ; ; <c>Style</c>, <c>select</c>,
/// <c>Script</c> and <c>li</c>, each spanning a whole element including its content;
/// <c>Explein</c>, an HTML comment; <c>Html</c>, any other tag; <c>Other</c>, a named entity;
/// <c>Other2</c>, a '#' followed by six letters or digits; <c>Space</c>, a whitespace run;
/// and finally a numeric entity.
/// </remarks>
private static readonly Regex FilterAll = new Regex(
@"(\[([^=]*)(=[^\]]*)?\][\s\S]*?\[/\1\])|(?<lj>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");])<a\s+[^>]*>[^<]{2,}</a>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");]))|(?<Style><style[\s\S]+?/style>)|(?<select><select[\s\S]+?/select>)|(?<Script><script[\s\S]*?/script>)|(?<Explein><\!\-\-[\s\S]*?\-\->)|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)|(?<Html></?\s*[^> ]+(\s*[^=>]+?=['""]?[^""']+?['""]?)*?[^\[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>\#[a-z0-9]{6})|(?<Space>\s+)|(\&\#\d+\;)",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase); //(?<Link><a[\s\S]*?</a>)|
//(?<Style><style[\s\S]+?/style>)|(?<select><select[\s\S]+?/select>)|(?<Script><script[\s\S]*?/script>)|(?<Explein><\!\-\-[\s\S]*?\-\->)|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)|(?<Html></?\s*[^> ]+(\s*[^=>]+?=['""]?[^""']+?['""]?)*?[^\[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>\#[a-z0-9]{6})|(?<Space>\s+)
 
/// <summary>
/// Matches an opening or closing title tag that carries no attributes
/// (whitespace is tolerated inside the angle brackets; case is ignored).
/// </summary>
private static readonly Regex FindTitle =
    new Regex(
        @"<\s*/?title\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches a pair of title tags and exposes the text between them
/// as the <c>Content</c> group (shortest possible span).
/// </summary>
private static readonly Regex FindTitleContent =
    new Regex(
        @"<\s*/?title\s*>(?<Content>[\s\S]*?)<\s*/?title\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches opening and closing heading tags (h, h1 through h6) and strong tags
/// that carry no attributes; case is ignored.
/// </summary>
/// <remarks>
/// The heading alternative accepts an optional digit 1-6. The previous form only matched a bare
/// "h" tag, which is not an HTML element, so real headings were never found. Tags such as hr
/// or html still do not match because only a single digit may follow the "h".
/// </remarks>
private static readonly Regex FindHStrong = new Regex(
@"<\s*/?h[1-6]?\s*>|<\s*/?strong\s*>",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);
 
/// <summary>
/// Matches attribute-less p tags (opening or closing), br tags (with or without a
/// trailing slash) and tr tags (opening or closing); case is ignored.
/// </summary>
private static readonly Regex FindPB =
    new Regex(
        @"<\s*/?p\s*>|<\s*br\s*/?>|<\s*/?tr\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches the non-breaking-space entity, with or without its terminating semicolon.
/// </summary>
/// <remarks>
/// The semicolon is optional so that the sloppy form without it still matches, while the
/// well-formed entity is consumed completely. The previous pattern stopped before the
/// semicolon, so a replacement left a stray ';' in the text.
/// </remarks>
private static readonly Regex FindNbsp = new Regex(
@"&nbsp;?",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);
 
/// <summary>
/// Matches everything up to the next literal dollar sign; the text before the
/// dollar sign is exposed as the <c>Content</c> group (original summary: "find the end marker").
/// </summary>
private static readonly Regex FindS =
    new Regex(
        @"(?<Content>[\s\S]*?)\$",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches a single punctuation character from a fixed set (commas, full stops,
/// exclamation and question marks, semicolons, colons, ellipsis, book-title marks and quotes).
/// Used to decide whether a piece of text looks like a normal sentence.
/// </summary>
private static readonly Regex IsSen =
    new Regex(
        @"[,.,。!!;;::……??《》“”""]",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches the literal text "[(h)]". Per the original summary it is used to spot
/// junk sentences that contain too many [strong]/[h] markers.
/// </summary>
private static readonly Regex IsWs =
    new Regex(
        @"\[\(h\)\]",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches one occurrence of a middle dot, a hyphen or a colon. Per the original summary it is
/// used to spot junk sentences that contain too many of these characters.
/// </summary>
/// <remarks>
/// Written as a single character class. The previous pattern escaped the opening bracket of
/// its first alternative, so it matched the literal text "[·]" and never a middle dot alone.
/// </remarks>
// NOTE(review): the original class listed ':' twice, which suggests one of them was a
// full-width colon lost in transcription - confirm and add \uFF1A if so.
private static readonly Regex IsWsM = new Regex(
@"[·\-:]",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);
 
/// <summary>
/// Matches typical forum (BBS) fingerprints: a floor label of the form
/// "第 ... 樓" with 1 to 50 characters in between, or a "Powered by" notice that is
/// later followed by "Dvbbs" or "Discuz". Case is ignored.
/// </summary>
private static readonly Regex IsBbsInfo =
    new Regex(
        @"第[^樓]{1,50}樓|Powered\s*/?by[\s\S]*?Dvbbs|Powered\s*/?by[\s\S]*?Discuz",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Extracts the content attribute of the keywords meta tag into the <c>KeyWords</c> group.
/// </summary>
/// <remarks>
/// Both attribute orders are accepted: name before content, and content before name.
/// The pattern must end right after the final closing angle bracket. Previously the verbatim
/// string was closed on the following line, which appended a literal line break to the
/// content-first alternative, so that form only matched when the tag was followed by that
/// exact line break.
/// </remarks>
private static readonly Regex mKeyWord = new Regex(
@"<meta\s*name\s*=\s*['""]?keywords['""]?\s*content\s*=\s*['""]?(?<KeyWords>[^'"">]*)['""]?[^>]*>|<meta\s*content\s*=\s*['""]?(?<KeyWords>[^'"">]*)['""]?\s*name\s*=\s*['""]?keywords['""]?\s*[^>]*>",
RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
/// <summary>
/// Extracts the content attribute of the description meta tag into the <c>description</c> group.
/// </summary>
/// <remarks>
/// Both attribute orders are accepted: name before content, and content before name.
/// The pattern must end right after the final closing angle bracket. Previously the verbatim
/// string was closed on the following line, which appended a literal line break to the
/// content-first alternative, so that form only matched when the tag was followed by that
/// exact line break.
/// </remarks>
private static readonly Regex mDescription = new Regex(
@"<meta\s*name\s*=\s*['""]?description['""]?\s*content\s*=\s*['""]?(?<description>[^'"">]*)['""]?[^>]*>|<meta\s*content\s*=\s*['""]?(?<description>[^'"">]*)['""]?\s*name\s*=\s*['""]?description['""]?\s*[^>]*>",
RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
/// <summary>
/// Extracts the content attribute of the tagwords meta tag into the <c>tagwords</c> group.
/// </summary>
/// <remarks>
/// Both attribute orders are accepted: name before content, and content before name.
/// The pattern must end right after the final closing angle bracket. Previously the verbatim
/// string was closed on the following line, which appended a literal line break to the
/// content-first alternative, so that form only matched when the tag was followed by that
/// exact line break.
/// </remarks>
private static readonly Regex mTag = new Regex(
@"<meta\s*name\s*=\s*['""]?tagwords['""]?\s*content\s*=\s*['""]?(?<tagwords>[^'"">]*)['""]?[^>]*>|<meta\s*content\s*=\s*['""]?(?<tagwords>[^'"">]*)['""]?\s*name\s*=\s*['""]?tagwords['""]?\s*[^>]*>",
RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
/// <summary>
/// Matches a whole line that consists of at most 8 characters (none of them whitespace
/// or the character "說"), then a colon, then at most 10 further characters.
/// Per the original summary it flags junk lines: too little text after the colon and no
/// "說" before it.
/// </summary>
private static readonly Regex IsWsMM =
    new Regex(
        @"^[^說\s]{0,8}?[::].{0,10}$",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches the URL marker written by the spider ("當前URL為:http://...") and exposes the
/// rest of the line after "http://" as the <c>URL</c> group.
/// </summary>
private static readonly Regex txtUrl =
    new Regex(
        @"當前URL為:http://(?<URL>.*)",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches the anchor-description marker written by the spider ("當前鏈接描述為:...") and
/// exposes the rest of the line as the <c>Describe</c> group.
/// </summary>
private static readonly Regex txtDescription =
    new Regex(
        @"當前鏈接描述為:(?<Describe>.*)",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
///// <summary>
///// 取需要a標簽
///// </summary>
//private static readonly Regex cleanFirst = new Regex(
// @"([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,."");])(?<Robbish1><a\s+[^>]*>)[^<]{1,6}(?<Robbish2></a>)([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,."");])", RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
#endregion

沈陽團購網|營口網站制作|沈陽軟件公司|軟件定制|網站建設|加盟易勢|提交問題

主站蜘蛛池模板: aaaa级少妇高潮大片在线观看| 国内自拍青青草| 亚洲人成7777影视在线观看| 美村妇真湿夹得我好爽| 娇小xxxxx性开放| 久久人人爽人人爽人人片av不| 蜜臀AV在线播放一区二区三区| 无码人妻H动漫中文字幕| 亚洲中文字幕久久精品无码a| 美女扒开尿口让男人捅| 国产欧美一区二区精品久久久 | 国产小呦泬泬99精品| chinese真实露脸hotmilf| 我要看WWW免费看插插视频| 亚洲国产成人资源在线软件 | 国产精品成人无码免费| 99这里只有精品66视频| 日本一道高清不卡免费| 亚洲av中文无码乱人伦在线视色 | 国产成人亚洲午夜电影| GOGO人体大胆全球少妇| 成人免费午夜视频| 久久99精品久久久久久久野外 | 亚洲欧美视频一级| 狠狠精品久久久无码中文字幕 | 男人影院天堂网址| 再深点灬舒服了灬太大了乡村| 色天使色婷婷在线影院亚洲| 国产午夜精品久久久久免费视| 免费观看激色视频网站(性色) | 久久亚洲国产欧洲精品一| 日韩视频一区二区| 亚洲免费观看网站| 粗壮挺进邻居人妻| 国产性生大片免费观看性| 97人妻人人揉人人躁人人| 天堂资源在线官网| 中文字幕在线网站| 晓雪老师下面好紧好湿| 亚洲第一极品精品无码久久| 美女毛片在线观看|