注冊|登錄

聯系電話:024-31891684  13390130939
沈陽軟件公司--沈陽軟件定制

沈陽軟件開發_沈陽軟件公司_沈陽軟件定制/軟件/最新技術

Latest technology最新技術

正文提取中用到的正則表達式

瀏覽量:2873

#region 相關正則表達式

 
/// <summary>
/// Matches markup and noise fragments in page text (the original summary reads "remove all html tags").
/// </summary>
/// <remarks>
/// The pattern is one alternation of the following pieces:
/// a BBCode-like <c>[tag=...]...[/...]</c> span (unnamed);
/// <c>lj</c>: an <c>&lt;a ...&gt;text&lt;/a&gt;</c> anchor whose text is at least two characters,
/// wrapped in lookaheads for a character outside the CJK / full-width / listed punctuation set;
/// <c>Style</c>, <c>select</c>, <c>Script</c>: whole elements including their content;
/// <c>Explein</c>: HTML comments; <c>li</c>: whole list items;
/// <c>Html</c>: any other single tag; <c>Other</c>: named entities such as <c>&amp;amp;</c>;
/// <c>Other2</c>: <c>#</c> followed by six letters or digits; <c>Space</c>: whitespace runs;
/// and numeric entities such as <c>&amp;#160;</c> (unnamed).
/// NOTE(review): with <c>RegexOptions.ExplicitCapture</c> unnamed parentheses do not capture, so
/// <c>\1</c> in the first alternative presumably resolves to the first named group (<c>lj</c>)
/// rather than to the BBCode tag name; if so that alternative can never match. Confirm.
/// NOTE(review): the leading lookahead in <c>lj</c> sits directly before <c>&lt;a</c>, so it looks
/// always satisfied; a lookbehind may have been intended. Confirm.
/// </remarks>
private static readonly Regex FilterAll = new Regex(
@"(\[([^=]*)(=[^\]]*)?\][\s\S]*?\[/\1\])|(?<lj>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");])<a\s+[^>]*>[^<]{2,}</a>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");]))|(?<Style><style[\s\S]+?/style>)|(?<select><select[\s\S]+?/select>)|(?<Script><script[\s\S]*?/script>)|(?<Explein><\!\-\-[\s\S]*?\-\->)|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)|(?<Html></?\s*[^> ]+(\s*[^=>]+?=['""]?[^""']+?['""]?)*?[^\[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>\#[a-z0-9]{6})|(?<Space>\s+)|(\&\#\d+\;)",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase); //(?<Link><a[\s\S]*?</a>)|
//(?<Style><style[\s\S]+?/style>)|(?<select><select[\s\S]+?/select>)|(?<Script><script[\s\S]*?/script>)|(?<Explein><\!\-\-[\s\S]*?\-\->)|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)|(?<Html></?\s*[^> ]+(\s*[^=>]+?=['""]?[^""']+?['""]?)*?[^\[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>\#[a-z0-9]{6})|(?<Space>\s+)
 
/// <summary>
/// Matches an attribute-less opening or closing <c>&lt;title&gt;</c> tag, ignoring case
/// and tolerating whitespace inside the angle brackets.
/// </summary>
private static readonly Regex FindTitle =
    new Regex(
        @"<\s*/?title\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches a pair of <c>&lt;title&gt;</c> tags and exposes the text between them
/// (shortest possible span) as the <c>Content</c> group.
/// </summary>
private static readonly Regex FindTitleContent =
    new Regex(
        @"<\s*/?title\s*>(?<Content>[\s\S]*?)<\s*/?title\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches attribute-less opening or closing heading tags (<c>&lt;h1&gt;</c>..<c>&lt;h6&gt;</c>,
/// plus the bare <c>&lt;h&gt;</c> form the previous pattern accepted) and <c>&lt;strong&gt;</c> tags.
/// </summary>
/// <remarks>
/// The previous pattern only accepted <c>&lt;h&gt;</c>, which is not an HTML element, so real
/// headings were never found. The level digit is optional to stay backward compatible.
/// </remarks>
private static readonly Regex FindHStrong = new Regex(
    @"<\s*/?h[1-6]?\s*>|<\s*/?strong\s*>",
    RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
/// <summary>
/// Matches attribute-less <c>&lt;p&gt;</c> and <c>&lt;tr&gt;</c> tags (opening or closing)
/// and <c>&lt;br&gt;</c> / <c>&lt;br/&gt;</c> tags, ignoring case.
/// </summary>
private static readonly Regex FindPB =
    new Regex(
        @"<\s*/?p\s*>|<\s*br\s*/?>|<\s*/?tr\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches the text <c>&amp;nbsp</c> (case-insensitive). The trailing semicolon of the
/// entity is not part of the match.
/// </summary>
private static readonly Regex FindNbsp =
    new Regex(
        @"&nbsp",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches everything up to and including the next literal <c>$</c> character;
/// the text before the <c>$</c> is exposed as the <c>Content</c> group.
/// </summary>
private static readonly Regex FindS =
    new Regex(
        @"(?<Content>[\s\S]*?)\$",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches a single sentence-punctuation character, used to tell whether a piece of
/// text looks like a normal sentence.
/// </summary>
/// <remarks>
/// Covers ASCII <c>, . ! ; : ? "</c>, the CJK full stop, ellipsis, book-title marks and
/// curly quotes, and the full-width forms of comma, exclamation mark, semicolon, colon and
/// question mark. The previous class listed each of those ASCII characters twice, which
/// indicates the full-width member of each pair had been lost, so full-width punctuation in
/// Chinese text was not recognised. The full-width forms are written as <c>\uXXXX</c>
/// escapes so they survive width normalisation of the source file.
/// </remarks>
private static readonly Regex IsSen = new Regex(
    @"[,.\uFF0C。!\uFF01;\uFF1B:\uFF1A…?\uFF1F《》“”""]",
    RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
/// <summary>
/// Matches the literal marker text <c>[(h)]</c> (case-insensitive). The original summary
/// describes it as detecting junk sentences with too many [strong]/[h] markers.
/// </summary>
/// <remarks>
/// NOTE(review): the parentheses are escaped, so the literal text "[(h)]" is matched, not
/// "[h]" or "[strong]" — confirm this is the marker format actually written upstream.
/// </remarks>
private static readonly Regex IsWs =
    new Regex(
        @"\[\(h\)\]",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches a single middle dot (<c>·</c>), hyphen, or colon (ASCII or full-width), so callers
/// can count how many such separators a piece of text contains.
/// </summary>
/// <remarks>
/// The previous pattern began with an escaped bracket (<c>\[·]</c>), so its first alternative
/// matched the three-character literal "[·]" instead of a middle dot, and its colon class held
/// the ASCII colon twice where the full-width colon evidently belonged. All separators now
/// live in one character class; the full-width colon is written as <c>\uFF1A</c>.
/// </remarks>
private static readonly Regex IsWsM = new Regex(
    @"[·\-:\uFF1A]",
    RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
/// <summary>
/// Matches forum (BBS) fingerprints: a "第…樓" floor marker with 1 to 50 characters between
/// the two ideographs, or a "Powered by … Dvbbs" / "Powered by … Discuz" footer.
/// </summary>
private static readonly Regex IsBbsInfo =
    new Regex(
        @"第[^樓]{1,50}樓|Powered\s*/?by[\s\S]*?Dvbbs|Powered\s*/?by[\s\S]*?Discuz",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 
/// <summary>
/// Matches a <c>&lt;meta name="keywords" content="..."&gt;</c> tag, with the <c>name</c> and
/// <c>content</c> attributes in either order, and exposes the content value as the
/// <c>KeyWords</c> group.
/// </summary>
/// <remarks>
/// The previous verbatim string ended with a line break before the closing quote. That line
/// break was part of the second (content-before-name) alternative only, so such tags matched
/// only when immediately followed by that exact newline sequence. It has been removed.
/// </remarks>
private static readonly Regex mKeyWord = new Regex(
    @"<meta\s*name\s*=\s*['""]?keywords['""]?\s*content\s*=\s*['""]?(?<KeyWords>[^'"">]*)['""]?[^>]*>|<meta\s*content\s*=\s*['""]?(?<KeyWords>[^'"">]*)['""]?\s*name\s*=\s*['""]?keywords['""]?\s*[^>]*>",
    RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
/// <summary>
/// Matches a <c>&lt;meta name="description" content="..."&gt;</c> tag, with the <c>name</c>
/// and <c>content</c> attributes in either order, and exposes the content value as the
/// <c>description</c> group.
/// </summary>
/// <remarks>
/// The previous verbatim string ended with a line break before the closing quote. That line
/// break was part of the second (content-before-name) alternative only, so such tags matched
/// only when immediately followed by that exact newline sequence. It has been removed.
/// </remarks>
private static readonly Regex mDescription = new Regex(
    @"<meta\s*name\s*=\s*['""]?description['""]?\s*content\s*=\s*['""]?(?<description>[^'"">]*)['""]?[^>]*>|<meta\s*content\s*=\s*['""]?(?<description>[^'"">]*)['""]?\s*name\s*=\s*['""]?description['""]?\s*[^>]*>",
    RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
/// <summary>
/// Matches a <c>&lt;meta name="tagwords" content="..."&gt;</c> tag, with the <c>name</c> and
/// <c>content</c> attributes in either order, and exposes the content value as the
/// <c>tagwords</c> group.
/// </summary>
/// <remarks>
/// The previous verbatim string ended with a line break before the closing quote. That line
/// break was part of the second (content-before-name) alternative only, so such tags matched
/// only when immediately followed by that exact newline sequence. It has been removed.
/// </remarks>
private static readonly Regex mTag = new Regex(
    @"<meta\s*name\s*=\s*['""]?tagwords['""]?\s*content\s*=\s*['""]?(?<tagwords>[^'"">]*)['""]?[^>]*>|<meta\s*content\s*=\s*['""]?(?<tagwords>[^'"">]*)['""]?\s*name\s*=\s*['""]?tagwords['""]?\s*[^>]*>",
    RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
/// <summary>
/// Matches a whole line that looks like a short "label: value" fragment: at most eight
/// non-whitespace characters (none of them 說) before a colon, and at most ten characters
/// after it.
/// </summary>
/// <remarks>
/// The colon may be ASCII or full-width. The previous class held the ASCII colon twice, which
/// indicates the full-width colon had been lost, so lines using the usual Chinese colon were
/// not detected. It is written as <c>\uFF1A</c> so it survives width normalisation of the
/// source file.
/// </remarks>
private static readonly Regex IsWsMM = new Regex(
    @"^[^說\s]{0,8}?[:\uFF1A].{0,10}$",
    RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
/// <summary>
/// Matches the "當前URL為:http://" marker line written by the spider and exposes the rest
/// of that line (the URL without its scheme) as the <c>URL</c> group.
/// </summary>
/// <remarks>
/// The capture stops at the first carriage return or line feed. The previous <c>.*</c>
/// stopped only at a line feed, so on CRLF text the captured URL ended with a stray
/// carriage return.
/// </remarks>
private static readonly Regex txtUrl = new Regex(
    @"當前URL為:http://(?<URL>[^\r\n]*)",
    RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
/// <summary>
/// Matches the "當前鏈接描述為:" marker line written by the spider and exposes the rest of
/// that line (the anchor description) as the <c>Describe</c> group.
/// </summary>
/// <remarks>
/// The capture stops at the first carriage return or line feed. The previous <c>.*</c>
/// stopped only at a line feed, so on CRLF text the captured description ended with a
/// stray carriage return.
/// </remarks>
private static readonly Regex txtDescription = new Regex(
    @"當前鏈接描述為:(?<Describe>[^\r\n]*)",
    RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
///// <summary>
///// 取需要a標簽
///// </summary>
//private static readonly Regex cleanFirst = new Regex(
// @"([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,."");])(?<Robbish1><a\s+[^>]*>)[^<]{1,6}(?<Robbish2></a>)([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,."");])", RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 
#endregion
 

CRM定制 辦公OA找沈陽易勢科技有限公司

沈陽團購網|營口網站制作|沈陽軟件公司|軟件定制|網站建設|加盟易勢|提交問題

主站蜘蛛池模板: 2018天天干夜夜操| 中文字幕水野优香在线网在线| 男人天堂2023| 国产AV无码专区亚洲AV麻豆| 国产情侣一区二区| 国产美女91视频| segui久久综合精品| 精品人妻中文字幕有码在线| 国产女人高潮叫床视频| 2019天天干| 在线观看国产小视频| 一本一道久久a久久精品综合| 日本三级韩国三级欧美三级 | 制服丝袜自拍偷拍| 天天做天天躁天天躁| 亚洲av无码欧洲av无码网站| 污污的软件下载| 免费一级毛片在线视频观看| 美女被免费视频网站a| 国产美女口爆吞精普通话| www.尤物在线| 成人乱码一区二区三区AV| 久久久99视频| 欧美成人免费tv在线播放| 人妻av综合天堂一区| 精品一区二区视频在线观看| 啊~用力cao我cao烂我小婷| 超污视频在线观看| 国产在线精品网址你懂的| 国产亚洲欧美在在线人成| 国产精品久久久久久亚洲影视| 99久久久国产精品免费牛牛四川| 女人是男人的未来1分29分| 一级毛片aaaaaa视频免费看| 无码免费一区二区三区免费播放| 久久夜色精品国产嚕嚕亚洲av| 日韩精品无码一本二本三本| 亚洲视频天天射| 青草热在线精品视频99app| 国产成人无码精品一区在线观看| 五月激情综合网|