Hej, jag tror att jag i princip har löst problemet nu (se koden nedan), men jag vore tacksam för tips om någon känner för att kommentera koden (i vilka sammanhang fungerar den t.ex. inte, osv.).
Hitta taggar som inte matchas (ex. < /b > utan tidigare < b >) m.h.a. regexp
Jag har sökt en del på regexp, orphans, html-tags, horungar etc. både här på forumet och på nätet, men hittar ingen bra info om hur man löser följande problem:
Jag har en text som jag rensar från oönskade html-taggar. Problemet är att koden jag använder lämnar "sluttaggarna" kvar (ev. kan det också bli så att en starttagg inte matchas av en motsvarande sluttagg).
T.ex. kan texten, när den har rensats, innehålla </strong> eller </em> utan matchande starttagg.
Likaså vill jag egentligen kunna rensa bort all kod som står mellan <script>-taggarna också; som funktionen ser ut just nu lämnar den kvar själva kodtexten.
Funktionen ser f.n. ut så här:
' borttagen funktion - se nästa inlägg...
Sv: Hitta taggar som inte matchas (ex. < /b > utan tidigare < b >) m.h.a. regexp
MVH / Anette
' ****************************************************************************************
' Function wordTidy(doc)
' Description: cleans unwanted HTML out of text, e.g. text pasted into an editor from
'              Word. Use this before saving text created by the RTE editor to the database.
'
' Parameters:  doc - the HTML/text to clean. The caller's variable is NOT modified:
'                    VBScript passes arguments ByRef by default, so all work is done
'                    on a local copy.
' Returns:     the cleaned string. A Null input returns "".
'
' What is removed:
'   - &nbsp; entities, tabs, repeated spaces and repeated line breaks
'   - <html>, <head>, <title>, <meta>, <body>, <iframe>, <div>, <span> tags
'     (only the tags themselves, their content is kept)
'   - <script>, <style> and <form> elements INCLUDING their content
'   - class, style, lang, size and id attributes
'   - <?xml ...> declarations and namespaced tags such as <o:p>
'   - elements that contain nothing but whitespace, e.g. <b></b>
'
' What is NOT done: unmatched start/end tags (a stray </b> with no <b>) are left alone.
'
' Sample:
'   Dim myString, newString
'   newString = wordTidy(myString)
' ****************************************************************************************
Function wordTidy(doc)
    Dim rx, html, tagName, attrName

    ' A Null (e.g. an empty database field) cannot be processed; return an empty string.
    If IsNull(doc) Then
        wordTidy = ""
        Exit Function
    End If

    ' Work on a copy so the caller's variable is left untouched.
    html = CStr(doc)

    Set rx = New RegExp
    rx.Global = True
    rx.IgnoreCase = True

    ' ******************************************
    ' First clean up the whitespace a bit...

    ' Replace non-breaking spaces (entity or raw character) with a normal space
    rx.Pattern = "&nbsp;|\xA0"
    html = rx.Replace(html, " ")

    ' Replace runs of spaces with a single space
    rx.Pattern = " {2,}"
    html = rx.Replace(html, " ")

    ' Remove tabs
    rx.Pattern = "\t+"
    html = rx.Replace(html, "")

    ' Replace multiple linefeeds with a single vbCrLf
    rx.Pattern = "[\n\r]+"
    html = rx.Replace(html, vbCrLf)

    ' Remove whitespace from the beginning and the end.
    ' The replacement must be an empty string: "^" and "$" are only special in
    ' the pattern, as a replacement they would be inserted literally.
    rx.Pattern = "^\s+|\s+$"
    html = rx.Replace(html, "")

    ' Remove whitespace after vbCrLf
    rx.Pattern = "\r\n[ \t\v]+"
    html = rx.Replace(html, vbCrLf)

    ' Remove whitespace before vbCrLf
    rx.Pattern = "[ \t\v]+\r\n"
    html = rx.Replace(html, vbCrLf)

    ' Remove spaces in front of a tag.
    ' NOTE(review): this also removes the space between a word and a following
    ' inline tag ("some <b>bold</b>" becomes "some<b>bold</b>"). Kept as in the
    ' original; confirm that this is really wanted.
    rx.Pattern = " +<"
    html = rx.Replace(html, "<")

    ' *************************************
    ' then remove html-page tags, if any (the content between them is kept).
    ' \b makes sure only the exact tag name matches, so <header> is not treated
    ' as <head>.
    rx.Pattern = "<\/?(?:html|head|title|meta|body|iframe|div|span)\b[^>]*>"
    html = rx.Replace(html, "")

    ' Remove SCRIPT, STYLE and FORM elements together with everything inside them
    For Each tagName In Array("script", "style", "form")
        rx.Pattern = "<" & tagName & "\b[^>]*>[\s\S]*?</" & tagName & "[^>]*>"
        html = rx.Replace(html, "")
    Next

    ' *******************************************
    ' remove attributes.
    ' The value may be "double quoted", 'single quoted' or unquoted; quoted
    ' values may contain spaces (class="a b").
    For Each attrName In Array("class", "style", "lang", "size", "id")
        rx.Pattern = "<(\w[^>]*?)\s+" & attrName & _
                     "\s*=\s*(?:""[^""]*""|'[^']*'|[^\s>]*)([^>]*)"
        html = rx.Replace(html, "<$1$2")
    Next

    ' ****************************************************
    ' take care of XML...

    ' Remove XML declarations
    rx.Pattern = "<\\?\?xml[^>]*>"
    html = rx.Replace(html, "")

    ' Remove tags with XML namespace prefixes: <o:p></o:p>
    rx.Pattern = "<\/?\w+:[^>]*>"
    html = rx.Replace(html, "")

    ' ****************************************************
    ' Transform <P> to <DIV> - inactivated
    ' rx.Pattern = "(<P)([^>]*>.*?)(<\/P>)"
    ' html = rx.Replace(html, "<div$2</div>")

    ' ****************************************************
    ' remove header-tags // inactivated so far
    ' rx.Pattern = "<\/?h[1-6][^>]*>"
    ' html = rx.Replace(html, "")

    ' ****************************************************
    ' Clean up and remove empty elements etc...

    ' Replace xml-style end of tag ("<br />" becomes "<br>")
    rx.Pattern = "\s*/>"
    html = rx.Replace(html, ">")

    ' Remove elements that contain only whitespace. The closing tag must have
    ' the same name as the opening tag (\1), so "<br></p>" is left alone.
    ' Removing an inner empty element can make the outer one empty
    ' ("<b><i></i></b>"), so repeat until nothing more matches. Every
    ' replacement shortens the string, so the loop always ends.
    rx.Pattern = "<(\w+)\b[^>]*>\s*</\1\s*>"
    Do While rx.Test(html)
        html = rx.Replace(html, "")
    Loop

    ' Replace multiple linefeeds left behind by the removals above
    rx.Pattern = "[\n\r]+"
    html = rx.Replace(html, vbCrLf)

    ' Removing tags may have left whitespace at the beginning or the end
    rx.Pattern = "^\s+|\s+$"
    html = rx.Replace(html, "")

    Set rx = Nothing
    wordTidy = html
End Function