mirror of
https://github.com/discourse/discourse.git
synced 2025-04-03 05:39:41 +08:00
Zoho importer: cleanup post content
This commit is contained in:
parent
75eb3ae664
commit
8aa02dabc4
@ -127,8 +127,8 @@ class ImportScripts::Zoho < ImportScripts::Base
|
|||||||
id: import_post_id(row),
|
id: import_post_id(row),
|
||||||
user_id: user_id,
|
user_id: user_id,
|
||||||
category: category_id,
|
category: category_id,
|
||||||
title: row.topic_title,
|
title: CGI.unescapeHTML(row.topic_title),
|
||||||
raw: row.content,
|
raw: cleanup_post(row.content),
|
||||||
created_at: Time.zone.parse(row.posted_time)
|
created_at: Time.zone.parse(row.posted_time)
|
||||||
}
|
}
|
||||||
# created_post callback will be called
|
# created_post callback will be called
|
||||||
@ -136,7 +136,7 @@ class ImportScripts::Zoho < ImportScripts::Base
|
|||||||
{
|
{
|
||||||
id: import_post_id(row),
|
id: import_post_id(row),
|
||||||
user_id: user_id,
|
user_id: user_id,
|
||||||
raw: row.content,
|
raw: cleanup_post(row.content),
|
||||||
created_at: Time.zone.parse(row.posted_time),
|
created_at: Time.zone.parse(row.posted_time),
|
||||||
topic_id: @topic_mapping[row.permalink]
|
topic_id: @topic_mapping[row.permalink]
|
||||||
}
|
}
|
||||||
@ -155,6 +155,47 @@ class ImportScripts::Zoho < ImportScripts::Base
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Note that Zoho doesn't render code blocks the same way all the time,
|
||||||
|
# but this seems to catch the most common format:
|
||||||
|
ZOHO_CODE_BLOCK_START = /<ol style="list-style-position: outside;(.)*">/
|
||||||
|
|
||||||
|
TOO_MANY_LINE_BREAKS = /[\n ]{3,}/
|
||||||
|
STYLE_ATTR = /(\s)*style="(.)*"/
|
||||||
|
|
||||||
|
def cleanup_post(raw)
|
||||||
|
|
||||||
|
# Check if Zoho's most common form of a code block is present.
|
||||||
|
# If so, don't clean up the post as much because we can't tell which markup
|
||||||
|
# is inside the code block. These posts will look worse than others.
|
||||||
|
has_code_block = !!(raw =~ ZOHO_CODE_BLOCK_START)
|
||||||
|
|
||||||
|
x = raw.gsub(STYLE_ATTR, '')
|
||||||
|
|
||||||
|
if has_code_block
|
||||||
|
# We have to assume all lists in this post are meant to be code blocks
|
||||||
|
# to make it somewhat readable.
|
||||||
|
x.gsub!(/( )*<ol>(\s)*/, "")
|
||||||
|
x.gsub!(/( )*<\/ol>/, "")
|
||||||
|
x.gsub!('<li>', '')
|
||||||
|
x.gsub!('</li>', '')
|
||||||
|
else
|
||||||
|
# No code block (probably...) so clean up more aggressively.
|
||||||
|
x.gsub!("\n", " ")
|
||||||
|
x.gsub!('<div>', "\n\n")
|
||||||
|
x.gsub('</div>', ' ')
|
||||||
|
x.gsub!("<br />", "\n")
|
||||||
|
x.gsub!('<span>', '')
|
||||||
|
x.gsub!('</span>', '')
|
||||||
|
x.gsub!(/<font ([^>]*)>/, '')
|
||||||
|
x.gsub!('</font>', '')
|
||||||
|
end
|
||||||
|
|
||||||
|
x.gsub!(TOO_MANY_LINE_BREAKS, "\n\n")
|
||||||
|
|
||||||
|
CGI.unescapeHTML(x)
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
def import_post_id(row)
|
def import_post_id(row)
|
||||||
# Try to make up a unique id based on the data Zoho gives us.
|
# Try to make up a unique id based on the data Zoho gives us.
|
||||||
# The posted_time seems to be the same for all posts in a topic, so we can't use that.
|
# The posted_time seems to be the same for all posts in a topic, so we can't use that.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user