troy.baker

Context Matters: Give Your Customer Pathing Teeth with Text Analysis

Blog Post created by troy.baker on Apr 18, 2017

We know that understanding the customer journey is important, but how can we be sure we are telling that story to the best of our ability? It could be impossible to completely understand everything that affects your customer. BUT! Therein lies nearly unlimited ability to improve how we model the customer experience. Free-form text in customer surveys and reviews, for instance, can be tricky to analyze. Even so, it represents one of the clearest pictures of the customer's perspective attainable. In this example, we will be using customer survey data, topic modeling, and sentiment analysis to create a contextual customer touchpoint.

 

--Expose the raw survey data living in Hadoop through a view.
--load_from_hcatalog is the Aster SQL-H connector: it reads the Hive
--table 'web_surveys' (database 'default') on myserv.teradata.com as
--the 'hive' user.
--NOTE(review): this statement creates a VIEW, not a table; the local
--compressed copy is materialized by the CREATE TABLE that follows.
CREATE VIEW tb.survey_raw
AS
SELECT * FROM load_from_hcatalog (
     ON public.mr_driver
     SERVER ('myserv.teradata.com')
     USERNAME ('hive')
     DBNAME ('default')
     TABLENAME ('web_surveys')
);

--Materialize a local, compressed copy of the survey feed,
--hash-distributed on the survey key, then refresh optimizer statistics.
DROP TABLE IF EXISTS tb.survey;
CREATE TABLE tb.survey
DISTRIBUTE BY HASH(survey_id)
COMPRESS LOW
AS
SELECT survey_id,
       user_id,
       nps,
       survey_text,
       survey_timestamp
FROM tb.survey_raw;
ANALYZE tb.survey;


--Tokenize the free-form survey text with the Aster Text_Parser function.
--Output is one row per (survey_id, token) with its frequency and positions.
DROP TABLE IF EXISTS tb.survey_tp;
CREATE TABLE tb.survey_tp
DISTRIBUTE BY HASH (survey_id)
COMPRESS LOW
AS
SELECT * FROM Text_Parser (
ON tb.survey
Text_Column ('survey_text')            --column to tokenize
Case_Insensitive ('true')              --fold case before counting
Stemming ('true')                      --reduce words to their stems
Total ('true')                         --emit a total frequency per token
Punctuation ('[\\\[.,?\!:;~()\\\]]+')  --characters treated as token breaks
Remove_Stop_Words ('true')             --drop common words (the, and, ...)
Accumulate ('survey_id')               --carry the survey key through
List_Positions ('true')                --keep token positions within the text
)
;
ANALYZE tb.survey_tp;


--Inspect the most frequent tokens.
--Words that occur too frequently carry little signal for topic modeling
--and are usually candidates for removal; the cutoff depends on your own
--use case and data.
SELECT token, SUM(frequency)
FROM tb.survey_tp
GROUP BY token
ORDER BY SUM(frequency) DESC
LIMIT 200;


--Inspect the rarest tokens.
--Words that occur only a handful of times add noise and should also be
--considered for removal.
SELECT token, SUM(frequency)
FROM tb.survey_tp
GROUP BY token
HAVING SUM(frequency) <= 2500
ORDER BY SUM(frequency) ASC
LIMIT 2000;


--Prune the vocabulary: drop tokens seen 2500 times or fewer, plus the
--19 most frequent tokens (stop words were already removed upstream).
--Then reclaim space and refresh statistics.
DELETE FROM tb.survey_tp
WHERE token IN (
     SELECT token
     FROM tb.survey_tp
     GROUP BY token
     HAVING SUM(frequency) <= 2500
     )
OR token IN (
     SELECT token
     FROM (
          SELECT token, SUM(frequency) AS total_freq
          FROM tb.survey_tp
          GROUP BY token
          ORDER BY total_freq DESC
          LIMIT 19) top_tokens
     )
;
VACUUM tb.survey_tp;
ANALYZE tb.survey_tp;


--Train a 20-topic LDA model over the tokenized surveys.
--Observed on the original run:
--  converged after 37 iterations (delta 9.028843880595204E-5)
--  1,274,580 documents / 23,101,982 words, perplexity 317.822466
--  elapsed 03:52.518
--Fix: the DROP must target the same table LDATrainer writes
--(tb.survey_lda_model); the original dropped oap.tb_lda_model, so a
--re-run failed once the model table already existed.
DROP TABLE IF EXISTS tb.survey_lda_model;
SELECT * FROM LDATrainer (
ON (SELECT 1) PARTITION BY 1
InputTable ('tb.survey_tp')
ModelTable ('tb.survey_lda_model')
TopicNumber ('20')
DocIDColumn ('survey_id')
WordColumn ('token')
CountColumn ('frequency')
Seed (3)                   --fixed seed for reproducible topics
);
ANALYZE tb.survey_lda_model;

--Print a summary of the trained model.
--Observed: topicNumber:20, vocabularySize:1155, alpha:0.100000, eta:0.100000
--1,274,580 documents / 23,101,982 words, perplexity 317.822466
SELECT * FROM LDATopicPrinter (
ON tb.survey_lda_model PARTITION by 1
ShowSummary ('true')
);

--Persist the strongest word/topic associations (weight above 0.025) as
--a small replicated lookup table for cheap joins later on.
DROP TABLE IF EXISTS tb.survey_lda_topics;
CREATE TABLE tb.survey_lda_topics
DISTRIBUTE BY REPLICATION
COMPRESS LOW
AS
SELECT *
FROM LDATopicPrinter (
     ON tb.survey_lda_model PARTITION by 1
     ShowWordWeight ('true')
     ShowWordCount ('true')
)
WHERE wordweight > 0.025
ORDER BY topicid, wordweight DESC;


--Score every survey against the trained model; LDAInference writes its
--own output table (tb.survey_lda_inf).
--Observed: 424,487 valid documents / 7,712,740 recognized words,
--perplexity 318.322719.
--NOTE(review): the original run log said "oap"."tb_lda_inf" was
--created; the OutputTable argument below writes tb.survey_lda_inf.
DROP TABLE IF EXISTS tb.survey_lda_inf;
SELECT * FROM LDAInference (
ON (SELECT 1) PARTITION by 1
InputTable ('tb.survey_tp')
ModelTable ('tb.survey_lda_model')
OutputTable ('tb.survey_lda_inf')
DocIDColumn ('survey_id')
WordColumn ('token')
CountColumn ('frequency')
);
ANALYZE tb.survey_lda_inf;

--Spot-check document/topic scores from the inference run.
SELECT * FROM tb.survey_lda_inf LIMIT 500;

--Collapse each topic's high-weight words into a single delimited list
--using nPath: every row matches symbol T, pattern 'T*' consumes the
--whole topicid partition, and ACCUMULATE concatenates the words.
--The tiny result is replicated to every worker for easy joins.
DROP TABLE IF EXISTS tb.survey_lda_acc;
CREATE TABLE tb.survey_lda_acc
DISTRIBUTE BY REPLICATION
COMPRESS LOW
AS
SELECT * FROM NPATH(
     ON tb.survey_lda_topics
     PARTITION BY topicid
     MODE (NONOVERLAPPING)
     PATTERN ('T*')
     SYMBOLS (
          TRUE AS T           --every row qualifies as symbol T
          )
     RESULT (
          FIRST (topicid OF T) AS topicid,
          ACCUMULATE (word OF ANY(T)) AS words
     )
);

--Sometimes even if you include a customer satisfaction score in your
--survey, the actual customer text may tell a different story!
--Join document-level sentiment scores to each survey's single
--strongest LDA topic to infer positive and negative elements of the
--customer's experience.
--Fixes vs. original:
--  * explicit INNER JOIN instead of an implicit comma join;
--  * ROW_NUMBER() instead of RANK(): RANK() assigns 1 to every topic
--    tied at the top weight, which duplicated surveys in the output;
--    topicid breaks ties deterministically.
DROP TABLE IF EXISTS tb.survey_se;
CREATE TABLE tb.survey_se
DISTRIBUTE BY HASH(survey_id)
COMPRESS LOW
AS
SELECT se.*, lda.topicid, lda.topicweight
FROM ExtractSentiment (
ON tb.survey
Text_Column ('survey_text')
Accumulate ('survey_id', 'user_id', 'nps', 'survey_timestamp')
Level ('DOCUMENT')          --one sentiment row per survey
) se
INNER JOIN (
     SELECT docid, topicid, topicweight
          , ROW_NUMBER() OVER (PARTITION BY docid
                               ORDER BY topicweight DESC, topicid)
          topicrank
     FROM tb.survey_lda_inf) lda
ON se.survey_id = lda.docid
AND lda.topicrank = 1;
ANALYZE tb.survey_se;

SELECT * FROM tb.survey_se LIMIT 500;

--What relationship does text sentiment have with NPS?
--Flag surveys where the numeric score and the text sentiment agree
--(both positive / both negative) or conflict.
--Fix: the original CREATE wrote tb_nps_se (missing schema dot) while
--the DROP targeted tb.nps_se and the ANALYZE/SELECT used oap.tb_nps_se
--- three names for one table. Everything now uses tb.nps_se.
DROP TABLE IF EXISTS tb.nps_se;
CREATE TABLE tb.nps_se
DISTRIBUTE BY HASH(survey_id)
COMPRESS LOW
AS
SELECT survey_id
     , nps
     , out_polarity
     --both metrics positive
     , CASE
          WHEN nps > 0 AND out_polarity = 'POS' THEN 1
          ELSE 0
          END both_pos
     --both metrics negative
     , CASE
          WHEN nps < 0 AND out_polarity = 'NEG' THEN 1
          ELSE 0
          END both_neg
     --metrics disagree on direction
     , CASE
          WHEN nps > 0 AND out_polarity = 'NEG' THEN 1
          WHEN nps < 0 AND out_polarity = 'POS' THEN 1
          ELSE 0
          END conflict
FROM tb.survey_se;
ANALYZE tb.nps_se;

SELECT * FROM tb.nps_se LIMIT 200;

--How often do the metrics agree? Neutral measures are not broken out.
--Fix: multiply by 100.0 so the division runs in numeric, not integer,
--arithmetic -- 100 * int / int truncates BEFORE the DECIMAL(4,1) cast,
--so the original always produced a .0 fraction.
SELECT pos_total, (100.0 * pos_total / cnt)::DECIMAL(4,1) pos_pct,
     neg_total, (100.0 * neg_total / cnt)::DECIMAL(4,1) neg_pct,
     con_total, (100.0 * con_total / cnt)::DECIMAL(4,1) con_pct,
     cnt
FROM
(SELECT SUM(both_pos) pos_total, SUM(both_neg) neg_total, SUM(conflict) con_total, COUNT(*) cnt
FROM tb.nps_se) a;

--Does the inferred topic of a document imply a change in sentiment?
--Average NPS and sentiment strength per topic, alongside the topic's
--accumulated word list.
--Fixes: the accumulated-words table is tb.survey_lda_acc (tb.lda_acc
--is never created); the implicit comma join is now explicit; the
--aggregate columns are named; ORDER BY moved to the outer query where
--it actually governs the result.
SELECT a.topicid
     , a.avg_nps
     , a.avg_strength
     , b.words
FROM
(SELECT topicid
     , AVG(nps) AS avg_nps
     , AVG(out_strength) AS avg_strength
FROM tb.survey_se
GROUP BY topicid) a
INNER JOIN tb.survey_lda_acc b
ON a.topicid = b.topicid
ORDER BY a.topicid;


--Finally, build the contextual touchpoint table: one row per survey
--carrying its inferred topic and sentiment polarity, distributed by
--user_id so it can feed nPath customer-journey logic directly.
DROP TABLE IF EXISTS tb.survey_events;
CREATE TABLE tb.survey_events
DISTRIBUTE BY HASH(user_id)
COMPRESS LOW
AS
SELECT survey_id,
       user_id,
       topicid,
       survey_text,
       out_polarity AS sen_polarity,
       survey_timestamp
FROM tb.survey_se;

Outcomes