<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href="http://www.blogger.com/styles/atom.css" type="text/css"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' xmlns:georss='http://www.georss.org/georss' xmlns:gd='http://schemas.google.com/g/2005' xmlns:thr='http://purl.org/syndication/thread/1.0'><id>tag:blogger.com,1999:blog-3227782676487079407</id><updated>2012-02-27T20:22:02.312-08:00</updated><category term='RE'/><category term='Python'/><category term='Talend'/><category term='Geo'/><category term='Analytics'/><category term='SQL'/><category term='ETL'/><category term='Calpont'/><category term='NS'/><category term='Google SEM SEO Yahoo MSN'/><category term='Date'/><category term='Keywords'/><category term='SQLServer'/><category term='Windows'/><category term='Report'/><category term='Thrift'/><category term='Log'/><category term='Toad'/><category term='Non-Profit'/><category term='Eureqa'/><category term='Weblog'/><category term='Hive'/><category term='Code'/><category term='Access'/><category term='File'/><category term='SEM'/><category term='Creative'/><category term='Index'/><category term='Variables'/><category term='Hadoop'/><category term='Derby'/><category term='Case'/><category term='Adwords'/><category term='IP'/><category term='Data Warehouse'/><category term='Campaign'/><category term='Yahoo'/><category term='Docs'/><category term='PPC'/><category term='MySQL'/><category term='DW'/><category term='Data pull'/><category term='Keyword'/><category term='Bid'/><category term='Share'/><category term='RegEx'/><category term='Perl'/><category term='Java'/><category term='Fun'/><category term='InfiniDB'/><category term='API'/><category term='Google'/><category term='SEM Yahoo'/><category term='Tool'/><category term='C#'/><category term='Pig'/><category term='SEO'/><category term='Collation'/><category term='Character Set'/><category term='Ruby'/><category term='Database'/><category term='Predictive Analysis'/><category term='Linux'/><category term='Timezone'/><category term='Ubuntu'/><category term='Herfindahl'/><category term='Regular Expression'/><category term='Excel'/><title type='text'>Search, Data and Technology</title><subtitle type='html'>Search, BI, Data Warehouse, Languages, and Tech Musings</subtitle><link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='http://www.hiregion.com/feeds/posts/default'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/'/><link rel='hub' href='http://pubsubhubbub.appspot.com/'/><link rel='next' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default?start-index=26&amp;max-results=25'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><generator version='7.00' uri='http://www.blogger.com'>Blogger</generator><openSearch:totalResults>53</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-8733709060923543880</id><published>2011-12-02T18:57:00.001-08:00</published><updated>2011-12-03T00:17:25.295-08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Perl'/><category scheme='http://www.blogger.com/atom/ns#' term='RegEx'/><category scheme='http://www.blogger.com/atom/ns#' term='RE'/><title type='text'>Life with Regular Expressions!</title><content type='html'>Damn this is good!&lt;br /&gt;(Credit: &amp;nbsp;xkcd.com)&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;img border="0" height="640" src="http://4.bp.blogspot.com/-qXBxO-0vMPE/TtmQ21-dLPI/AAAAAAAACs4/dicntXQcxq4/s640/xkcd_regular_expressions.png" width="632" /&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-8733709060923543880?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/8733709060923543880/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2011/12/life-with-regular-expressions.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/8733709060923543880'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/8733709060923543880'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2011/12/life-with-regular-expressions.html' title='Life with Regular Expressions!'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/-qXBxO-0vMPE/TtmQ21-dLPI/AAAAAAAACs4/dicntXQcxq4/s72-c/xkcd_regular_expressions.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-4433915415009860518</id><published>2011-11-14T23:15:00.000-08:00</published><updated>2011-12-03T00:13:58.587-08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='MySQL'/><category scheme='http://www.blogger.com/atom/ns#' term='Date'/><category scheme='http://www.blogger.com/atom/ns#' term='Timezone'/><title type='text'>TImezone conversion for hourly report</title><content type='html'>We recently built a real time hourly report where data is streamed in and summarized through multiple jobs and pushed into reporting tables. &amp;nbsp;The business users wanted the reports as quickly as possible and first phase of the project needed to be implemented in short time. &amp;nbsp;With very limited time from concept to design and implementation, I utilized 3rd party web charting service (www.chart.io) and pushed the highly summarized data to set of tables.&lt;br /&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;During a final reporting, I needed to convert date and hour in UTC to PST. &amp;nbsp;The table had date (date type) and hour (tinyint) columns. &amp;nbsp;Chart.io uses simple x, y chart (line, bar, etc.) and takes any co-ordinates passed for them and displays them. &amp;nbsp;See below for one of final chart.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-pDqLOftwdBY/TtnQGAglNUI/AAAAAAAACtA/wbqUla_V4KU/s1600/chartio_2011.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="248" src="http://3.bp.blogspot.com/-pDqLOftwdBY/TtnQGAglNUI/AAAAAAAACtA/wbqUla_V4KU/s640/chartio_2011.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;X axis is date and hour with format 'YYYY-MM-DD HH'. &amp;nbsp;The following SQL converted the date and hour from corresponding 2 columns with format YYYY-MM-DD and HH respectively in UTC timezone to PST.&amp;nbsp;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;SELECT substr(convert_tz(concat(date, ' ', if(length(hour) = 1, concat('0',hour), hour)), 'GMT', 'America/Los_Angeles'), 1, 13) as x,&lt;/div&gt;&lt;div&gt;...&lt;/div&gt;&lt;div&gt;FROM &lt;br /&gt;&lt;div&gt;WHERE ...&lt;/div&gt;&lt;div&gt;GROUP BY x...&lt;/div&gt;&lt;div&gt;ORDER BY x ASC&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;Since maintaining the order is important. For example '2011-11-11 02' hour is right after '2011-11-11 01' hour and before '2011-11-11 03', single digit hours (1 through 9) are concatenated with "0'. &amp;nbsp;Then convert_tz converted the GMT timezone to PST (America/Los_Angeles) time. &amp;nbsp;There was suggestion to simply use -8.00 in hardcoded timezone difference but will result in incorrect hour during the months day light saving time (summer). &amp;nbsp; Since data is stored for each hour in tables over many months and years, by having the above technique one doesn't need to worry about this issue. &amp;nbsp;The system will automatically and correctly offset the hours in the report. &amp;nbsp;Handling timezone conversion is a not a trivial one and see &lt;a href="https://en.wikipedia.org/wiki/Time_zone"&gt;timezone wiki&lt;/a&gt; for different timezones around the world. &amp;nbsp;For timezone conversion to work in MySQL, one should load the appropriate data into tables &lt;a href="http://www.hiregion.com/2011/07/loading-timezones-into-mysql-or.html"&gt;like this&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;The reports/charts generated were very useful to business user and helped them take some quick decisive decisions and help build better user experience and improve sales and rev!&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;table&gt;&lt;/table&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-4433915415009860518?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/4433915415009860518/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2011/11/timezone-conversion-for-hourly-report.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/4433915415009860518'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/4433915415009860518'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2011/11/timezone-conversion-for-hourly-report.html' title='TImezone conversion for hourly report'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/-pDqLOftwdBY/TtnQGAglNUI/AAAAAAAACtA/wbqUla_V4KU/s72-c/chartio_2011.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-3694843997504332689</id><published>2011-11-03T19:25:00.000-07:00</published><updated>2011-12-02T20:33:36.835-08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='MySQL'/><category scheme='http://www.blogger.com/atom/ns#' term='Date'/><title type='text'>Converting 12 hour time Unix timestamp</title><content type='html'>Many a times I need to transform time in AM/PM format to unix timestamp for better performance. &amp;nbsp;The external data coming in has date and time in two separate columns like '2011-01-01' as date and '11:59:59 PM' as time. &amp;nbsp; And some other time, I also need to convert timezones typically from a UTC to PST and to do that check this out.&lt;br /&gt;&lt;br /&gt;For example, converting "2011-01-01 11:59:59 PM" into corresponding unix timestamp results in an integer&amp;nbsp;1293926399.&lt;br /&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;# Create a temp table...&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;create table tmp.test_dt (&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;&amp;nbsp; d date,&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;&amp;nbsp; t varchar(20)&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;)engine=InnoDB&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;# Insert a value to test...&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;insert into tmp.test_dt&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;(d, t) values&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;('2011-01-01', '11:59:59 PM')&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;# Run the conversion sql...&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;SELECT&amp;nbsp;&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;&amp;nbsp; &amp;nbsp;unix_timestamp&lt;span class="Apple-style-span" style="background-color: white; color: blue;"&gt;(&lt;/span&gt;concat&lt;span class="Apple-style-span" style="color: red;"&gt;(&lt;/span&gt;cast(d as char),&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;&amp;nbsp;' ',&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;&amp;nbsp;cast(time_format(str_to_date(t,'%r'),'%T') as char)&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="color: red; font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;&amp;nbsp;)&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; color: blue; font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;)&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;FROM tmp.test_dt&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;# Output: 1293926399&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;# To test&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;SELECT from_unixtime(1293926399)&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;"&gt;# Output: "2011-01-01 23:59:59"&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif; font-size: x-small;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: Times, 'Times New Roman', serif;"&gt;In the conversion select statement, time_format converts the time (t column) from 11:59:59 PM to 23:59:59 of datetime type which is then cast as char to concatenate with date. &amp;nbsp;After concat the string looks like "2011-01-01 23:59:59" which becomes input to unix_timestamp.&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: Times, 'Times New Roman', serif;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-3694843997504332689?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/3694843997504332689/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2011/11/converting-12-hour-time-unix-timestamp.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/3694843997504332689'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/3694843997504332689'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2011/11/converting-12-hour-time-unix-timestamp.html' title='Converting 12 hour time Unix timestamp'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-4889348728715908597</id><published>2011-07-28T18:24:00.000-07:00</published><updated>2011-07-28T18:48:08.503-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='MySQL'/><category scheme='http://www.blogger.com/atom/ns#' term='Timezone'/><category scheme='http://www.blogger.com/atom/ns#' term='Calpont'/><category scheme='http://www.blogger.com/atom/ns#' term='InfiniDB'/><title type='text'>Loading timezones into MySQL or InfiniDB</title><content type='html'>If you are running queries or reports that utilize mysql timezone convertion function (CONVERT_TZ) make sure that you have run mysql utility (mysql_tzinfo_to_sql) that loads timezone information from the system into mysql server first.  Without having loaded the timezone information one will simply get NULL return value and no message or error.  This is expected behavior because the required tables for MySQL already exist but they are not populated!  &lt;br /&gt;&lt;br /&gt;In many threads and forums I have seen developers asking question on how to convert one timezone to another while all they get is null return value.&amp;nbsp; I myself have bitten by it couple of times when I moved to new MySQL servers where the following steps were not done.&lt;br /&gt;&lt;br /&gt;Here I am using Ubuntu which has timezone files under /usr/share/zoneinfo directory.  In many flavors of unix including Mac OS you will find it there, if not search for it with find command ;)&lt;br /&gt;&lt;br /&gt;Also, note you can load all timezones of the world in one go.  The SQL created by this utility is large with more than 142K lines of SQL code!  After completing the task the following tables must have been populated.&lt;br /&gt;&lt;ol&gt;&lt;li&gt;mysql.time_zone&lt;/li&gt;&lt;li&gt;mysql.time_zone_name&lt;/li&gt;&lt;li&gt;mysql.time_zone_transition&lt;/li&gt;&lt;li&gt;mysql.time_zone_transition_type&lt;/li&gt;&lt;/ol&gt;There is another related table mysql.time_zone_leap_second which we don't need at this point. &lt;br /&gt;&lt;pre class="php" name="code"&gt;# To load all timezone&lt;br /&gt;shell&amp;gt; mysql_tzinfo_to_sql /usr/share/zoneinfo | mysql -u root mysql&lt;br /&gt;&lt;br /&gt;# You may see output like:&lt;br /&gt;Warning: Unable to load '/usr/share/zoneinfo/Asia/Riyadh87' as time zone. Skipping it.&lt;br /&gt;Warning: Unable to load '/usr/share/zoneinfo/Asia/Riyadh88' as time zone. Skipping it.&lt;br /&gt;Warning: Unable to load '/usr/share/zoneinfo/Asia/Riyadh89' as time zone. Skipping it.&lt;br /&gt;# On my system some specific timezone of Riyadh are not defined.  And you can ignore these warnings.&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Restart mysql or infinidb server to bring in the timezones and run a test. &lt;br /&gt;&lt;pre class="php" name="code"&gt;# In case of infinidb idbmysql is just an alias&lt;br /&gt;shell&amp;gt; alias idbmysql&lt;br /&gt;alias idbmysql='/usr/local/Calpont/mysql/bin/mysql --defaults-file=/usr/local/Calpont/mysql/my.cnf -u root'&lt;br /&gt;&lt;br /&gt;shell&amp;gt; idbmysql&lt;br /&gt;Welcome to the MySQL monitor.  Commands end with ; or \g.&lt;br /&gt;Your MySQL connection id is 16&lt;br /&gt;Server version: 5.1.39 MySQL Community / Calpont InfiniDB Community 2.2-2 Final (COSS LA)&lt;br /&gt;&lt;br /&gt;Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.&amp;nbsp;&lt;/pre&gt;&lt;br /&gt;&lt;pre class="sql" name="code"&gt;mysql&amp;gt; SELECT utc_timestamp(), CONVERT_TZ(utc_timestamp(),'UTC','MET');&lt;br /&gt;+---------------------+-----------------------------------------+&lt;br /&gt;| utc_timestamp()     | CONVERT_TZ(utc_timestamp(),'UTC','MET') |&lt;br /&gt;+---------------------+-----------------------------------------+&lt;br /&gt;| 2011-07-29 01:14:40 | 2011-07-29 03:14:40                     |&lt;br /&gt;+---------------------+-----------------------------------------+&lt;br /&gt;1 row in set (0.00 sec)&lt;br /&gt;&lt;/pre&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-4889348728715908597?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/4889348728715908597/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2011/07/loading-timezones-into-mysql-or.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/4889348728715908597'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/4889348728715908597'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2011/07/loading-timezones-into-mysql-or.html' title='Loading timezones into MySQL or InfiniDB'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-7025223680618237592</id><published>2011-04-10T21:12:00.000-07:00</published><updated>2011-04-10T21:37:02.875-07:00</updated><title type='text'>Linux 20 Years</title><content type='html'>It has been while since blogged but couldn't resist having this clip embedded in my blog as a tribute to Linux that made the freeware accessible to many.&lt;br /&gt;Enjoy,&lt;br /&gt;Shiva&lt;br /&gt;&lt;br /&gt;&lt;div align="center" class="Content"&gt;&lt;br /&gt;&lt;iframe title="YouTube video player" width="224" height="169" src="http://www.youtube.com/embed/5ocq6_3-nEw" frameborder="0" allowfullscreen&gt;&lt;/iframe&gt;&lt;/div&gt;&lt;br /&gt;&lt;div align="center" class="Content"&gt;&lt;a href="http://www.linuxfoundation.org/20th/images/linux20infographic.png" rel="lightbox" title="Memorable Linux Milestones"&gt;&lt;img src="http://www.linuxfoundation.org/20th/images/linux20infographic.png" border="0" align="middle" width="224" height="188" alt="Memorable Linux Milestones Infographic" /&gt; &lt;/a&gt; &lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-7025223680618237592?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/7025223680618237592/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2011/04/linux-20-years.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/7025223680618237592'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/7025223680618237592'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2011/04/linux-20-years.html' title='Linux 20 Years'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://img.youtube.com/vi/5ocq6_3-nEw/default.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-3856115634890579238</id><published>2010-10-14T12:27:00.000-07:00</published><updated>2010-10-15T15:26:15.539-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='MySQL'/><title type='text'>MySQL Connection over SSH</title><content type='html'>When MySQL server is setup to accept only localhost connection through bind_address in /etc/mysql/mysql.cnf (configuration)&lt;br /&gt;&lt;pre class="perl" name="code"&gt;# Instead of skip-networking the default is now to listen only on&lt;br /&gt;# localhost which is more compatible and is not less secure.&lt;br /&gt;bind-address&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; = 127.0.0.1&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;or you want to connect as "root" to mysql and use GUI clients like Workbench or Toad, etc. you can use SSH tunneling.&lt;br /&gt;&lt;br /&gt;In operating systems that provide built-in ssh support it is made little easier.&amp;nbsp; Mac OS is one such OS, which I will use as an example below.&lt;br /&gt;&lt;br /&gt;Assuming you have already setup ssh connection, run the following command from the local terminal. If you have not setup the ssh, see this brief &lt;a href="http://inside.mines.edu/%7Egmurray/HowTo/sshNotes.html"&gt;introductory ssh article&lt;/a&gt;. Or run &amp;gt; man ssh for more details.&lt;br /&gt;&lt;pre class="perl" name="code"&gt;prompt&amp;gt;&amp;nbsp; ssh -L 3307:localhost:3306&amp;nbsp; user@remote.com&lt;br /&gt;&lt;/pre&gt;and then let this terminal running.&amp;nbsp; Here I am using ssh -L [bind_address:]port:host:hostport&amp;nbsp; parameter and skipped bind_address which is needed only when you have more than one IP attached (multi-IP) to your local system which is typically not the case.&lt;br /&gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; 3307 - The port that you will use on the other clients that need to connect to remote mysql. For example, Workbench.&lt;br /&gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; localhost:3306 - The remote login details as if you had connected to it through ssh and connecting to mysql instance running on port 3306.&amp;nbsp; If you had successfully, ssh-ed to remote host at the prompt you would have run &amp;gt; mysql -h   localhost -u user_name -P 3306&lt;br /&gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; user@remote.com - Remote user name and remote host&lt;br /&gt;&lt;br /&gt;Now start Workbench and setup new connection string by providing the details as if you are connecting to a mysql instance on your local machine running on port 3307. See figure 1.&amp;nbsp; ssh will automatically tunnels your connection to remote host's mysql! See figure 2.&amp;nbsp; On ending the tunneling setup either my closing the terminal where you were running "ssh -L" command and killing it will disconnect the connection to Workbench.&lt;br /&gt;&lt;br /&gt;Figure 1.&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/_kzso6KW6Rx4/TLdYdkudGdI/AAAAAAAACn4/6sCa3pJSffQ/s1600/workbench_screen.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="364" src="http://4.bp.blogspot.com/_kzso6KW6Rx4/TLdYdkudGdI/AAAAAAAACn4/6sCa3pJSffQ/s640/workbench_screen.jpg" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;Figure 2.&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/_kzso6KW6Rx4/TLdYsmyaLjI/AAAAAAAACn8/UpQ_lIVK7ok/s1600/workbench_screen_2.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="186" src="http://1.bp.blogspot.com/_kzso6KW6Rx4/TLdYsmyaLjI/AAAAAAAACn8/UpQ_lIVK7ok/s400/workbench_screen_2.jpg" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;Note: Same technique can be used to for other client applications to connect to any of remote applications/databases over SSH.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Have fun,&lt;br /&gt;&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-3856115634890579238?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/3856115634890579238/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/10/mysql-connection-over-ssh.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/3856115634890579238'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/3856115634890579238'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/10/mysql-connection-over-ssh.html' title='MySQL Connection over SSH'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/_kzso6KW6Rx4/TLdYdkudGdI/AAAAAAAACn4/6sCa3pJSffQ/s72-c/workbench_screen.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-3307968858179137976</id><published>2010-08-09T12:30:00.000-07:00</published><updated>2010-08-21T19:02:15.080-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='MySQL'/><category scheme='http://www.blogger.com/atom/ns#' term='IP'/><category scheme='http://www.blogger.com/atom/ns#' term='ETL'/><category scheme='http://www.blogger.com/atom/ns#' term='DW'/><category scheme='http://www.blogger.com/atom/ns#' term='SQL'/><title type='text'>DW solution - IP mapping to Geo Locations</title><content type='html'>In my &lt;a href="http://www.hiregion.com/2010/07/ip-address-conversions.html"&gt;previous few posts&lt;/a&gt; on the blog, I covered how to convert IP addresses to corresponding integers and back. When performing a Data Warehouse (DW) tables join between IP addresses for location analysis joining on integer IP column is much faster than joining on string column.&amp;nbsp; As noted in other posts, this article covers IPv4 but technique could be used for IPv6 addresses on using 2 BIGINT.&lt;br /&gt;&lt;br /&gt;For geo location info, you can use MaxMind's Geo-City data&amp;nbsp;that is provided in&amp;nbsp;two entities - one with mapping IP block to a location ID and another one with location ID to city/state/country.&amp;nbsp; It is one to many (1:M) relationship and to find an IP address location one need to find the matching IP block and then map it to city or country.&amp;nbsp; There are posts that describe in detail how to perform it effeciently for a given IP.&amp;nbsp; These techniques will work for an OLTP environment where you pass known IP address from UI or application layer to database and gets its city/country.&lt;br /&gt;&lt;ol&gt;&lt;li&gt;&lt;a href="http://andy.wordpress.com/2007/12/16/fast-mysql-range-queries-on-maxmind-geoip-tables/"&gt;Fast MySQL Range Queries on MaxMind GeoIP Tables&lt;/a&gt; byAndy Skelton&lt;/li&gt;&lt;li&gt;&lt;a href="http://bartomedia.blogspot.com/2007/11/maxmind-geoip-setup-tutorial-using.html"&gt;MaxMind GeoIP Install Setup Tutorial&lt;/a&gt; by Bartomedia&lt;/li&gt;&lt;li&gt;&lt;a href="http://code.openark.org/blog/mysql/sql-finding-a-users-countryregion-based-on-ip"&gt;Finding a user's country/region based on IP&lt;/a&gt; by Shlomi Noach &lt;/li&gt;&lt;li&gt;&lt;a href="http://on%20efficiently%20geo-referencing%20ips%20with%20maxmind%20geoip%20and%20mysql%20gis/"&gt;On efficiently geo-referencing IPs with MaxMind GeoIP and MySQL GIS&lt;/a&gt; by Jeremy Cole &lt;/li&gt;&lt;/ol&gt;Geo-City data has following fields seperated by comma and row terminated by new line (\n):&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Location ID &lt;/li&gt;&lt;li&gt;Country Code &lt;/li&gt;&lt;li&gt;State/Region&lt;/li&gt;&lt;li&gt;City Name &lt;/li&gt;&lt;li&gt;Postal Code (US only)&lt;/li&gt;&lt;li&gt;Latitude &lt;/li&gt;&lt;li&gt;Longitude &lt;/li&gt;&lt;li&gt;Metro Code &lt;/li&gt;&lt;li&gt;Area Code &lt;/li&gt;&lt;/ul&gt;IP Block has:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Start IP Address&lt;/li&gt;&lt;li&gt;End IP Address&lt;/li&gt;&lt;li&gt;Location ID&lt;/li&gt;&lt;/ul&gt;While loading this dimensional data during ETL process, you should also do the IP address transformation to integer as explained in "&lt;a href="http://www.hiregion.com/2010/07/converting-ip-address-to-integer-and.html"&gt;Converting an IP address to an integer&lt;/a&gt;".&amp;nbsp; And Geo-City dimensional table will also have partition range (the reason for this post) and surrogate key (id).&amp;nbsp; You should also perform other data validation like non-overlapping ranges, valid country codes, etc. which are all typical standard activities of DW ETL process.&amp;nbsp; The time, resource and effort put into cleaning and validating the data before pushing the data into final fact tables will pay well within short period of time.&amp;nbsp; To fix a data issue and reconcile the reports in DW/BI environment always leads to lower ROI.&lt;br /&gt;&lt;br /&gt;You can&amp;nbsp;also&amp;nbsp;denormalize the above tables and create a single table.&amp;nbsp; In either case by joining on range of IP address for a given set of IPs will result in the same slow performance.&lt;br /&gt;&lt;br /&gt;dim_geo_ip_city_location holds around 5.9million rows with following attributes:&lt;br /&gt;&lt;br /&gt;&lt;pre class="sql" name="code"&gt;CREATE TABLE dim_geo_ip_city_location (&lt;br /&gt;    location_id int NOT NULL,&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; start_ip varchar(16) NOT NULL,&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; end_ip&amp;nbsp; varchar(16) NOT NULL,&lt;br /&gt;&amp;nbsp; &amp;nbsp; country char(2),&lt;br /&gt;&amp;nbsp; &amp;nbsp; region varchar(64), &lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; city varchar(128),&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; postal_code varchar(32),&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; latitude decimal(8,4),&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; longitude decimal(8,4),&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; metro_code varchar(8),&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; area_code varchar(8),&lt;br /&gt;    ...&lt;br /&gt;)ENIGNE=...&lt;br /&gt;&lt;br /&gt;CREATE TABLE dim_ip_block (&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; start_ip_num bigint NOT NULL,&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; end_ip_num bigint NOT NULL,&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;location_id int NOT NULL,&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; ...&lt;br /&gt;)ENGINE=...&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;div&gt;&lt;/div&gt;In MySQL (5.0) optimizer performs range operation effeciently with a constant meaning that you have given the query an IP address like "WHERE ip_addr between 1230 and 1234", for example, but not with range operations without constants.&amp;nbsp; For more info on using range optimization see &lt;a href="http://dev.mysql.com/doc/refman/5.1/en/range-optimization.html"&gt;mysql documentation&lt;/a&gt; and for indexing see "&lt;a href="http://assets.en.oreilly.com/1/event/21/Mastering%20the%20Art%20of%20Indexing%20Presentation.pdf"&gt;Mastering the Art of (MySQL) Indexing&lt;/a&gt;".&lt;br /&gt;&lt;br /&gt;&lt;div&gt;&lt;/div&gt;I did not find any article providing some insights into doing DW/BI analysis on IP addresses.&amp;nbsp; In a DW environment the business user is interested in slicing and dicing the hundreds of thousands to million of IPs.&amp;nbsp; Just trying to find few thousand IP address locations (See query 1 below) was taking more than an hour when joined between lookup table (a table similar to dim_geo_ip_city_location but without partition column (ip_range_partition) and id) and ip address join table even with appropriate indexes like index on (start_ip_num, end_ip_num) or start_ip_num and end_ip_num.&lt;br /&gt;&lt;br /&gt;&lt;div&gt;&lt;/div&gt;Query 1:&lt;br /&gt;&lt;pre class="sql" name="code"&gt;-- Simple query to select IP block&lt;br /&gt;SELECT IA.ip_num, B.location_id, B.start_ip_num, B.end_ip_num&lt;br /&gt;FROM ip_addresses IA use index(...)&lt;br /&gt;JOIN dim_ip_block B use index(...)&lt;br /&gt;WHERE IA.ip_num between B.start_ip_num and B.end_ip_num&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;div&gt;&lt;/div&gt;This leads to MySQL performing full scans. And in the case of ip_block table (B) it doesn't use any index. For this to be avoided, I partitioned the dimension (lookup) table into 1000 parts and since this table has ~5.9 Million rows each partition will result in ~5,900 rows in each partition.&amp;nbsp; For more on data set partitions see&lt;a href="http://en.wikipedia.org/wiki/Partition_of_a_set"&gt; its wiki&lt;/a&gt;.&amp;nbsp; The partitions should be numbered in the ascending order with respect to ascending range of ip blocks (start_ip_num).&amp;nbsp; Then also create a another table (dim_ip_range_partition) with start_ip_num and end_ip_num for this each range basically creating super IP blocks.&lt;br /&gt;&lt;br /&gt;&lt;div&gt;&lt;/div&gt;&lt;pre class="sql" name="code"&gt;CREATE TABLE dim_ip_range_partition (&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; partition_id int NOT NULL AUTO_INCREMENT,&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; start_ip_num bigint NOT NULL,&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; end_ip_num bigint NOT NULL,&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; ....&lt;br /&gt;) ENGINE=InnoDB ;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;div&gt;&lt;/div&gt;Also, create&amp;nbsp;a primary key on patition_id&amp;nbsp;and index on (start_ip_num, end_ip_num) on this table.&lt;br /&gt;The solution is based on two things that there is no overlapping IP block and IP blocks are ordered (in ascending order). &lt;br /&gt;&lt;br /&gt;To populate the above table,&lt;br /&gt;&lt;ol&gt;&lt;li&gt;Add id (row_id) column to dim_ip_block&lt;/li&gt;&lt;li&gt;Add partition_id integer column to dim_ip_block&lt;/li&gt;&lt;li&gt;Update id from 1 to n (highest start_ip_num) in ascending order of start_ip_num&lt;/li&gt;&lt;li&gt;Partition "n" ids into P parts (n/P) where n &amp;gt; 1 and 1 &amp;lt; P &amp;lt; n&lt;/li&gt;&lt;li&gt;Assign each part/partition ascending integer value&lt;/li&gt;&lt;li&gt;Now update above table dim_ip_range_partition P_TABLE from dim_ip_block B_TABLE where P_TABLE partition id between 1 and P of B_TABLE and P_TABLE.start_ip_num is lowest B_TABLE.start_ip_num of the corresponding partition and P_TABLE.end_ip_num is highest of corresponding partition.&amp;nbsp; In other words...&lt;/li&gt;&lt;/ol&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;INSERT INTO dim_ip_range_partition &lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; (partition_id, start_ip_num, end_ip_num, ...)&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; SELECT partition_id, min(start_ip_num), max(end_ip_num), ...&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; FROM dim_ip_block&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; GROUP BY partition_id&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;ORDER BY partition_id asc&lt;br /&gt;&lt;br /&gt;By doing so, we have created a super IP blocks each having&amp;nbsp;n/P blocks from the original ip_blocks.&amp;nbsp;&amp;nbsp; And the following query is one way to extract city/country info for given set&amp;nbsp;of IP addresses.&lt;br /&gt;&lt;br /&gt;&lt;pre class="sql" name="code"&gt;SELECT t.ip_num, t.ip, B.start_ip_num, B.start_ip, B.end_ip_num, B.end_ip&lt;br /&gt;       ,L.city, L.country&lt;br /&gt;FROM &lt;br /&gt;(&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;--- Get the super ip block (partition) &lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;SELECT C.ip_num, max(partition_id) partition_id, max(P.start_ip_num) start_ip_num&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;FROM ip_addresses C use index(...)&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;JOIN dim_ip_range_partition P &lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;WHERE C.ip_num &amp;gt;= P.start_ip_num&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;GROUP BY C.ip_num&lt;br /&gt;) t&lt;br /&gt;LEFT OUTER JOIN mktg_ip_block B use index(...)&amp;nbsp; &lt;br /&gt;&amp;nbsp; ON  t.partition_id = B.partition_id&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;--- And join on this partition&lt;br /&gt;&amp;nbsp;&amp;nbsp;AND t.ip_num &amp;gt;= B.start_ip_num &lt;br /&gt;&amp;nbsp;&amp;nbsp;AND t.ip_num &amp;lt;= B.end_ip_num&lt;br /&gt;JOIN dim_geo_ip_city_location L&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;   --- Get city/country&lt;br /&gt;&amp;nbsp; ON B.location_id = L.location_id &lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The query that would have taken more than hour returns within 30 seconds now!&amp;nbsp; You can further optimize the above design with say more denormalized relations, having a large number of partitions (this is a trade-off, with highest partition n = P, each partition is same as original table which will result in worse off performance!) on both lookup table and lookedup table.&amp;nbsp; Or extending the partitioning concept to more levels (super super block, ....) you can reduce the number of joining rows with little bit more complex query.&lt;br /&gt;&lt;br /&gt;The solution is a typical &lt;a href="http://en.wikipedia.org/wiki/Computer_science"&gt;Computer Science&lt;/a&gt; way of handling the complex problem by dividing into smaller ones and solving it.&amp;nbsp; In this it is more specific to MySQL solution for large data sets with range operations.&lt;br /&gt;&lt;br /&gt;&lt;div&gt;&lt;/div&gt;Cheers,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-3307968858179137976?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/3307968858179137976/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/08/dw-solution-ip-mapping-to-geo-locations.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/3307968858179137976'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/3307968858179137976'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/08/dw-solution-ip-mapping-to-geo-locations.html' title='DW solution - IP mapping to Geo Locations'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-8362129925868667164</id><published>2010-08-01T20:46:00.000-07:00</published><updated>2010-08-02T00:28:26.820-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='IP'/><title type='text'>Tools to convert IP address</title><content type='html'>Following tool converts an ip address to other formats.  For more on IP address conversion check out &lt;a href="http://www.hiregion.com/2010/07/converting-ip-address-to-integer-and.html"&gt;part 1&lt;/a&gt; and &lt;a href="http://www.hiregion.com/2010/07/ip-address-conversions.html"&gt;2&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;&lt;hr /&gt;&lt;a href="http://www.hiregion.com/2010/07/converting-ip-address-to-integer-and.html"&gt;Converting an IP address to an integer and reverse (Part 1)&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.hiregion.com/2010/07/ip-address-conversions.html"&gt;IP Address conversions (Part 2)&lt;/a&gt;&lt;br /&gt;&lt;hr /&gt;&lt;br /&gt;&lt;table align="center" border="1" cellpadding="5" cellspacing="0" style="width: 80%;"&gt;&lt;tbody&gt;&lt;tr align="center"&gt; &lt;td&gt;&lt;b&gt; IP address converter: &lt;/b&gt;&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;     &lt;td&gt;&lt;br /&gt;&lt;table cellpadding="2" cellspacing="0"&gt;&lt;tbody&gt;&lt;tr&gt;           &lt;td&gt;&lt;br /&gt;&lt;b&gt;Enter IP:&lt;/b&gt;&lt;/td&gt;           &lt;td&gt;&lt;br /&gt;&lt;input name="inputdata" size="40" type="text" value="" /&gt;&lt;/td&gt;           &lt;td&gt;&lt;br /&gt;&lt;input onclick="OnSubmitPluginInput(this,'http://www.webtoolhub.com/plugins/wt561377-ip-address-number-converter.aspx');" type="button" value="Convert" /&gt;&lt;/td&gt;        &lt;/tr&gt;&lt;tr&gt;          &lt;td&gt;&lt;br /&gt;&lt;/td&gt;          &lt;td colspan="2" style="font-size: 9pt;"&gt;&lt;br /&gt;(e.g. 192.168.100.200 or 3232261320 etc)&lt;/td&gt;        &lt;/tr&gt;&lt;/tbody&gt;&lt;/table&gt;&lt;/td&gt;   &lt;/tr&gt;&lt;tr&gt;     &lt;td&gt;&lt;br /&gt;&lt;iframe frameborder="0" name="pluginframe561377" style="height: 350px; width: 100%;"&gt;&lt;/iframe&gt;&lt;/td&gt;   &lt;/tr&gt;&lt;tr&gt;     &lt;td style="font-family: Verdana,Arial; font-size: 9pt;"&gt;&lt;br /&gt;Powered by: &lt;a href="http://www.webtoolhub.com/" title="Free Webmaster Tools"&gt;WebToolHub.com&lt;/a&gt;&lt;/td&gt;   &lt;/tr&gt;&lt;/tbody&gt;&lt;/table&gt;&lt;script src="http://www.webtoolhub.com/plugin.axd" type="text/javascript"&gt;&lt;/script&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;You can also check out 2's complement of a number here.  This will be handy when one needs to check for signed numbers (2's complement).&lt;br /&gt;&lt;br /&gt;&lt;script id="WolframAlphaScript52cf49fea5ff66588408852f65cf8272" src="http://www.wolframalpha.com/widget/widget.jsp?id=52cf49fea5ff66588408852f65cf8272" type="text/javascript"&gt;&lt;/script&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-8362129925868667164?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/8362129925868667164/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/08/tools-to-convert-ip-address.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/8362129925868667164'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/8362129925868667164'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/08/tools-to-convert-ip-address.html' title='Tools to convert IP address'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-1299472768832897351</id><published>2010-07-30T15:28:00.000-07:00</published><updated>2010-08-03T16:10:43.918-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Geo'/><category scheme='http://www.blogger.com/atom/ns#' term='IP'/><category scheme='http://www.blogger.com/atom/ns#' term='SQLServer'/><title type='text'>IP Address conversions</title><content type='html'>This is the part 2 of converting an IP address to an integer or vice versa. For more details and little introdution &lt;a href="http://www.hiregion.com/2010/07/converting-ip-address-to-integer-and.html"&gt;see Part 1.&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Converting an ip address to integer and back is little straight forward in MySQL compared to SQL Server since SQL Server&amp;nbsp;(2008)&amp;nbsp;doesn't provide bit shift operators. &lt;br /&gt;&lt;br /&gt;In short you can't do,&lt;br /&gt;&lt;pre class="sql" name="code"&gt;SELECT 1&amp;lt;&amp;lt;2 &lt;br /&gt;&lt;/pre&gt;nor&lt;br /&gt;&lt;pre class="sql" name="code"&gt;SELECT 3232261320 -- IP '192.168.100.200'&lt;br /&gt;* (256*256*256*256*256*256) -- Left shift 6 bytes&lt;br /&gt;/ (256*256*256*256*256*256*256) -- Right shift 7 bytes to get 2nd octet&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;One will encounter "Arithmetic overflow error converting expression to data type int". Even casting each integer to bigint will not solve the problem as the multiplication results in overflow of signed bigint. And there is no unsigned bigint in SQL Server.&lt;br /&gt;&lt;br /&gt;Another way to solve this is to do the following in SQL Server by using bit AND. Retain only the byte that is off interest and then right shift the bits appropriately.&lt;br /&gt;&lt;pre class="sql" name="code"&gt;#-- IP '192.168.100.200'&lt;br /&gt;SELECT (cast(3232261320 as bigint) &amp;amp; cast(4278190080 as bigint))/(256*256*256);&lt;br /&gt;SELECT (cast(3232261320 as bigint) &amp;amp; 16711680)/(256*256);&lt;br /&gt;SELECT (cast(3232261320 as bigint) &amp;amp; 65280)/(256);&lt;br /&gt;SELECT (cast(3232261320 as bigint) &amp;amp; 255);&lt;br /&gt;&lt;br /&gt;#- And the output will be&lt;br /&gt;192&lt;br /&gt;168&lt;br /&gt;100&lt;br /&gt;200&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;On systems with unsigned bigint you can also do (MySQL)&lt;br /&gt;&lt;pre class="sql" name="code"&gt;SELECT cast(3232261320*256*256*256*256 as unsigned)&lt;br /&gt;/ (256*256*256*256*256*256*256) 4th_octet;&lt;br /&gt;&lt;br /&gt;SELECT cast(3232261320*256*256*256*256*256*256*256 as unsigned)&lt;br /&gt;/ (256*256*256*256*256*256*256 as unsigned) as 1st_octet;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;They will result in outputs 192 and 200 respectively. &lt;br /&gt;&lt;br /&gt;With given ip you can simply run the following to get its integer value,&lt;br /&gt;&lt;pre class="sql" name="code"&gt;mysql toad&amp;gt;&amp;nbsp; &lt;br /&gt;SELECT ( ((substring_index(ip, '.',1))&amp;lt;&amp;lt;24) &lt;br /&gt;       + ((substring_index(substring_index(ip, '.', 2), '.', -1))&amp;lt;&amp;lt;16)&lt;br /&gt;       + ((substring_index(substring_index(ip, '.', 3), '.', -1))&amp;lt;&amp;lt;8)&lt;br /&gt;       + ((substring_index(substring_index(ip, '.', 4), '.', -1)&amp;lt;&amp;lt;0))&lt;br /&gt;)&lt;br /&gt;FROM&lt;br /&gt;( SELECT '192.168.100.200' as ip&lt;br /&gt;  UNION ALL&lt;br /&gt;  SELECT ip FROM som_ip_table;&lt;br /&gt;) t;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Cheers,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-1299472768832897351?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/1299472768832897351/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/07/ip-address-conversions.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/1299472768832897351'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/1299472768832897351'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/07/ip-address-conversions.html' title='IP Address conversions'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-6027868237761893644</id><published>2010-07-28T22:52:00.001-07:00</published><updated>2010-08-17T17:01:08.707-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='IP'/><category scheme='http://www.blogger.com/atom/ns#' term='ETL'/><category scheme='http://www.blogger.com/atom/ns#' term='C#'/><category scheme='http://www.blogger.com/atom/ns#' term='DW'/><title type='text'>Converting an IP address to an integer and reverse</title><content type='html'>In dealing with IP address analysis one may required to map IPs to geo location. There are third party applications or software that perform IP to city, state, country mapping and businesses, for example, can make use of them to see where their web traffic is coming from.&amp;nbsp; &lt;br /&gt;&lt;br /&gt;Article covers IPv4 addresses which&amp;nbsp;use 32 bits and similar technique can be used to IPv6 addresses which use 128 bits by using 2 BIGINTs to store (64bits each).&lt;br /&gt;&lt;br /&gt;It is easy to handle reasonable load when one is querying for few IPs or few hundred IPs either from simple user interface entering one or more IP addresses or through APIs. In these cases IP addresses can simply be matched in string format without much of performance issue but it can become a performance bottleneck when dealing with millions of rows loaded through ETL process into Data Warehouse environment and reports are run against fact tables with hundreds of millions of rows. Under these circumstances converting IP to an integer will boost the performance substantially. &lt;br /&gt;&lt;br /&gt;MySQL provides functions to convert IP string to integer and backwards. For example, inet_aton converts string to a number (integer) and inet_ntoa converts number to integer. &lt;br /&gt;&lt;br /&gt;&lt;span style="color: #990000; font-size: large;"&gt;inet_aton and inet_ntoa:&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;&lt;pre class="sql" name="code"&gt;mysql&amp;gt; select inet_aton('192.168.100.200');&lt;br /&gt;+------------------------------+&lt;br /&gt;| inet_aton('192.168.100.200') |&lt;br /&gt;+------------------------------+&lt;br /&gt;|             3232261320 |&lt;br /&gt;+------------------------------+&lt;br /&gt;&lt;br /&gt;mysql&amp;gt; select inet_ntoa(3232261320);&lt;br /&gt;+-----------------------+&lt;br /&gt;| inet_ntoa(3232261320) |&lt;br /&gt;+-----------------------+&lt;br /&gt;|     192.168.100.200 |&lt;br /&gt;+-----------------------+&lt;br /&gt;1 row in set (0.00 sec)&lt;br /&gt;&lt;/pre&gt;&lt;a name='more'&gt;&lt;/a&gt;&lt;br /&gt;&lt;span style="color: #990000; font-size: large;"&gt;Bit operations:&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;The functions are simple to use and at the lower level they basically perform bit shifting. In the above example, '192.168.100.200' is represented as 11000000 10101000 01100100 11001000 in binary form. For more on binary system or to convert a number from decimal to binary see &lt;a href="http://en.wikipedia.org/wiki/Binary_numeral_system"&gt;wiki here&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;To convert string IP to integer function inet_aton multiplies 192 by 256*256*256, 168 by 258*256, 100 by 256 and then adding 200 (last octet in the above ip address), that is,&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;br /&gt;and it is same as 192&amp;lt;&amp;lt;24 i.e., left shift 192 by 24 bits, 168&amp;nbsp;&amp;lt;&amp;lt; 16 and 100 &amp;lt;&amp;lt; 8. Bit shifting is a much faster multiplication since CPU can use its register to move the bit in simple operation.&lt;br /&gt;&lt;br /&gt;In reverse, one can not simply divide the ip integer (3232261320) to get back the ip address in octet string format.&amp;nbsp; But one need to appropriately left and right shift the number (or do multiply and then divide) to get the octet that is of interest. To get the most highest valued position (192), shift the bits to right. For example,&lt;br /&gt;&lt;br /&gt;&lt;b&gt;&lt;span style="background-color: #f1c232;"&gt;11000000&lt;/span&gt; &lt;/b&gt;&lt;span style="background-color: #ffd966;"&gt;10101000&lt;/span&gt; &lt;span style="background-color: #ffe599;"&gt;01100100&lt;/span&gt; &lt;span style="background-color: #fff2cc;"&gt;11001000&lt;/span&gt; &amp;gt;&amp;gt; 24 (right shift 24 bits) gives you&lt;br /&gt;&lt;br /&gt;00000000 00000000 00000000 &lt;b style="background-color: #f1c232;"&gt;11000000&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Therefore, all the right most 24 bits dropped off the register (cliff) and we are left with 4th octet &lt;b style="background-color: #f1c232;"&gt;11000000&lt;/b&gt; prefixed with 24 zeros.&lt;br /&gt;&lt;br /&gt;&lt;pre class="sql" name="code"&gt;mysql&amp;gt; SELECT 3232261320&amp;gt;&amp;gt;24 as 4th_octet;&lt;br /&gt;+-----------+&lt;br /&gt;| 4th_octet |&lt;br /&gt;+-----------+&lt;br /&gt;|     192 |&lt;br /&gt;+-----------+&lt;br /&gt;1 row in set (0.00 sec)&lt;/pre&gt;&lt;br /&gt;By the same logic to get the 3rd octet, we can move the bits 16 times to the right but we will be left with 4th octet as well. Doing&lt;br /&gt;&lt;br /&gt;&lt;b&gt;&lt;span style="background-color: #f1c232;"&gt;11000000&lt;/span&gt; &lt;/b&gt;&lt;span style="background-color: #ffd966;"&gt;10101000&lt;/span&gt; &lt;span style="background-color: #ffe599;"&gt;01100100&lt;/span&gt; &lt;span style="background-color: #fff2cc;"&gt;11001000&lt;/span&gt; &amp;gt;&amp;gt;16 will result in&lt;br /&gt;&lt;br /&gt;00000000 00000000 &lt;b&gt;&lt;span style="background-color: #f1c232;"&gt;11000000&lt;/span&gt; &lt;/b&gt;&lt;span style="background-color: #ffd966;"&gt;10101000&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;So we should first shift the bits to left so that the most highest octet (4th octet) drops off the register.&amp;nbsp; We can do so by shifting left by 8 bits.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;pre class="sql" name="code"&gt;mysql&amp;gt; SELECT 3232261320&amp;lt;&amp;lt;8 as 8blsn;  -- 8blsn: 8 bit left shift number&lt;br /&gt;+--------------+&lt;br /&gt;| 8blsn      |&lt;br /&gt;+--------------+&lt;br /&gt;| 827458897920 |&lt;br /&gt;+--------------+&lt;br /&gt;1 row in set (0.00 sec)&lt;/pre&gt;&lt;br /&gt;Instead of getting a smaller number than before we got a larger number! And obviously, right shifting this number by 16bits to right will not give the 3rd octet of IP.&amp;nbsp;&amp;nbsp; This happens due to fact that MySQL converted int to bigint silently.&amp;nbsp; The 4th octet moved into 5th octet resulting in appending 8 zeros to the right as below.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;&lt;span style="background-color: #f1c232;"&gt;11000000&lt;/span&gt; &lt;/b&gt;&lt;span style="background-color: #ffd966; color: red; font-size: large;"&gt;10101000&lt;/span&gt; &lt;span style="background-color: #ffe599;"&gt;01100100&lt;/span&gt; &lt;span style="background-color: #fff2cc;"&gt;11001000&lt;/span&gt; &lt;span style="background-color: #38761d;"&gt;00000000&lt;/span&gt; &lt;br /&gt;&lt;br /&gt;Since bigint is 8 bytes long one need to shift not by 1 byte (8bits) but 5 bytes (40 bits) to delete the highest octet (#192). &lt;br /&gt;&lt;br /&gt;Just with 4 byte (32 bit) left shift the register would look like below.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;&lt;span style="background-color: #f1c232;"&gt;11000000&lt;/span&gt; &lt;/b&gt;&lt;span style="background-color: #ffd966; color: red; font-size: large;"&gt;10101000&lt;/span&gt; &lt;span style="background-color: #ffe599;"&gt;01100100&lt;/span&gt; &lt;span style="background-color: #fff2cc;"&gt;11001000&lt;/span&gt; &lt;span style="background-color: #38761d;"&gt;00000000&lt;/span&gt; &lt;span style="background-color: #6aa84f;"&gt;00000000&lt;/span&gt; &lt;span style="background-color: #93c47d;"&gt;00000000&lt;/span&gt; &lt;span style="background-color: #b6d7a8;"&gt;00000000&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;And with 40 bit left shift the register would look like thus removing (#192 - &lt;b&gt;&lt;span style="background-color: #f1c232;"&gt;11000000&lt;/span&gt; )&lt;/b&gt;,&lt;br /&gt;&lt;br /&gt;&lt;span style="background-color: #ffd966; color: red; font-size: large;"&gt;10101000&lt;/span&gt; &lt;span style="background-color: #ffe599;"&gt;01100100&lt;/span&gt; &lt;span style="background-color: #fff2cc;"&gt;11001000&lt;/span&gt; &lt;span style="background-color: #38761d;"&gt;00000000&lt;/span&gt; &lt;span style="background-color: #6aa84f;"&gt;00000000&lt;/span&gt; &lt;span style="background-color: #93c47d;"&gt;00000000&lt;/span&gt; &lt;span style="background-color: #b6d7a8;"&gt;00000000&lt;/span&gt; &lt;span style="background-color: #d9ead3;"&gt;00000000&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;Now right shift 56 bits to get the 3rd octet (#168).&lt;br /&gt;&lt;br /&gt;00000000 00000000 00000000 00000000 00000000 00000000 00000000 &lt;span style="background-color: #ffd966; color: red; font-size: large;"&gt;10101000&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;This is same as &lt;br /&gt;(3232261320 *256 *256 *256 *256 *256) / (256 *256 *256 *256 *256 *256 *256)&lt;br /&gt;&lt;br /&gt;&lt;pre class="sql" name="code"&gt;&amp;nbsp;mysql&amp;gt; SELECT 3232261320&amp;gt;&amp;lt;&amp;lt;40&amp;gt;&amp;gt;56;&lt;br /&gt;+--------------------+&lt;br /&gt;| 3232261320&amp;lt;&amp;lt;40&amp;gt;&amp;gt;56 |&lt;br /&gt;+--------------------+&lt;br /&gt;|            168 |&lt;br /&gt;+--------------------+&lt;br /&gt;1 row in set (0.00 sec)&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;To get the 2nd octet first left shift the original number 48 bits and then right shift by 56 bits. For the 1st octet left shift by 56 bits both ways.&lt;br /&gt;&lt;pre class="sql" name="code"&gt;mysql&amp;gt; SELECT 3232261320&amp;lt;&amp;lt;48&amp;gt;&amp;gt;56;&lt;br /&gt;+--------------------+&lt;br /&gt;| 3232261320&amp;lt;&amp;lt;48&amp;gt;&amp;gt;56 |&lt;br /&gt;+--------------------+&lt;br /&gt;|           100 |&lt;br /&gt;+--------------------+&lt;br /&gt;1 row in set (0.00 sec)&lt;br /&gt;&lt;br /&gt;mysql&amp;gt; SELECT 3232261320&amp;lt;&amp;lt;56&amp;gt;&amp;gt;56;&lt;br /&gt;+--------------------+&lt;br /&gt;| 3232261320&amp;lt;&amp;lt;56&amp;gt;&amp;gt;56 |&lt;br /&gt;+--------------------+&lt;br /&gt;|           200 |&lt;br /&gt;+--------------------+&lt;br /&gt;1 row in set (0.00 sec)&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;With all &lt;a href="http://en.wikipedia.org/wiki/Imperative_programming"&gt;imperative/procedural languages&lt;/a&gt; providing the bit operators (AND, OR, bit shift, etc), one can and should convert the IP string to integer before being loaded to staging. Thus the parsing or conversion can also be easily parallel processed for large loads.&lt;br /&gt;&lt;br /&gt;An example in C sharp (C#) is shown below.&lt;br /&gt;&lt;pre class="csharp" name="code"&gt;// &lt;br /&gt;// C# program to convert IP address string to an integer.&lt;br /&gt;// &lt;br /&gt;using System;&lt;br /&gt;&lt;br /&gt;namespace IP_addr_to_int&lt;br /&gt;{&lt;br /&gt;    class Program&lt;br /&gt;    {&lt;br /&gt;        static void Main(string[] args)&lt;br /&gt;        {&lt;br /&gt;            // Infinite loop until user enters control-c.&lt;br /&gt;            for (; ; ) {&lt;br /&gt;                string ip = "";&lt;br /&gt;                Console.Write("Enter ip address: ");&lt;br /&gt;                ip = Console.ReadLine();&lt;br /&gt;&lt;br /&gt;                ulong ipNum = 0;&lt;br /&gt;                string[] ipArr = ip.Split('.');&lt;br /&gt;&lt;br /&gt;                if (ipArr.Length != 4) {&lt;br /&gt;                    Console.WriteLine("Error. Enter four octet IP address only.");&lt;br /&gt;                    // Environment.Exit(-1);&lt;br /&gt;                    continue;&lt;br /&gt;                }&lt;br /&gt;                for (int ii = ipArr.Length - 1; ii &amp;gt;= 0; ii--)&lt;br /&gt;                {&lt;br /&gt;                    ulong ipOctet = Convert.ToUInt64(ipArr[3 - ii]);&lt;br /&gt;                    // ipOctet * 256 * 256 * 256;&lt;br /&gt;                    if (ii == 3)      { ipNum += ipOctet &amp;lt;&amp;lt; 24; } &lt;br /&gt;                    // ipOctet * 256 * 256;&lt;br /&gt;                    else if (ii == 2) { ipNum += ipOctet &amp;lt;&amp;lt; 16; } &lt;br /&gt;                    // ipOctet * 256;&lt;br /&gt;                    else if (ii == 1) { ipNum += ipOctet &amp;lt;&amp;lt; 8;  } &lt;br /&gt;                    // ipOctet * 1&lt;br /&gt;                    else              { ipNum += ipOctet } &lt;br /&gt;                    // System.Threading.Thread.Sleep(1000); &lt;br /&gt;                }&lt;br /&gt;                Console.WriteLine &lt;br /&gt;                    (Convert.ToString(ip)+ " converted to integer " + Convert.ToString(ipNum));&lt;br /&gt;            }&lt;br /&gt;        }&lt;br /&gt;    }&lt;br /&gt;}&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;For more ways to convert an IP &lt;a href="http://www.hiregion.com/2010/07/ip-address-conversions.html"&gt;see Part 2&amp;nbsp; &lt;/a&gt;and for tools see &lt;a href="http://www.hiregion.com/2010/08/tools-to-convert-ip-address.html"&gt;Part 3&lt;/a&gt;. &lt;br /&gt;Cheers,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-6027868237761893644?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/6027868237761893644/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/07/converting-ip-address-to-integer-and.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/6027868237761893644'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/6027868237761893644'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/07/converting-ip-address-to-integer-and.html' title='Converting an IP address to an integer and reverse'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-1976554162482477038</id><published>2010-07-28T15:16:00.000-07:00</published><updated>2010-08-16T14:15:08.010-07:00</updated><title type='text'>NFS Cache - Invisible file issue</title><content type='html'>I ran into a following situation and it took some time to diagnose the issue and help from couple of folks from DBA and operations team to resolve it.&amp;nbsp; Here is what happened.&lt;br /&gt;&lt;br /&gt;I exported a large data set from MySQL to a file in directory /dir_1/dir_2/exported_file.txt, for example, in an application.&amp;nbsp; Then after the file was exported the application went on to consume the file by reading it's content.&amp;nbsp; Since &lt;a href="http://dev.mysql.com/doc/refman/5.0/en/select.html"&gt;MySQL OUTFILE&lt;/a&gt; (exporting data) doesn't overwrite a file if the file name already exists, the code would rename the file to *.bak. &amp;nbsp; See below for pseudo code.&lt;br /&gt;&lt;br /&gt;If OUTFILE exists&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; Move or Rename OUTFILE to OUTFILE.bak&amp;nbsp;&amp;nbsp;&amp;nbsp; /* Step 1 */&lt;br /&gt;Run MySQL export to OUTFILE&amp;nbsp;&amp;nbsp;&amp;nbsp; /*&amp;nbsp; Step 2 */&lt;br /&gt;Check the error code&lt;br /&gt;Read OUTFILE and parse&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;&amp;nbsp; &amp;nbsp; /* Step 3 */&lt;br /&gt;&lt;br /&gt;When I ran the application, it would sometime create the output file and go on to parse it correctly but many a times it would fail in step 1 throwing an error like "file already exists" when in fact it was not.&amp;nbsp; Because I had removed the file with 'rm -f' before rerunning the program.&amp;nbsp; Other times it would fail in step 3 indicating that file does not exists even though SQL exported the file successfully in step 2.&amp;nbsp; I even provided sleep time between each step ranging from 5 to 60 seconds but continued to see the same random behavior.&lt;br /&gt;&lt;br /&gt;After spending sometime trying to diagnose what might be going on, ended up debugging NFS caching.&amp;nbsp; The directory /dir_1 was a mounted file system with NFS caching set to few hundred seconds.&amp;nbsp; When the application wrote to NFS directory, the write cache was updated but not the OS directory structure (inode).&amp;nbsp;Reducing the parameter setting (actimeo) to lower number, say 30 seconds, will help alliviate the delay. If sys admins are reluctant to change the older mounted system settings, you should get a new mount point with actimeo set (30).&amp;nbsp;&amp;nbsp; Once these changes were made application was able to run smoothly with the application sleep set to little higher than actimeo timings.&amp;nbsp; Note, using actimeo sets all of acregmin, acregmax, acdirmin, and acdirmax to the same value. There is no default value.&amp;nbsp;See &lt;a href="http://unixhelp.ed.ac.uk/CGI/man-cgi?nfs+5"&gt;man pages&lt;/a&gt; for more details.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Cheers,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-1976554162482477038?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/1976554162482477038/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/07/nfs-cache-invisible-file-issue.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/1976554162482477038'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/1976554162482477038'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/07/nfs-cache-invisible-file-issue.html' title='NFS Cache - Invisible file issue'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-3901381456115120846</id><published>2010-06-16T18:12:00.000-07:00</published><updated>2010-06-17T16:11:36.367-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Weblog'/><category scheme='http://www.blogger.com/atom/ns#' term='RegEx'/><category scheme='http://www.blogger.com/atom/ns#' term='Hadoop'/><title type='text'>Hourly web log analysis through Hadoop</title><content type='html'>Many a time one may want to parse the weblogs for doing some quick analysis on AB tests or for &lt;a href="http://searchsecurity.techtarget.com/tip/0,289483,sid14_gci1354038_mem1,00.html"&gt;security/fraud alerts&lt;/a&gt; or recent advertisement or campaign analysis. There are many applications or utilities that perform web log analysis but more often than not regular expressions provide a powerful and elegant ways to analyse these logs and especially handy when one is dealing with massive and quickly rotating web logs.&amp;nbsp;&amp;nbsp; Check out&amp;nbsp;this &lt;a href="http://en.wikipedia.org/wiki/Web_analytics"&gt;wiki&lt;/a&gt; for more general info on web analytics.&lt;br /&gt;&lt;br /&gt;When each weblog is multi-gigabytes and moved to archive every couple of hours and there is farm of web servers (hundreds or thousands of web servers), many of the vendor or 3rd party applications don't scale up either. Hadoop streaming with simple utilities can provide insights into what otherwise would have been a costly experiment.&lt;br /&gt;&lt;br /&gt;Here is an regular expession used to extract HOUR and the string that is of interest from the Apache web logs. Each entry in the log has the format similar to&amp;nbsp;the ones below.&lt;br /&gt;&lt;hr /&gt;01/Jun/2010:&lt;span style="font-size: large;"&gt;&lt;strong&gt;07&lt;/strong&gt;&lt;/span&gt;:09:26 -0500] - 127.0.0.1 "GET /apache_pb.gif HTTP/1.1" status=200 size=2326 ab=test&lt;span style="font-size: large;"&gt;&lt;strong&gt;1AB&lt;/strong&gt;&lt;/span&gt; .... &lt;br /&gt;&lt;br /&gt;01/Jun/2010:&lt;strong&gt;&lt;span style="font-size: large;"&gt;07&lt;/span&gt;&lt;/strong&gt;:09:26 -0500] - 127.0.0.1 "GET /apache_pb.gif HTTP/1.1" status=200 size=2326 ab=test-&lt;strong&gt;&lt;span style="font-size: large;"&gt;2CD&lt;/span&gt;&lt;/strong&gt; ....&lt;br /&gt;&lt;hr /&gt;&lt;br /&gt;For more details on Apache log configuration, take a look at its &lt;a href="http://httpd.apache.org/docs/2.2/logs.html"&gt;specs&lt;/a&gt; and &lt;a href="http://httpd.apache.org/docs/2.2/mod/mod_log_config.html#customlog"&gt;custom logs&lt;/a&gt;.&amp;nbsp; Along with time of the day of the site hit, user IP address, http request (GET or POST), page requested, protocol used, one can configure the web server to log many more details including referer, user agent (browser), environment variables, etc.&lt;br /&gt;&lt;a name='more'&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;pre class="perl" name="code"&gt;m/(?&amp;lt;=\d{4}:)(\d{2})(?=:\d{2}:\d{2}\s+[-\d{4}]).*?ab=\D+(\d{1}[a-zA-Z]{2,})\b/;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Though the above regular expression may look cryptic, it is straight forward to extract hour and string of interest to us.&lt;br /&gt;&lt;br /&gt;(?&amp;lt;=\d{4}:)&amp;nbsp;&amp;nbsp;&amp;nbsp;=&amp;gt; Matches any four digit string (in this case year) FOLLOWED BY ":" and two digits&amp;nbsp;(HOUR) .&amp;nbsp; The beginning "(?&amp;lt;=" is a positive-look-behind&amp;nbsp;assertion engine anchors to look for.&amp;nbsp; If anywhere in the log line two digits appear without preceded by four digits (year) and colon (:), then it is a non-match.&lt;br /&gt;&lt;br /&gt;(\d{2})&amp;nbsp;&amp;nbsp;=&amp;gt; Matches two digit string (HOUR).&amp;nbsp; The parenthesis around the two digits activates storing these two digits in a special variable for future use.&amp;nbsp;&amp;nbsp;As&amp;nbsp;I would like to summarize at the hourly level, I concatenate these digits with other matched string (1AB) and create&amp;nbsp;simple key=value pair.&amp;nbsp;&amp;nbsp;Value is&amp;nbsp;simply an identity (1) which later then passed to hadoop reducer to count.&lt;br /&gt;&lt;br /&gt;(?=:\d{2}:\d{2}\s+[-\d{4}])&amp;nbsp;&amp;nbsp;=&amp;gt; Similar to positive-look-behind, this tells the regular expression engine to look-ahead for positive match from the 2 digit hour string.&amp;nbsp; "(?=" indicates from the current position look-ahead and get colon (:) followed by 2 digits (minutes), another colon (:) 2 digits (seconds) further followed by one or more spaces (\s+).&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; [-\d{4}]&amp;nbsp;&amp;nbsp;=&amp;gt;&amp;nbsp; Tries to matches optional ([ ]) "- and 4 digits". This is the token to&amp;nbsp;match time zone (-5000 above).&lt;br /&gt;&lt;br /&gt;Once the string string is matched, and hour is extracted by the engine, it then&lt;br /&gt;&lt;br /&gt;.*?&amp;nbsp;&amp;nbsp;=&amp;gt; matches any character (denoted by ".") followed by any thing by&amp;nbsp;zero or more (denoted by "*") other characters in non-greedy mode (denoted by "?") until it gets to "ab=" string.&amp;nbsp; Engine actually traces all the way to the end of string and then backtracks it to find "ab=" to meet the minimal match of ".*?".&lt;br /&gt;&lt;br /&gt;ab=&amp;nbsp; =&amp;gt;&amp;nbsp;environment variable that is of interest to us followed by&lt;br /&gt;&lt;br /&gt;\D+&amp;nbsp;&amp;nbsp;=&amp;gt; any non-digit character followed by&amp;nbsp;one or more&amp;nbsp;times until it trys to match&lt;br /&gt;&lt;br /&gt;(\d{1}[a-zA-Z]{2,})&amp;nbsp; =&amp;gt;&amp;nbsp;a string to match&amp;nbsp;that has single digit followed by&amp;nbsp;two or more&amp;nbsp;characters (a to z, lower or upper case) and store this match for future use&amp;nbsp;until &lt;br /&gt;&lt;br /&gt;\b&amp;nbsp; =&amp;gt;&amp;nbsp;a word boundary.&amp;nbsp; A boundary between a word character that includes "a to z, 1 to 9 and _".&amp;nbsp; You can visualize this as a thin line-boundary between word characters and any non-word characters like "." or "-" or "$", etc.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;In the parser application, you can concatenate first match (Hour - 07) and second matched string (Test string - 1AB) like "1AB_07" and set the value to "1".&lt;br /&gt;&lt;br /&gt;When the Hadoop mapper passes this to reducer, reducer simply has to count the 1s and to get sum of hourly hits of "1AB".&amp;nbsp; You can then load this data to some table&amp;nbsp;at (date and) hourly level and maintain its history for further analysis.&amp;nbsp; Similar analysis on web logs can yield IP level, page level and many other useful info.&lt;br /&gt;&lt;br /&gt;Note: While using regular expression it helps to understand how the backtracking technique works and its implications on performance.&amp;nbsp; Backtracking happens when one uses quantifiers like "*, *?, +, +?, {n,m}, and {n,m}?".&amp;nbsp;&amp;nbsp; Take a look at interesting articles &lt;a href="http://www.regular-expressions.info/catastrophic.html"&gt;here&lt;/a&gt; and &lt;a href="http://www.regexguru.com/2008/04/unintended-backtracking-can-bite-you/"&gt;here&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-3901381456115120846?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/3901381456115120846/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/06/hourly-web-log-analysis-through-hadoop.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/3901381456115120846'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/3901381456115120846'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/06/hourly-web-log-analysis-through-hadoop.html' title='Hourly web log analysis through Hadoop'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-8803257034397768134</id><published>2010-06-04T16:36:00.000-07:00</published><updated>2010-06-07T09:18:57.907-07:00</updated><title type='text'>Conversation Prism - An Image</title><content type='html'>As the social media, social networking, advertising, Internet marketing continue to evolve with  new technologies and many companies create their own social groups, it  all gets more complex and confusing.&amp;nbsp; Many a times a picture or an image will  explain more elegantly than 1000 words are more and in some cases image is the most suited tool to explain.&amp;nbsp; Here is one  image created by Brian Solis &amp;amp; Jesse Thomas that I like in this conversation!&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/_kzso6KW6Rx4/TAmNImNztoI/AAAAAAAACmk/_0Vm-2NbwSc/s1600/theConversationPrism.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="480" src="http://1.bp.blogspot.com/_kzso6KW6Rx4/TAmNImNztoI/AAAAAAAACmk/_0Vm-2NbwSc/s640/theConversationPrism.jpg" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;&lt;a name='more'&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Few other images that will help.&lt;br /&gt;&lt;br /&gt;From &lt;a href="http://www.socialmediatoday.com/SMC/162110"&gt;Dave Fleet&amp;nbsp;&amp;nbsp; &lt;/a&gt;&lt;br /&gt;&amp;nbsp;(http://www.socialmediatoday.com/SMC/162110)&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/_kzso6KW6Rx4/TAmRnO_MNzI/AAAAAAAACm0/nUHyXXMWfZo/s1600/corporate-media-ecosystem-dave-fleet.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="480" src="http://4.bp.blogspot.com/_kzso6KW6Rx4/TAmRnO_MNzI/AAAAAAAACm0/nUHyXXMWfZo/s640/corporate-media-ecosystem-dave-fleet.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;From FredCavazza.net&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/_kzso6KW6Rx4/TAmQVRBeAKI/AAAAAAAACms/NUiUlFco2hs/s1600/fredCavazza.net.SocialMedia.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="480" src="http://3.bp.blogspot.com/_kzso6KW6Rx4/TAmQVRBeAKI/AAAAAAAACms/NUiUlFco2hs/s640/fredCavazza.net.SocialMedia.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Enjoy,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-8803257034397768134?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/8803257034397768134/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/06/conversation-prism-image.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/8803257034397768134'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/8803257034397768134'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/06/conversation-prism-image.html' title='Conversation Prism - An Image'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/_kzso6KW6Rx4/TAmNImNztoI/AAAAAAAACmk/_0Vm-2NbwSc/s72-c/theConversationPrism.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-2518525453117293920</id><published>2010-05-29T15:17:00.000-07:00</published><updated>2010-06-07T09:24:15.969-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='RegEx'/><category scheme='http://www.blogger.com/atom/ns#' term='ETL'/><title type='text'>Email parsing with regular expressions</title><content type='html'>Emails always pose interesting challenges to load and handle for doing any kind of analysis with them - analysing number of users from email providers like Yahoo, Google, Hotmail, etc. or finding top country specific domains or users with specific email pattern.&lt;br /&gt;&lt;br /&gt;RFC 822, &lt;a href="http://tools.ietf.org/html/rfc2822"&gt;2822&lt;/a&gt; (April, 2001) and &lt;a href="http://tools.ietf.org/html/rfc5322"&gt;5322&lt;/a&gt; (October,2008) define the specification and syntax for text messages exchanged between computers.&amp;nbsp; The RFCs cover not just the email address specifications but also the envelop and contents.&amp;nbsp; Most of applications adhere to subset of these specs with more stringent email addresses.&amp;nbsp; For example, though address specs allow for ALPHA (a-z, A-Z), DIGITS (0-9) and special characters like "!", "#", "$", "%", etc. and even more special characters with quoted strings.&amp;nbsp; But typically email service providers won't allow you to create an email address with special characters like "$" or "!", etc.&amp;nbsp; For more spec details see section 3.4 and 3.4.1 in &lt;a href="http://tools.ietf.org/html/rfc5322"&gt;rfc5322&lt;/a&gt;.&amp;nbsp; &lt;a href="http://tools.ietf.org/html/rfc3696"&gt;RFC 3696&lt;/a&gt; explains in much easier terms email addresses, URI and HTTP URL.&lt;br /&gt;&lt;br /&gt;One can load email addresses in to staging area of data warehouse (DW) and then perform validation and cleaning through ETL's in-built utilities or components.&amp;nbsp; But to handle many of different possibilities and ability to quickly update or modify any new format encountered, it is better to validate and clean it through application programs before loading into staging or final tables.&amp;nbsp; In this way the load can also be distributed through file split and multi-threading or processes running on different systems.&lt;br /&gt;&lt;br /&gt;Following regular expression pulled from &lt;a href="http://regular-expressions.info/"&gt;regular-expressions.info&lt;/a&gt; handles all most all email addresses (more than 99.9%) and you can tweak to it suit your needs for performance or handling specific emails only. For example, you may not want to allow any email that has special characters like #, %, $, {, }, /, *, etc.&amp;nbsp; I have explained below in detail how this RE parses an email.&lt;br /&gt;&lt;br /&gt;&lt;pre class="perl" name="code"&gt;[a-z0-9!#$%&amp;amp;'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&amp;amp;'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+(?:[A-Z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|asia|jobs|museum)\b&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Same as above with color coding for further explanation:&lt;br /&gt;&lt;span style="background-color: #fff2cc; color: #274e13; font-size: small;"&gt;[a-z0-9!#$%&amp;amp;'*+/=?^_`{|}~-]&lt;/span&gt;&lt;b&gt;&lt;span style="background-color: #fff2cc; color: red; font-size: x-large;"&gt;+&lt;/span&gt;&lt;span style="background-color: #fff2cc; font-size: x-large;"&gt;(&lt;/span&gt;&lt;span style="font-size: x-large;"&gt;&lt;span style="background-color: #fff2cc; color: #f1c232;"&gt;?:&lt;/span&gt;&lt;/span&gt;&lt;/b&gt;&lt;span style="background-color: #fff2cc; color: blue;"&gt;&lt;b&gt;&lt;span style="font-size: x-large;"&gt;&lt;span style="color: red;"&gt;\.&lt;/span&gt;&lt;/span&gt;&lt;/b&gt;&lt;span style="color: #38761d; font-size: small;"&gt;[a-z0-9!#$%&amp;amp;'*+/=?^_`{|}~-]&lt;/span&gt;&lt;/span&gt;&lt;span style="font-size: x-large;"&gt;&lt;b style="background-color: #fff2cc; color: red;"&gt;+&lt;/b&gt;&lt;/span&gt;&lt;span style="background-color: #fff2cc; font-size: x-large;"&gt;)&lt;/span&gt;&lt;span style="font-size: x-large;"&gt;&lt;b&gt;&lt;span style="background-color: #fff2cc; color: purple;"&gt;*&lt;/span&gt;&lt;span style="color: magenta;"&gt;@&lt;/span&gt;&lt;/b&gt;&lt;/span&gt;&lt;span style="background-color: #cfe2f3;"&gt;(&lt;/span&gt;&lt;span style="background-color: #cfe2f3; color: #e69138;"&gt;?:&lt;/span&gt;&lt;span style="background-color: #cfe2f3; color: #3d85c6;"&gt;[a-z0-9]&lt;/span&gt;&lt;span style="background-color: #cfe2f3;"&gt;(&lt;/span&gt;&lt;span style="background-color: #cfe2f3; color: #e69138;"&gt;?:&lt;/span&gt;&lt;span style="background-color: #cfe2f3; color: #3d85c6;"&gt;[a-z0-9-]&lt;/span&gt;&lt;span style="background-color: #cfe2f3;"&gt;*&lt;/span&gt;&lt;span style="background-color: #cfe2f3; color: #3d85c6;"&gt;[a-z0-9]&lt;/span&gt;&lt;span style="background-color: #cfe2f3;"&gt;)&lt;/span&gt;&lt;b style="background-color: #cfe2f3;"&gt;&lt;span style="color: red;"&gt;&lt;span style="color: black;"&gt;?&lt;/span&gt;\.&lt;/span&gt;&lt;/b&gt;&lt;span style="background-color: #cfe2f3;"&gt;)&lt;span style="font-size: x-large;"&gt;&lt;b&gt;&lt;span style="color: red;"&gt;+&lt;/span&gt;&lt;/b&gt;&lt;/span&gt;(&lt;/span&gt;&lt;b style="background-color: #cfe2f3;"&gt;&lt;span style="color: #f1c232;"&gt;?:&lt;/span&gt;&lt;/b&gt;&lt;span style="background-color: #cfe2f3; color: magenta;"&gt;[A-Z]{2}&lt;/span&gt;&lt;span style="background-color: #cfe2f3; color: #20124d;"&gt;|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|asia|jobs|museum&lt;/span&gt;&lt;span style="background-color: #cfe2f3;"&gt;)&lt;/span&gt;&lt;b&gt;&lt;span style="color: red;"&gt;\b&lt;/span&gt;&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Address mainly has two parts - one before (part 1) "@" and one after it (part 2).&lt;br /&gt;&lt;br /&gt;&lt;span style="background-color: #fff2cc;"&gt;Part 1:&lt;/span&gt;&lt;br /&gt;&lt;ol&gt;&lt;li&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white;"&gt;&amp;nbsp;&lt;span style="color: #cc0000; font-size: small;"&gt;[a-z0-9!#$%&amp;amp;'*+/=?^_`{|}~-]&lt;/span&gt;&amp;nbsp; -- &lt;span style="color: black;"&gt;A set of characters i.e., between square brackets [ ], and in this case all letters "a to z" (either upper case or lower case) and numerals "0 to 9" and special characters &lt;span style="background-color: white; color: black;"&gt;"&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span style="background-color: white; color: black;"&gt;!#$%&amp;amp;'*+/=?^_`{|}~-" &lt;/span&gt;&lt;span style="background-color: #fff2cc; color: black;"&gt;&lt;span style="background-color: white;"&gt; are allowed&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white;"&gt;&lt;b&gt;&lt;span style="color: red;"&gt;+&amp;nbsp;&lt;/span&gt;&lt;/b&gt; --&amp;nbsp; &lt;span style="color: black;"&gt;One or more times of the above character set. Meaning "joe" or "!j!" or "abc" are all valid i.e.,&amp;nbsp; Alphabets and ! are repeated one or more times. But Part1.1 will not match "(abc)" or "joe&amp;lt;&amp;gt;" because "(", ")", "&amp;lt;", "&amp;gt;" are not in the above character set.&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white;"&gt;&lt;span style="color: black;"&gt;&amp;nbsp;(&amp;nbsp; -- Allow grouping also start remembering the matched string.&amp;nbsp; To avoid storing use "?:" right after "(".&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white;"&gt;&lt;span style="color: black;"&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;b style="color: #f1c232;"&gt;?:&lt;span style="color: black;"&gt; -- &lt;/span&gt;&lt;/b&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white; color: black;"&gt;Don't need to remember or store what is matched.&amp;nbsp; When grouping is done through "(", it is also remembered in variables $1, $2, $3, etc. With "?:" following "(" indicates that not to store it. This will help with the performance but if you want the matched string to use for further processing, then you can remove "&lt;b style="color: #f1c232;"&gt;?:&lt;/b&gt;" from above regular expression.&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white; color: black;"&gt;&lt;b&gt;&lt;span style="color: red;"&gt;\.&lt;/span&gt;&lt;/b&gt; --&amp;nbsp; It indicates "." can appear in email after bullet 1 above but "." has significance for the regular expression (RE) itself. In RE "." means any single character so we need to escape it and pass it "." down to parsing engine.&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt; &lt;span style="background-color: white;"&gt;&lt;span style="color: #cc0000;"&gt;[a-z0-9!#$%&amp;amp;'*+/=?^_`{|}~-]&lt;/span&gt;&amp;nbsp;&lt;/span&gt;&lt;span style="background-color: white; color: black;"&gt; &lt;/span&gt;&lt;/span&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white; color: black;"&gt;-- See 1 above.&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white; color: black;"&gt;&lt;b&gt;&lt;span style="color: red;"&gt;&amp;nbsp;+&lt;/span&gt;&lt;/b&gt;&amp;nbsp; -- See 2 above.&amp;nbsp; Also, if a "." appears then there must be at least one of characters in 6.&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white; color: black;"&gt;&lt;b style="color: #741b47;"&gt;*&lt;/b&gt;&amp;nbsp; -- Means zero or more times. In this case all characters in step 5, 6 &amp;amp; 7 can be optional. That is "joe.a" is valid but not "joe." &lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white; color: black;"&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;&lt;span style="color: magenta;"&gt;@ &lt;/span&gt;&lt;/b&gt;&lt;span style="color: black;"&gt;-- "@" sign is a must in the email addre&lt;span style="background-color: white;"&gt;ss&lt;/span&gt;&lt;/span&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white; color: black;"&gt;&lt;/span&gt;&lt;span style="background-color: white;"&gt;.&amp;nbsp; It should appear once.&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;/ol&gt;&lt;span style="background-color: #cfe2f3;"&gt;Part 2:&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Part 2A: &lt;span style="background-color: white;"&gt;(&lt;/span&gt;&lt;span style="background-color: white; color: #e69138;"&gt;?:&lt;/span&gt;&lt;span style="background-color: white; color: #3d85c6;"&gt;[a-z0-9]&lt;/span&gt;&lt;span style="background-color: white;"&gt;(&lt;/span&gt;&lt;span style="background-color: white; color: #e69138;"&gt;?:&lt;/span&gt;&lt;span style="background-color: white; color: #3d85c6;"&gt;[a-z0-9-]&lt;/span&gt;&lt;span style="background-color: white;"&gt;*&lt;/span&gt;&lt;span style="background-color: white; color: #3d85c6;"&gt;[a-z0-9]&lt;/span&gt;&lt;span style="background-color: white;"&gt;)&lt;/span&gt;&lt;b style="background-color: white;"&gt;&lt;span style="color: red;"&gt;&lt;span style="color: black;"&gt;?&lt;/span&gt;\.&lt;/span&gt;&lt;/b&gt;&lt;span style="background-color: white;"&gt;)&lt;b style="color: red;"&gt;+&lt;/b&gt;&lt;/span&gt;&lt;br /&gt;&lt;ol&gt;&lt;li&gt;&lt;span style="background-color: #cfe2f3;"&gt;&lt;span style="background-color: white;"&gt;&lt;b&gt;&lt;span style="color: red;"&gt;+&lt;/span&gt;&lt;/b&gt; -- All with the outer parenthesis must appear at least once.&amp;nbsp; It enforcing rule that after "@" sign in email, there must be at least one charcter.&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: #cfe2f3;"&gt;&lt;span style="background-color: white;"&gt;&lt;b style="color: #f1c232;"&gt;?:&lt;/b&gt; --&amp;nbsp; &lt;/span&gt;&lt;/span&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white; color: black;"&gt;Don't need to remember or store what is matched.&amp;nbsp; Helps in performance when dealing with millions of email parsing.&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: white; color: #274e13;"&gt;&lt;span style="background-color: white;"&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;span style="background-color: white; color: #3d85c6;"&gt;[a-z0-9]&lt;/span&gt;&lt;span style="background-color: #cfe2f3;"&gt;&lt;span style="background-color: white; color: #3d85c6;"&gt; &lt;/span&gt;&lt;span style="background-color: white;"&gt;-- First character after the "@" must be a alpha-numeral (a,b,c...z or 1,2,...9)&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: #cfe2f3;"&gt;&lt;span style="background-color: white;"&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;span style="background-color: white;"&gt;(&lt;/span&gt;&lt;span style="background-color: white; color: #e69138;"&gt;?:&lt;/span&gt;&lt;span style="background-color: white; color: #3d85c6;"&gt;[a-z0-9-]&lt;/span&gt;&lt;span style="background-color: white;"&gt;*&lt;/span&gt;&lt;span style="background-color: white; color: #3d85c6;"&gt;[a-z0-9]&lt;/span&gt;&lt;span style="background-color: white;"&gt;)&lt;/span&gt;&lt;b style="background-color: #cfe2f3;"&gt;&lt;span style="color: red;"&gt;&lt;span style="color: black;"&gt;&lt;span style="background-color: white;"&gt;?&amp;nbsp; &lt;/span&gt;&lt;span style="background-color: white;"&gt;-- &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/b&gt;&lt;span style="background-color: #cfe2f3;"&gt;&lt;span style="color: red;"&gt;&lt;span style="color: black;"&gt;&lt;span style="background-color: white;"&gt;"?" at the end indicates 0 or 1 time of alpha-numeral characters.&amp;nbsp; That is, after a first character in step 3 next characters must be alpha-numerals only but they are optional.&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white; color: black;"&gt;&lt;b&gt;&lt;span style="color: red;"&gt;\.&lt;/span&gt;&lt;/b&gt; -- See step 5 in part 1. &lt;/span&gt;&lt;/span&gt;&lt;br /&gt;&amp;nbsp;&lt;/li&gt;&lt;/ol&gt;&lt;br /&gt;&lt;span style="background-color: #fff2cc; color: #274e13;"&gt;&lt;span style="background-color: white; color: black;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Part 2B:&amp;nbsp; &lt;/span&gt;&lt;/span&gt;&lt;span style="background-color: white;"&gt;(&lt;/span&gt;&lt;b style="background-color: white;"&gt;&lt;span style="color: #f1c232;"&gt;?:&lt;/span&gt;&lt;/b&gt;&lt;span style="background-color: white; color: magenta;"&gt;[A-Z]{2}&lt;/span&gt;&lt;span style="background-color: white; color: #20124d;"&gt;|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|asia|jobs|museum&lt;/span&gt;&lt;span style="background-color: white;"&gt;)&lt;/span&gt;&lt;span style="background-color: white; color: black;"&gt;&lt;/span&gt;&lt;br /&gt;&lt;ol&gt;&lt;li&gt;&lt;span style="background-color: #cfe2f3;"&gt;&lt;span style="background-color: white;"&gt;&amp;nbsp; [A-Z]{2} -- Any two characters after the final ".". This covers country top level domains (TLD) like "uk" (for United Kingdom), "in" (India), "ca"(Canada), etc.&amp;nbsp; See this &lt;a href="http://www.iso.org/iso/english_country_names_and_code_elements"&gt;list of standard country codes&lt;/a&gt;.&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="background-color: #cfe2f3;"&gt;&lt;span style="background-color: white;"&gt;&amp;nbsp;| -- Is a "OR" operator.&amp;nbsp; It matches "com" or "org" or "museum", etc.&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;&lt;/ol&gt;&lt;span style="background-color: #cfe2f3;"&gt;&lt;span style="background-color: white;"&gt;And final &lt;b style="color: red;"&gt;"\b" &lt;/b&gt;&lt;span style="color: red;"&gt;&lt;span style="color: black;"&gt;at the end is a word boundary anchor.&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;During this validation and clean up you can also add length check on the email part 1 and 2.&amp;nbsp; RFCs specify of length 64 characters (octet) for part 1 (before @ character) and 255 characters (octet) after @ symbol.&amp;nbsp; These implementations can effectively handle more than 99.999% of emails.&lt;br /&gt;&lt;br /&gt;Let me know if you have different regular expression or different way of parsing emails.&lt;br /&gt;&lt;br /&gt;HTH,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-2518525453117293920?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/2518525453117293920/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/05/email-parsing-with-regular-expressions.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/2518525453117293920'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/2518525453117293920'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/05/email-parsing-with-regular-expressions.html' title='Email parsing with regular expressions'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-4108915604602384769</id><published>2010-05-20T17:45:00.000-07:00</published><updated>2010-06-02T13:59:27.657-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Talend'/><category scheme='http://www.blogger.com/atom/ns#' term='ETL'/><category scheme='http://www.blogger.com/atom/ns#' term='DW'/><title type='text'>Data Profiling - An example in Talend Profiler</title><content type='html'>Data is loaded into Data Warehouse (DW) from disparate systems and sometimes from external partners who have their own coding standards.&amp;nbsp; In any case, the quality of the data loaded into the data warehouse is often variable,and especially while discovering the data one may need to load some sample data and do some analysis including initial data profiling.&amp;nbsp; During this process one may discover differences which when resolved result in much smoother data flow along the process flow.&amp;nbsp;&amp;nbsp; Or it may be at later stage, say after the summarization is completed one may need to do some analysis on type of data.&amp;nbsp; In all these cases data profiling helps and Talend provides a tool (Talend Open Profiler - TOP) to quickly and efficiently perform profiling.&lt;br /&gt;&lt;br /&gt;Data profiling - the process of examining available data in different data sources, including databases, applications, files, data transfer from external systems etc., and collecting statistics and information - improves data quality and better reporting.&lt;br /&gt;&lt;br /&gt;In date dimension, we have nearly 220,000 rows covering Jan.01,1900 to Dec.31,2500 (7 hundred year dates) and one of the column is 'day_of_week_name' (cardinality 7 - MONDAY, TUESDAY....).&amp;nbsp; This table has 70 columns including date, weeks, months, names, etc. For testing purpose, I wanted to check the nulls and pattern frequency (distribution) for 'day_of_week_name' column.&lt;br /&gt;&lt;br /&gt;To do so, select the column to profile (day_of_week_name), drag and drop into "Analyzed columns" of "Analysis Settings" tab.&amp;nbsp; Then pick the indicators i.e., how you want the column measured (count, range, stats, etc.) and I picked row count and NULL count along with "Pattern Frequency Table".&amp;nbsp; Pattern frequency will count different patterns. The results in "Analysis Results" tab shows as below.&lt;br /&gt;&lt;br /&gt;&lt;a name='more'&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/_kzso6KW6Rx4/S_XRCVBBMRI/AAAAAAAACk0/SIMcarcum9I/s1600/Talend_profiler_may2010.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="460" src="http://2.bp.blogspot.com/_kzso6KW6Rx4/S_XRCVBBMRI/AAAAAAAACk0/SIMcarcum9I/s640/Talend_profiler_may2010.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;There were 219,146 row count with no NULLs and the pattern frequency indicates 31,307 of pattern AAAAAAAAA (9A Uppercase letters), 31,307 of 7A pattern, 62,613 of 8A pattern and 93,919 of 5A pattern.&lt;br /&gt;&lt;br /&gt;9A pattern count is of 'WEDNESDAY' rows, 5A pattern covers 'SUNDAY', 'MONDAY' and 'FRIDAY'.&amp;nbsp; Similarly for other days.&lt;br /&gt;&lt;br /&gt;You can also have your own 'UDI' - User Defined Indicators that can add more functionality to existing indicators.&amp;nbsp; You can build them in Java jar and import them.&amp;nbsp; But anytime you do processing in the profiler, the data needs to get transferred from database and possibly slowing down the profiling.&amp;nbsp; For smaller data set it may not be noticeable but any profiling on large tables can bog down either due to memory limitation and/or network delay.&lt;br /&gt;&lt;br /&gt;Profiler converts your indicators to appropriate query and runs it.&amp;nbsp; For example, for the above pattern match,&amp;nbsp;MySQL query looks like:&lt;br /&gt;&lt;br /&gt;&lt;pre class="sql" name="code"&gt;SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(`day_of_week_name`,'a','a'),'b','a'),'c','a'),'d','a'),'e','a'),'f','a'),'g','a'),'h','a'),'i','a'),'j','a'),'k','a'),'l','a'),'m','a'),'n','a'),'o','a'),'p','a'),'q','a'),'r','a'),'s','a'),'t','a'),'u','a'),'v','a'),'w','a'),'x','a'),'y','a'),'z','a'),'ç','a'),'â','a'),'ê','a'),'î','a'),'ô','a'),'û','a'),'é','a'),'è','a'),'ù','a'),'ï','a'),'ö','a'),'ü','a'),'A','A'),'B','A'),'C','A'),'D','A'),'E','A'),'F','A'),'G','A'),'H','A'),'I','A'),'J','A'),'K','A'),'L','A'),'M','A'),'N','A'),'O','A'),'P','A'),'Q','A'),'R','A'),'S','A'),'T','A'),'U','A'),'V','A'),'W','A'),'X','A'),'Y','A'),'Z','A'),'Ç','A'),'Â','A'),'Ê','A'),'Î','A'),'Ô','A'),'Û','A'),'É','A'),'È','A'),'Ù','A'),'Ï','A'),'Ö','A'),'Ü','A'),'0','9'),'1','9'),'2','9'),'3','9'),'4','9'),'5','9'),'6','9'),'7','9'),'8','9'),'9','9'), COUNT(*) c &lt;br /&gt;FROM `dw_mktg`.`dim_date` t  &lt;br /&gt;GROUP BY REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(`day_of_week_name`,'a','a'),'b','a'),'c','a'),'d','a'),'e','a'),'f','a'),'g','a'),'h','a'),'i','a'),'j','a'),'k','a'),'l','a'),'m','a'),'n','a'),'o','a'),'p','a'),'q','a'),'r','a'),'s','a'),'t','a'),'u','a'),'v','a'),'w','a'),'x','a'),'y','a'),'z','a'),'ç','a'),'â','a'),'ê','a'),'î','a'),'ô','a'),'û','a'),'é','a'),'è','a'),'ù','a'),'ï','a'),'ö','a'),'ü','a'),'A','A'),'B','A'),'C','A'),'D','A'),'E','A'),'F','A'),'G','A'),'H','A'),'I','A'),'J','A'),'K','A'),'L','A'),'M','A'),'N','A'),'O','A'),'P','A'),'Q','A'),'R','A'),'S','A'),'T','A'),'U','A'),'V','A'),'W','A'),'X','A'),'Y','A'),'Z','A'),'Ç','A'),'Â','A'),'Ê','A'),'Î','A'),'Ô','A'),'Û','A'),'É','A'),'È','A'),'Ù','A'),'Ï','A'),'Ö','A'),'Ü','A'),'0','9'),'1','9'),'2','9'),'3','9'),'4','9'),'5','9'),'6','9'),'7','9'),'8','9'),'9','9') &lt;br /&gt;ORDER BY c DESC LIMIT 10&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;In other DBMS like Oracle, PostgresSQL and DB2 it will be much smaller query since those systems provide TRANSLATE function.&lt;br /&gt;Profiler provides many built-in indicators like Soundex Frequency, Mean, Median, Mode, Range, Quartile Range, etc.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;HTH,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-4108915604602384769?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/4108915604602384769/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/05/data-profiling-example-in-talend.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/4108915604602384769'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/4108915604602384769'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/05/data-profiling-example-in-talend.html' title='Data Profiling - An example in Talend Profiler'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/_kzso6KW6Rx4/S_XRCVBBMRI/AAAAAAAACk0/SIMcarcum9I/s72-c/Talend_profiler_may2010.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-819302391495925646</id><published>2010-05-07T12:33:00.000-07:00</published><updated>2010-05-15T21:19:13.385-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='ETL'/><category scheme='http://www.blogger.com/atom/ns#' term='DW'/><category scheme='http://www.blogger.com/atom/ns#' term='SQL'/><title type='text'>Logging queries for performance monitoring</title><content type='html'>Before making any performance improvements, one need to measure it either you are trying to improve computer applications or DW queries, etc.&amp;nbsp; In one of the Data Warehouse subject area where I run more than 120K (120,00) queries everyday in an ETL environment, I log all queries to a table and track over time to see which queries are showing performance issue(s). With the following table, I was able to improve the performance by more than 50% some time. &amp;nbsp;For a query that repeatedly runs with different parameters in where clause, a small improvement adds up quickly and other times due to data skewing you can clearly see any changes needed to improve performance. &amp;nbsp; Other times any mistakes in indexes (DBA dropped it :) or application user modified their query, etc. you will have a reference point to check against why a particular query or queries are slow.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;table align="center" class="tb" padding="3px" style="width: 90%;"&gt;&lt;caption&gt; Query Performance Monitoring table definition&lt;/caption&gt; &lt;tbody&gt;&lt;tr&gt; &lt;th&gt;Column&lt;/th&gt;   &lt;th&gt;Null?&lt;/th&gt;  &lt;th&gt;Type&lt;/th&gt;  &lt;th&gt;Comment&lt;/th&gt; &lt;/tr&gt;&lt;/tbody&gt;&lt;tbody&gt;&lt;tr&gt; &lt;td&gt;ID&lt;/td&gt; &lt;td&gt;No&lt;/td&gt;  &lt;td&gt;int (11)&lt;/td&gt; &lt;td&gt;Auto Increment&lt;/td&gt;  &lt;/tr&gt;&lt;tr&gt; &lt;td&gt;RUN_NUMBER&lt;/td&gt;  &lt;td&gt;No&lt;/td&gt;  &lt;td&gt;tinyint&lt;/td&gt;  &lt;td&gt;Useful when job is recovered or&amp;nbsp;re-ran&lt;/td&gt;   &lt;/tr&gt;&lt;tr&gt; &lt;td&gt;APPLICATION&lt;/td&gt;   &lt;td&gt;No&lt;/td&gt;  &lt;td&gt;varchar(128)&lt;/td&gt;  &lt;td&gt;Application name to track&lt;/td&gt;  &lt;/tr&gt;&lt;tr&gt;  &lt;td&gt;DOMAIN&lt;/td&gt;  &lt;td&gt;Yes&lt;/td&gt;  &lt;td&gt;varchar(128)&lt;/td&gt;  &lt;td&gt;Application domain or subject&amp;nbsp;area&lt;/td&gt;   &lt;/tr&gt;&lt;tr&gt;  &lt;td&gt;QUERY&lt;/td&gt;       &lt;td&gt;No&lt;/td&gt;       &lt;td&gt;varchar(4096)&lt;/td&gt;       &lt;td&gt;Actual query&lt;/td&gt;     &lt;/tr&gt;&lt;tr&gt;  &lt;td&gt;DURATION&lt;/td&gt;  &lt;td&gt;No&lt;/td&gt;  &lt;td&gt;decimal(10,5)&lt;/td&gt;  &lt;td&gt;How long did query run?&lt;/td&gt;   &lt;/tr&gt;&lt;tr&gt;  &lt;td&gt;ROW_COUNT&lt;/td&gt;       &lt;td&gt;Yes&lt;/td&gt;       &lt;td&gt;int&lt;/td&gt;       &lt;td&gt;Number of rows affected&lt;/td&gt;     &lt;/tr&gt;&lt;tr&gt;  &lt;td&gt;RESULT&lt;/td&gt;   &lt;td&gt;No&lt;/td&gt;  &lt;td&gt;varchar(32)&lt;/td&gt; &lt;td&gt;Query result type - succeeded or&amp;nbsp;failed?&lt;/td&gt;  &lt;/tr&gt;&lt;tr&gt;  &lt;td&gt;COMMENT&lt;/td&gt;       &lt;td&gt;Yes&lt;/td&gt;       &lt;td&gt;varchar(128)&lt;/td&gt;       &lt;td&gt;User comment that can be logged with each query&lt;/td&gt;     &lt;/tr&gt;&lt;tr&gt;  &lt;td&gt;CREATED_DT&lt;/td&gt;  &lt;td&gt;No&lt;/td&gt;  &lt;td&gt;date&lt;/td&gt;  &lt;td&gt;Query run date&lt;/td&gt;     &lt;/tr&gt;&lt;tr&gt;  &lt;td&gt;CREATED_DT_TM&lt;/td&gt;       &lt;td&gt;No&lt;/td&gt;       &lt;td&gt;timestamp/ datetime&lt;/td&gt;       &lt;td&gt;Query run date and time&lt;/td&gt;     &lt;/tr&gt;&lt;tr&gt;  &lt;td&gt;CREATED_BY&lt;/td&gt;   &lt;td&gt;Yes&lt;/td&gt;    &lt;td&gt;varchar(64)&lt;/td&gt;  &lt;td&gt;Query user name&lt;/td&gt;  &lt;/tr&gt;&lt;/tbody&gt; &lt;/table&gt;&lt;span style="font-family: 'Roman Times';"&gt;&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;&lt;a name='more'&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;In the above table, one can use CREATED_DT_TM wherever CREATED_DT is needed while doing query performance analysis but most of these analyzes are done at date level (as most of DW ETL and summarizations are daily processes).  By having date column and indexed, the query analysis queries are much faster as there is no need to apply date function on each row.&lt;br /&gt;&lt;br /&gt;More interesting analysis including job failures, errors and query result set (ROW_COUNT) can be done with this data.  You can also analyze specific database query performances.  For example when a same query runs against different (mysql) databases (say, more than 100), some databases show performance getting worse due to table size or load, etc.&lt;br /&gt;&lt;br /&gt;Once you know that there is some query performance issue, you can take remedial action. &amp;nbsp;Let me know how you have implemented query performance monitoring?&lt;br /&gt;&lt;br /&gt;Cheers,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-819302391495925646?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/819302391495925646/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/05/logging-queries-for-performance.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/819302391495925646'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/819302391495925646'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/05/logging-queries-for-performance.html' title='Logging queries for performance monitoring'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-8763607613369319143</id><published>2010-05-06T15:05:00.000-07:00</published><updated>2010-05-15T11:39:05.745-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Analytics'/><category scheme='http://www.blogger.com/atom/ns#' term='Excel'/><category scheme='http://www.blogger.com/atom/ns#' term='Google'/><title type='text'>Google Analytics Plugin - Very useful one</title><content type='html'>&lt;strong&gt;Google Analytics (GA)&lt;/strong&gt; is&amp;nbsp;invaluable when you need to slice and dice web traffic&amp;nbsp;data along numerous dimensions.&amp;nbsp; For example, by date, time, location just to name few measuring visits, vistiors, pageviews, etc.&amp;nbsp; For full list of possible combinations take a look at &lt;a href="http://code.google.com/apis/analytics/docs/gdata/gdataReferenceValidCombos.html"&gt;this documentation.&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;While many business users and SEO/SEM analyts use GA on regular basis to manage their Internet Marketing spend and improve site traffic for better ROI, they can chart a single measure for different segments and analyse one metric at a time.&amp;nbsp;&amp;nbsp;Many a time users download their data from adwords or get GA data&amp;nbsp;through their in-house API setup.&amp;nbsp; With almost all business users having used Excel as one of their main analytical tool, they prefer it over other tools (either in-house or 3rd party). &amp;nbsp;Also, as quickly as things change in Internet Marketing (IM) world, some quick analysis business user can do on their own allows them to respond to quickly to market changes. &amp;nbsp;You may find bullet 3 and 4 &lt;a href="http://www.kaushik.net/avinash/2007/12/web-metrics-demystified.html"&gt;in this article &lt;/a&gt;&amp;nbsp;from Avinash Kaushik&amp;nbsp;relevant.&lt;br /&gt;&lt;br /&gt;&lt;a name='more'&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;One freeware plug-in that provides the solution is &lt;a href="http://excellentanalytics.com/"&gt;ExcellentAnalytics' plug-in for Excel&lt;/a&gt;.&amp;nbsp; &lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/_kzso6KW6Rx4/S-Mu9XcpxRI/AAAAAAAACkc/3pgAr-YkrBo/s1600/Excellent_Analtics_pic.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="400" src="http://4.bp.blogspot.com/_kzso6KW6Rx4/S-Mu9XcpxRI/AAAAAAAACkc/3pgAr-YkrBo/s400/Excellent_Analtics_pic.png" width="377" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;b&gt;&lt;span class="Apple-style-span" style="font-family: Georgia, 'Times New Roman', serif;"&gt;&lt;span class="Apple-style-span" style="color: #990000;"&gt;ExcellentAnalytics' Plug-in&lt;/span&gt;&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;It is pretty simple to use and an user can start using it immediately.&amp;nbsp; Once the data that you are interested in is downloaded through this plug-in, you can exploit Excel abilities to further analyze the data without any other (IT) application needed. &lt;br /&gt;&lt;br /&gt;After downloading and installing the plug-in, select "ExcellentAnalytics" menu at the top when you bring up MS Excel 2007 and click on "Account" button; then enter your GA email and password for it to connect to GA. &amp;nbsp;Click on "New Query" and select the profile of interest to you and select valid dimensions and metrics. &amp;nbsp;If it is an invalid combination or query, the result set does not return any and hopefully future release of this plug-in will provide some user friendly info.&lt;br /&gt;&lt;br /&gt;There were queries that didn't return right result sets. For example, "Visitor Recency" metrics like &amp;nbsp;"most visits the previous visits happened: X days ago". &amp;nbsp;Or under Filter section dimensions are assumed to be string type and any dimension that is a number can't be entered. For example, you selected "visit count" dimension and want to filter any visit count less than 5. &amp;nbsp;You can't. &amp;nbsp;Hey, it is a freeware and it does a decent job for quick analysis. &amp;nbsp;Use it with caution and having said that you can always check the numbers against GA's report to make sure the plug-in didn't mess up anything.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;&lt;span class="Apple-style-span" style="color: #990000;"&gt;Sample reports:&lt;/span&gt;&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/_kzso6KW6Rx4/S-NDK9KIf4I/AAAAAAAACks/kPFpgnh0mn0/s1600/GA-Daily-Report-1.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="514" src="http://1.bp.blogspot.com/_kzso6KW6Rx4/S-NDK9KIf4I/AAAAAAAACks/kPFpgnh0mn0/s640/GA-Daily-Report-1.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;HTH,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-8763607613369319143?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/8763607613369319143/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/05/google-analytics-plugin-very-useful-one.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/8763607613369319143'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/8763607613369319143'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/05/google-analytics-plugin-very-useful-one.html' title='Google Analytics Plugin - Very useful one'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/_kzso6KW6Rx4/S-Mu9XcpxRI/AAAAAAAACkc/3pgAr-YkrBo/s72-c/Excellent_Analtics_pic.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-8296928131750709991</id><published>2010-04-29T15:11:00.000-07:00</published><updated>2010-05-05T18:12:40.001-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='MySQL'/><title type='text'>Special characters in table names</title><content type='html'>In DW environment, we allow business user to upload data into new tables and run their own adhoc queries when needed.  The user interface allows the user to upload up to 10,000 (10K) rows in either comma or tab delimited format to a temporary database/table.  The user can specify the table name they want the data to upload to.  Many savvy business users can thus run their own SQL queries joining with the data in DW tables.&lt;br /&gt;&lt;br /&gt;All this works fine but many a times user provides table names with non-alphanumeric characters including space, forward or backward slash, dollar sign, etc.  At some point DBAs noted that managing these tables becoming an issue especially the tables names with special characters.  The following monitoring tool helped alleviate or eliminated the problem and also the UI was modified to check for any special characters and remove them before creating users' temporary tables.&lt;br /&gt;&lt;br /&gt;Since MySQL doesn't provide regular expression (&lt;a href="http://dev.mysql.com/doc/refman/5.0/en/regexp.html"&gt;RLIKE&lt;/a&gt; or REGEXP) in SHOW TABLE syntax one is forced to use UNIX or other language (&lt;a href="http://www.gnu.org/manual/gawk/html_node/index.html"&gt;awk&lt;/a&gt;, perl, python, etc.) for RE features. &lt;br /&gt;&lt;br /&gt;The following pulls all table names with special characters including $, \, space, ?, and @.  You can add other characters that are of interest to you to this set.&lt;br /&gt;&lt;pre class="perl" name="code"&gt;linux&amp;gt; mysql --host=hostname_or_ip  --database=db_name   --user=root  --password=your_pw  --execute="SHOW TABLES" | egrep [$\\\ ?!@]&lt;br /&gt;&lt;br /&gt;# This shows tables like (not including double quotes)&lt;br /&gt;"non_alphanum_table_name test it 2"&lt;br /&gt;"non_alphanum_table_name; test it"&lt;br /&gt;"non_alphanum_table_name; test it $"&lt;br /&gt;"non_alphanum_table_name; test it $ \"&lt;br /&gt;"non_alphanum_table_name_test_$_!_2"&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;a name='more'&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Here you have choice either delete those tables or store the name of these tables into a separate table called "ERROR_TABLE_NAME" and work with business users to later modify them.  Option 2 is friendly and professional and you won't annoy users like in option 1 :).&lt;br /&gt;&lt;br /&gt;Quickly zap an awk script like &lt;br /&gt;&lt;pre class="perl" name="code"&gt;{ print "INSERT INTO ERROR_TABLE_NAME(name) VALUES('" $0 "'); COMMIT;" }&lt;br /&gt;&lt;/pre&gt;to insert each table name into ERROR_TABLE_NAME table.&lt;br /&gt;&lt;br /&gt;You need to store the script in a file (say, error_tbl_name.awk) since at command line passing a single quote in the above string doesn't work.  When you pass a table name to MySQL, you need to quote the name like 'non_alphanum_table_name test it 2'.  But single quote is used by awk which was not easily escaped and errored out.&lt;br /&gt;&lt;br /&gt;Now piping all three commands above will result in &lt;br /&gt;&lt;pre class="perl" name="code"&gt;linux&amp;gt; mysql --host=hostname_or_ip  --database=db_name   --user=root  --password=your_pw  --execute="SHOW TABLES" | egrep [$\\\ ?!@] | awk -f error_tbl_name.awk&lt;br /&gt;&lt;/pre&gt;&lt;pre class="sql" name="code"&gt;-- The INSERT statements generated for each looks like&lt;br /&gt;INSERT INTO ERROR_TABLE_NAME(name) VALUES('non_alphanum_table_name test it 2'); commit;&lt;br /&gt;INSERT INTO ERROR_TABLE_NAME(name) VALUES('non_alphanum_table_name; test it'); commit;&lt;br /&gt;INSERT INTO ERROR_TABLE_NAME(name) VALUES('non_alphanum_table_name; test it $'); commit;&lt;br /&gt;INSERT INTO ERROR_TABLE_NAME(name) VALUES('non_alphanum_table_name; test it $ \\'); commit;&lt;br /&gt;INSERT INTO ERROR_TABLE_NAME(name) VALUES('non_alphanum_table_name_test_$_!_2'); commit;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Also, note you have commit after each insert, due to fact that the auto commit is turned off and when piping is done mysql command is executed for each table which will generate error otherwise.&lt;br /&gt;&lt;br /&gt;As final step execute&lt;br /&gt;&lt;br /&gt;&lt;pre class="perl" name="code"&gt;linux&amp;gt; mysql --host=hostname_or_ip  --database=db_name   --user=root  --password=your_pw  --execute="SHOW TABLES" | egrep [$\\\ ?!@] | awk -f error_tbl_name.awk |  mysql --host=hostname_or_ip  --database=db_name   --user=root  --password=your_pw&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Change these table names to new standard table names and send an email to business users with list of above table names.&lt;br /&gt;&lt;br /&gt;There you have it. &lt;br /&gt;HTH,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-8296928131750709991?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/8296928131750709991/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/04/special-characters-in-table-names.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/8296928131750709991'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/8296928131750709991'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/04/special-characters-in-table-names.html' title='Special characters in table names'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-4449165353201132706</id><published>2010-04-24T17:07:00.000-07:00</published><updated>2010-05-24T17:32:01.041-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='MySQL'/><title type='text'>Getting MySQL DB size</title><content type='html'>Many a times you want to find the size of database (schema) for performance monitoring or to allocate more disk space or to take some proactive action on db size.&amp;nbsp; You can pull relevant information from MySQL INFORMATION_SCHEMA with the following query.&amp;nbsp; It sums up each table data and its index size for all tables in a single schema.&amp;nbsp; &lt;br /&gt;&lt;br /&gt;&lt;pre class="sql" name="code"&gt;SELECT s.schema_name,&lt;br /&gt;  CONCAT(IFNULL(ROUND((SUM(t.data_length) +&lt;br /&gt;       SUM(t.index_length))/1024/1024,2),0.00),"Mb") total_size,&lt;br /&gt;  CONCAT(IFNULL(ROUND((SUM(t.data_length))/1024/1024,2),0.00),"Mb") data_size,&lt;br /&gt;  CONCAT(IFNULL(ROUND((SUM(t.index_length))/1024/1024,2),0.00),"Mb") index_size,&lt;br /&gt;  CONCAT(IFNULL(ROUND(((SUM(t.data_length)+SUM(t.index_length)) -&lt;br /&gt;         SUM(t.data_free))/1024/1024,2),0.00),"Mb") data_used,&lt;br /&gt;  CONCAT(IFNULL(ROUND(SUM(data_free)/1024/1024,2),0.00),"Mb") data_free,&lt;br /&gt;  IFNULL(ROUND((((SUM(t.data_length)+SUM(t.index_length)) -&lt;br /&gt;       SUM(t.data_free))/((SUM(t.data_length) + &lt;br /&gt;       SUM(t.index_length)))*100),2),0) pct_used,&lt;br /&gt;  SUM(t.table_rows) total_rows,&lt;br /&gt;  COUNT(t.table_name) total_tables,&lt;br /&gt;  CONCAT(ROUND(MAX(t.data_length+t.index_length)/1024/1024, 0.00), &lt;br /&gt;      "Mb") biggest_table_size,&lt;br /&gt;  GROUP_CONCAT(distinct t.engine) engines_used,&lt;br /&gt;  GROUP_CONCAT(distinct t.table_collation) collations_used&lt;br /&gt;FROM INFORMATION_SCHEMA.SCHEMATA s&lt;br /&gt;  LEFT JOIN INFORMATION_SCHEMA.TABLES t &lt;br /&gt;ON s.schema_name = t.table_schema&lt;br /&gt;WHERE s.schema_name = "test_dw_advertising"&lt;br /&gt;GROUP BY s.schema_name\G&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;*************************** 1. row ***************************&lt;br /&gt;       schema_name: test_dw_advertising&lt;br /&gt;        total_size: 212151.14Mb&lt;br /&gt;         data_size: 154660.83Mb&lt;br /&gt;        index_size: 57490.31Mb&lt;br /&gt;         data_used: 210039.45Mb&lt;br /&gt;         data_free: 2111.69Mb&lt;br /&gt;          pct_used: 99.00&lt;br /&gt;        total_rows: 309940227&lt;br /&gt;      total_tables: 118&lt;br /&gt;biggest_table_size: 74360Mb&lt;br /&gt;      engines_used: InnoDB,MyISAM&lt;br /&gt;   collations_used: latin1_swedish_ci&lt;br /&gt;1 row in set (1 min 38.26 sec)&lt;br /&gt;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The total size of allocated (including free data size) is nearly 212G, out of which data used ~155G and indexes used ~57G for 118 tables leaving 2G free (99% of disk allocated is used). Biggest table used 74.3G disk space and this schema has both InnoDB and MyISAM tables using single collation latin1_swedish_ci.&lt;br /&gt;&lt;br /&gt;HTH,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-4449165353201132706?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/4449165353201132706/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/04/getting-mysql-db-size.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/4449165353201132706'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/4449165353201132706'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/04/getting-mysql-db-size.html' title='Getting MySQL DB size'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-4135854561304629627</id><published>2010-04-08T18:14:00.000-07:00</published><updated>2010-04-29T15:24:37.429-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='ETL'/><title type='text'>Talend - Oracle - Java Path error</title><content type='html'>Few weeks ago on of our QA systems I installed Oracle 10g Express for testing and installation was smooth and all went well.&amp;nbsp; But recently, I needed to do some data profiling in a specific table in MySQL DW environment due to fact that some of queries were taking more than twice the amount of normal runs.&amp;nbsp; Specifically, one query that used to take 5-6 minutes churning through couple of million rows in a staging table started taking more than 15 minutes.&amp;nbsp; I suspected some data issue but needed to quickly run some tests knowing how the recent data profiles against earlier weeks data.&amp;nbsp; &lt;br /&gt;&lt;br /&gt;I downloaded Talend Data Profiler to this new test machine (where I had Oracle 10g) and tried to install it. &amp;nbsp; Soon after double clicking on exe file, I got an error indicating something like "org.talend.rcp.branding.top.product could not be found." in configuration\12725000123.log.&amp;nbsp; The log also has some more information regarding which Java version it is trying to use and command line options passed to run it.&amp;nbsp; The error itself doesn't indicate much and this wasted quite a bit of time.&lt;br /&gt;&lt;br /&gt;Going through manual of Talend I found out that profiler needs Java version 1.5 or later version.&amp;nbsp; But I was pretty sure that I had the latest Java environment on this box and checked twice by running java -version in cygwin and checking the path.&amp;nbsp; Also by visiting  &lt;a class="external free" href="http://www.java.com/" rel="nofollow" title="http://www.java.com"&gt;http://www.java.com&lt;/a&gt; then clicking on "Verify Installation" button.&amp;nbsp; They both indicated latest version.&lt;br /&gt;&lt;br /&gt;&lt;a name='more'&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;pre class="ruby" name="code"&gt;cygwin$ java -version&lt;br /&gt;java version "1.6.0_20"&lt;br /&gt;Java(TM) SE Runtime Environment (build 1.6.0_20-b02)&lt;br /&gt;Java HotSpot(TM) Client VM (build 16.3-b01, mixed mode, sharing)&lt;br /&gt;&lt;/pre&gt;&lt;pre class="ruby" name="code"&gt;cygwin$ which java&lt;br /&gt;/cygdrive/c/WINDOWS/system32/java&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Then on checking Windows "path" through&lt;br /&gt;&lt;i&gt;Start-&amp;gt;Settings-&amp;gt;Control Panel-&amp;gt;System-&amp;gt;Advanced Tab-&amp;gt;Environment Variables-&amp;gt;path,&amp;nbsp;&lt;/i&gt;&lt;br /&gt;found out that Oracle had stuck its own java directory (version 1.4.2) in the beginning which was throwing off Talend installation.&amp;nbsp; I simply moved the oracle's path variable values&lt;br /&gt;&lt;i&gt;(C:\oraclexe\app\oracle\product\10.2.0\server\bin;C:\OraHome_1\bin;C:\OraHome_1\jre\1.4.2\bin\client;C:\OraHome_1\jre\1.4.2\bin;)&amp;nbsp;&lt;/i&gt;&lt;br /&gt;to the end of path string and restarted the installation.&amp;nbsp; This time data profiler found the right Java and all went well.&lt;br /&gt;&lt;br /&gt;I wish Talend had thrown a better exception indicating which version of Java it needs and which version it found.&lt;br /&gt;HTH,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-4135854561304629627?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/4135854561304629627/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/04/talend-oracle-java-path-error.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/4135854561304629627'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/4135854561304629627'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/04/talend-oracle-java-path-error.html' title='Talend - Oracle - Java Path error'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-7009166527254400127</id><published>2010-03-29T15:37:00.000-07:00</published><updated>2010-04-29T15:44:08.424-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='SEM Yahoo'/><title type='text'>Yahoo SEM isBooksClosed</title><content type='html'>As a part of Internet Marketing and SEM, you will need to pull daily summary of keyword performance reports from search engines like Google, Yahoo and MSN.  &lt;br /&gt;&lt;br /&gt;In case of Yahoo, you should check "isBooksClosed" to get the correct data for your report dates; meaning the data you pulled will not change if isBooksClosed is true and Yahoo analytics has collected all the data for the date you requested the report.&lt;br /&gt;&lt;br /&gt;This is especially important if you are pulling the previous day data early in the morning as Yahoo may not have had time to get collect and complete the analytics.  We have run into many times books not closed condition for few accounts out of many tens of accounts.  We then need to repull the data at later time of the day and run the daily summaries.  &lt;br /&gt;&lt;br /&gt;I have reproduced the part of Yahoo explaination related to this from http://developer.searchmarketing.yahoo.com/docs/V6/gsg/BasicReportService.php#books_closed&lt;br /&gt;&lt;br /&gt;&lt;a name='more'&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;div style="color: red;"&gt;&lt;b&gt;Books Closed&lt;/b&gt;&lt;/div&gt;It is recommended that you wait until the books are closed before running a report. Use the isBooksClosed operation to periodically check the status of books. After the report is run, you may also check the information in the report header. See &lt;a href="http://developer.searchmarketing.yahoo.com/docs/V6/reference/reportDetails.php#header"&gt;Report Headers.&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;isBooksClosed = (Books Closed Time &amp;gt; Inclusive End Date)&lt;br /&gt;&lt;br /&gt;If the books closed time is greater than the inclusive end date, the full 24 hours of the report end date, then isBooksClosed is true and the data in the report is considered complete. If the books closed time is less than the inclusive end date, then isBooksClosed is false and the data in the report is considered incomplete.&lt;br /&gt;&lt;br /&gt;Say your report dates are 2009.11.01 to 2009.11.03 (November 1, 2009 to November 3,  2009) and you made a API request on Nov.4th morning 3am for an  account's "KeywordSummaryByDay" and there is a possibility that Yahoo  has not closed the book for Nov.3rd data.&amp;nbsp; By making a SOAP call to  BasicReportService with report name "isBooksClosedStatus" you be sure  whether data is finalized on Yahoo side for Nov.3rd.&amp;nbsp; I have seen  accounts or campaigns with tens of thousands of keywords not closed by  3am the next day.&amp;nbsp; So make sure that you are pulling the right data for  your DW.&lt;br /&gt;&lt;br /&gt;So, always make sure that you check for isBooksClosed.&lt;br /&gt;HTH,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-7009166527254400127?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/7009166527254400127/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/04/yahoo-sem-isbooksclosed.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/7009166527254400127'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/7009166527254400127'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/04/yahoo-sem-isbooksclosed.html' title='Yahoo SEM isBooksClosed'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-1562087418209735955</id><published>2010-03-26T14:18:00.000-07:00</published><updated>2010-04-29T15:50:31.319-07:00</updated><title type='text'>Google's extended searches</title><content type='html'>Internet search users are very well aware of Google search engine and typically they end up entering their query or keywords in the search box and click the resulting links of interest/relevant to them (typically first page results and especially among top 3 o 4 results).&amp;nbsp; Large percentage of users seem to be unaware of Google's extended search that can provide exact result he or she is looking for without having to hop to one more site. &lt;br /&gt;&lt;br /&gt;For example, weather for a particular city or current time at a location or dictionary or area code or sports scores and many more. &amp;nbsp; Following are few examples.&lt;br /&gt;&lt;br /&gt;For &lt;b&gt;weather&lt;/b&gt;: To find current weather at Boston, USA just enter keywords "weather Boston, USA" and you will see today and next few days weather in the first result!&lt;br /&gt;&lt;br /&gt;For &lt;b&gt;time&lt;/b&gt;: To find current time at Bangalore, India (or Bengaluru, India) enter "time Bengaluru, India" and first result is the time.&lt;br /&gt;&lt;br /&gt;For &lt;b&gt;dictionary&lt;/b&gt;: use "define" keyword first before entering your query like "define avatar"&lt;br /&gt;&lt;br /&gt;&lt;a name='more'&gt;&lt;/a&gt;&lt;br /&gt;For &lt;b&gt;stocks&lt;/b&gt;: This one is little too specific for many users because one need to know specific stock's ticker symbol and there more than 10,000 of the them in US. &amp;nbsp;It doesn't seem to work for international stocks.&lt;br /&gt;&lt;br /&gt;Area &lt;b&gt;code&lt;/b&gt;: Enter US 3 digit area code to get phone's area code info.&lt;br /&gt;&lt;br /&gt;Fill in the Blank: &amp;nbsp;My favourite when searching something for kids home work or a pop quiz. &amp;nbsp;Try "Einstein got Nobel prize in *" or "Earth circumference is * miles"&lt;br /&gt;&lt;br /&gt;There are many more that can save time and typing or mouse clicks! Check out &lt;a href="http://www.google.com/landing/searchtips/"&gt;Google's search tips&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;&lt;div style="margin: 0px;"&gt;OR&lt;/div&gt;&lt;div style="margin: 0px;"&gt;Better yet for all these queries just use&amp;nbsp;&lt;a href="http://www.wolframalpha.com/"&gt;"WolframAlpha knowledge engine".&lt;/a&gt;&lt;/div&gt;&lt;div style="margin: 0px;"&gt;&lt;br /&gt;&lt;/div&gt;Cheers,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-1562087418209735955?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/1562087418209735955/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/03/googles-extended-searches.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/1562087418209735955'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/1562087418209735955'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/03/googles-extended-searches.html' title='Google&apos;s extended searches'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-1461739217444643421</id><published>2010-03-16T13:43:00.001-07:00</published><updated>2010-03-16T14:01:48.217-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Data Warehouse'/><category scheme='http://www.blogger.com/atom/ns#' term='Yahoo'/><category scheme='http://www.blogger.com/atom/ns#' term='SEM'/><category scheme='http://www.blogger.com/atom/ns#' term='SQL'/><title type='text'>Yahoo SEM daily report - Currency and Dups handling</title><content type='html'>If you are pulling daily keyword performance data from major search engines to store in your in Data Warehouse, you need to perform many of transformation before having a daily, weekly or monthly summaries available for business (end) users.&amp;nbsp; In the case of Yahoo the required transformations that I needed to make are, one to handle the currency conversion and two to handle Yahoo sending more than one keyword when only one is expected. This has seem to happen when query or landing page url is modified for the same bid unit and for high impressions queries. Also, for disabled queries/keywords.&lt;br /&gt;&lt;br /&gt;The following sql handles both with data read from staging table and loaded into final table after some transformation and cleaning. It does currency conversion for all countries except US and Mexico (units are in US dollars) which do not need conversions.&lt;br /&gt;&lt;br /&gt;Sub-query below uses MySQL GROUP_CONCAT to find any duplicate keywords for the bid unit and dim_curreny has the daily currency exchange rates pulled from external third party services early in the morning.  se_query_group table has the grouping of the keywords based on company requirements and search engine.  Query also filters out any keyword that has zero impressions.&lt;br /&gt;&lt;br /&gt;The query can further be modified for better performance of using query_id by joining with data warehouse query tables and using query_id in the joins also using search engine bid unit as the surrogate key.&lt;br /&gt;&lt;br /&gt;&lt;a name='more'&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;pre class="sql" name="code"&gt;INSERT INTO INT_MKTG_KEYWORD_PERF&lt;br /&gt;    (se_date, search_engine, account_id, campaign_id, ad_group_id, keyword_id&lt;br /&gt;    ,domain, url, query_group_id&lt;br /&gt;    ,query, query_num_2&lt;br /&gt;    ,impressions, clicks, ctr&lt;br /&gt;    ,cpc_orig, cpc&lt;br /&gt;    ,revenue_orig, revenue&lt;br /&gt;    ,cost_orig, cost&lt;br /&gt;    ,num_assist&lt;br /&gt;    ,avg_position )&lt;br /&gt;SELECT KP.se_date, KP.search_engine, KP.account_id, KP.campaign_id&lt;br /&gt;       ,KP.ad_group_id, KP.keyword_id, &lt;br /&gt;       ,KP.domain, KP.url, KP.query_group_id&lt;br /&gt;       ,CASE&lt;br /&gt;           WHEN query2 NOT RLIKE  '---'  THEN  query2                        # Normal single kw&lt;br /&gt;           WHEN query2 RLIKE  '---'  THEN  substring_index(query2, '---', 1) # Get 1st kw&lt;br /&gt;       END query2&lt;br /&gt;       ,CASE&lt;br /&gt;           WHEN query2 RLIKE  '---'  THEN  substring_index(substring_index(query2, '---', 2), '---', -1)&lt;br /&gt;       END  query_num_2                                            # Get 2nd query&lt;br /&gt;       ,impressions&lt;br /&gt;       ,clicks&lt;br /&gt;       ,ctr&lt;br /&gt;       ,cpc cpc_orig&lt;br /&gt;       ,CASE&lt;br /&gt;           WHEN  C.country_code not in ('US', 'MX') THEN cpc / C.currency_rate&lt;br /&gt;           ELSE cpc&lt;br /&gt;       END cpc&lt;br /&gt;       ,revenue revenue_orig&lt;br /&gt;       ,CASE&lt;br /&gt;           WHEN C.country_code not in ('US', 'MX') THEN revenue / C.currency_rate&lt;br /&gt;           ELSE revenue&lt;br /&gt;       END revenue&lt;br /&gt;       ,cost cost_orig&lt;br /&gt;       ,CASE&lt;br /&gt;           WHEN C.country_code  not in ('US', 'MX') THEN cost / C.currency_rate&lt;br /&gt;           ELSE cost&lt;br /&gt;       END cost&lt;br /&gt;       ,CASE&lt;br /&gt;           WHEN avg_position &amp;gt; 0 and avg_position &amp;lt; 1 THEN 1&lt;br /&gt;           ELSE avg_position&lt;br /&gt;       END avg_position&lt;br /&gt;     FROM (&lt;br /&gt;          SELECT se_date, search_engine&lt;br /&gt;              ,account_id, campaign_id, ad_group_id, keyword_id, &lt;br /&gt;              ,domain, url, query_group_id, query&lt;br /&gt;              ,GROUP_CONCAT(query SEPARATOR '---') query2&lt;br /&gt;              ,SUM(impressions) impressions, SUM(clicks) clicks&lt;br /&gt;              ,AVG(ctr) ctr, AVG(cpc) cpc,&amp;nbsp;&lt;br /&gt;              ,AVG(revenue) revenue,&amp;nbsp;&lt;br /&gt;              ,SUM(cost) cost, AVG(avg_position) avg_position&lt;br /&gt;          FROM stg_keyword_perf&lt;br /&gt;          WHERE se_date &amp;gt;=  '&amp;lt;date&amp;gt;' #-- Date here&lt;br /&gt;          GROUP BY se_date ,account_id ,campaign_id ,ad_group_id ,keyword_id&lt;br /&gt;          ) KP #- stg_kw_perf&lt;br /&gt;     JOIN se_query_group QG&lt;br /&gt;       ON KP.query_group_id = QG.query_group_id AND KP.search_engine = QG.search_engine&lt;br /&gt;     JOIN dim_currency C &lt;br /&gt;       ON KP.se_date = C.currency_date AND QG.country_code = C.country_code&lt;br /&gt;     WHERE KP.se_date = '&amp;lt;date&amp;gt;'   #-- Date here&lt;br /&gt;       AND KP.impressions &amp;lt;&amp;gt; 0&lt;br /&gt;      ;&lt;br /&gt;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;HTH,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-1461739217444643421?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/1461739217444643421/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/03/yahoo-sem-daily-report-currency-and.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/1461739217444643421'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/1461739217444643421'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/03/yahoo-sem-daily-report-currency-and.html' title='Yahoo SEM daily report - Currency and Dups handling'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-6230891984630514046</id><published>2010-03-15T17:41:00.000-07:00</published><updated>2010-03-15T17:47:12.740-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='MySQL'/><category scheme='http://www.blogger.com/atom/ns#' term='Ruby'/><title type='text'>Ruby libmysql.dll or mysql_api.so error</title><content type='html'>Installing Ruby MySQL module is pretty straight forward either on *nix or Windows by running &lt;br /&gt;&lt;pre class="ruby" name="code"&gt;&amp;gt; gem  install  mysql&lt;br /&gt;&lt;/pre&gt;But on Windows when you try to import the module through "require mysql" you may get an error like &lt;br /&gt;&lt;pre class="ruby" name="code"&gt;C:/Ruby/lib/ruby/gems/1.8/gems/mysql-2.8.1-x86-mswin32/lib/1.8/mysql_api.so: 126: The specified module could not be found.   - C:/Ruby/lib/ruby/gems/1.8/gems/mysql-2.8.1-x86-mswin32/lib/1.8/mysql_api.so (LoadError)&lt;br /&gt;#  from C:/Ruby/lib/ruby/site_ruby/1.8/rubygems/custom_require.rb:31:in `require'&lt;br /&gt;...&lt;br /&gt;...&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;In that scenario, you are missing "libmysql.dll" in the system32 directory.&amp;nbsp;  In my case I had MySQL server copy and had the dll in folder C:/Program Files/MySQL/MySQL Server 5.4/lib/opt&lt;br /&gt;&lt;br /&gt;&lt;a name='more'&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;From cygwin window do&lt;br /&gt;&lt;pre class="perl" name="code"&gt;&amp;gt; cp  /cygdrive/c/Program\ Files/MySQL/MySQL\ Server\ 5.4/lib/opt/libmysql.dll   /cygdrive/c/WINDOWS/system32/libmysql.dll&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Now run your Ruby db test program&lt;br /&gt;&lt;pre class="ruby" name="code"&gt;require "mysql"&lt;br /&gt;&lt;br /&gt;begin&lt;br /&gt;  dbh = Mysql.real_connect("db_server_host", "user_name", "password", "database_name")&lt;br /&gt;  puts "Server version:" + dbh.get_server_info&lt;br /&gt;rescue Mysql::Error =&amp;gt; err&lt;br /&gt;  puts "Error code: #{err.errno}"&lt;br /&gt;  puts "Error: #{err.error}"&lt;br /&gt;ensure&lt;br /&gt;   dbh.close if dbh&lt;br /&gt;end&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;HTH,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-6230891984630514046?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/6230891984630514046/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/03/ruby-libmysqldll-or-mysqlapiso-error.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/6230891984630514046'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/6230891984630514046'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/03/ruby-libmysqldll-or-mysqlapiso-error.html' title='Ruby libmysql.dll or mysql_api.so error'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-3227782676487079407.post-4940634013736510751</id><published>2010-03-12T17:24:00.000-08:00</published><updated>2010-03-16T17:58:48.175-07:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='SEO'/><category scheme='http://www.blogger.com/atom/ns#' term='SQL'/><title type='text'>Mapping URL to top level domain</title><content type='html'>When parsing web logs as part of web analytics, one might be interested in the top level domain analysis before drilling down further.&amp;nbsp; Following query maps an url (referer) to top level domain.&amp;nbsp; Top level domain include not only generic ones like '.com', '.edu', '.org', etc but also country level top levels like '.uk', '.de', etc.&amp;nbsp; You can download the list from &lt;a href="http://www.iana.org/domains/root/db/"&gt;IANA (Internet Assigned Numbers Authority)&lt;/a&gt; .&lt;br /&gt;&lt;br /&gt;After downloading it import into a table dim_int_mktg_tld which you can join with the staging data.&lt;br /&gt;&lt;pre name="code" class="sql"&gt;CREATE TABLE `dim_int_mktg_tld` (&lt;br /&gt;  `tld` varchar(16) NOT NULL,&lt;br /&gt;  `type` varchar(32) default 'country-code',  #-- Can be country-code or generic&lt;br /&gt;  `info` varchar(500) default NULL,&lt;br /&gt;  PRIMARY KEY  (`tld`)&lt;br /&gt;) ENGINE=InnoDB DEFAULT CHARSET=latin1&lt;br /&gt;&lt;br /&gt;#--  Join with yesterdays staging data and get TLD&lt;br /&gt;SELECT WL.url, TLD.tld&lt;br /&gt;FROM stg_weblog_date WL&lt;br /&gt;LEFT OUTER JOIN dim_int_mktg_tld TLD&lt;br /&gt;ON substring_index(&lt;br /&gt;     substring_index(&lt;br /&gt;       substring_index(url, '://', -1),&lt;br /&gt;       '/',1), &lt;br /&gt;    '.',-1) = TLD.tld&lt;br /&gt;WHERE WL.date = DATE_ADD(CURRENT_DATE, interval -1 day)&lt;br /&gt;&lt;br /&gt;#-- Example: http://www.abc.co.au/in/test&lt;br /&gt;#-- Innermost substring_index returns everything to the right of '://' &lt;br /&gt;#--   i.e, www.abc.co.au/in/test,&lt;br /&gt;#-- then the middle one gets everything to the left of first '/'&lt;br /&gt;#--   i.e, www.abc.co.au&lt;br /&gt;#-- then the outer most substring_index returns everything to the right of last '.'&lt;br /&gt;#--   i.e, au&lt;br /&gt;#-- which is then joined with TLD table&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;HTH,&lt;br /&gt;Shiva&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3227782676487079407-4940634013736510751?l=www.hiregion.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://www.hiregion.com/feeds/4940634013736510751/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.hiregion.com/2010/03/mapping-url-to-top-level-domain.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/4940634013736510751'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/3227782676487079407/posts/default/4940634013736510751'/><link rel='alternate' type='text/html' href='http://www.hiregion.com/2010/03/mapping-url-to-top-level-domain.html' title='Mapping URL to top level domain'/><author><name>Shiva</name><uri>http://www.blogger.com/profile/02008516387150084750</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry></feed>
