projects:digikey_partsdb
Differences
This shows you the differences between two versions of the page.
Next revision | Previous revision | ||
projects:digikey_partsdb [2013/10/12 08:34] – created charliex | projects:digikey_partsdb [2013/10/13 10:11] (current) – charliex | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | === digikey parts slurper === | ||
+ | |||
+ | fetch www.digikey.com/ | ||
+ | |||
+ | grep for **catfilterlink** | ||
+ | |||
+ | remove everything from the beginning of the line up to and including the first **"** | ||
+ | |||
+ | remove everything from the next **"** (inclusive) to the end of the line | ||
+ | |||
+ | == produces following info == | ||
* http:// | * http:// | ||
Line 859: | Line 870: | ||
* http:// | * http:// | ||
* http:// | * http:// | ||
- | | + | |
+ | == grabbing FV's == | ||
+ | |||
+ | We need the FVs to crawl each subsection. Grab all of the above URLs, making sure Results per Page = 500. The CSV download is capped at 500 results per fetch, so there is no point in increasing this value. | ||
+ | |||
+ | | ||
+ | |||
+ | also grab the total page count | ||
+ | |||
+ | * <a class=" | ||
+ | |||
+ | The page/ | ||
+ | |||
+ | grab the FV value and page count, and store for each of the above URL' | ||
+ | |||
+ | == crawl individual pages == | ||
+ | |||
+ | Run curl with a valid user agent; I used --user-agent " | ||
+ | |||
+ | < | ||
+ | curl.exe -o page%1.csv -L -v -G " | ||
+ | </ | ||
+ | |||
+ | |||
+ | |||
+ | The response has 4 bytes at the front that we don't want, so use a simple byteskip script or piece of code to strip them. | ||
+ | |||
+ | < | ||
+ | |||
+ | #include < | ||
+ | #include < | ||
+ | |||
+ | int main(int argc, | ||
+ | { | ||
+ | FILE *fp,*ofp; | ||
+ | |||
+ | if( argc < 4 ) { | ||
+ | fprintf(stderr," | ||
+ | exit(-1); | ||
+ | } | ||
+ | |||
+ | fp =fopen( argv[1], | ||
+ | if( fp == NULL ) { | ||
+ | fprintf(stderr," | ||
+ | exit(-2); | ||
+ | } | ||
+ | |||
+ | unsigned long length ; | ||
+ | |||
+ | fseek(fp, | ||
+ | |||
+ | length = ftell( fp ) ; | ||
+ | |||
+ | |||
+ | if( length == 0 ) { | ||
+ | |||
+ | fclose( fp ); | ||
+ | |||
+ | fprintf(stderr," | ||
+ | exit(-3); | ||
+ | } | ||
+ | |||
+ | unsigned long offset; | ||
+ | |||
+ | //skip offset | ||
+ | offset = strtoul (argv[3], NULL, 0); | ||
+ | |||
+ | if( offset >= length ){ | ||
+ | |||
+ | fclose( fp ); | ||
+ | |||
+ | fprintf(stderr," | ||
+ | exit(-5); | ||
+ | } | ||
+ | |||
+ | // set to skip position | ||
+ | fseek(fp, | ||
+ | |||
+ | unsigned char *buffer = NULL; | ||
+ | |||
+ | buffer = (unsigned char *)malloc( length - offset ); | ||
+ | if( buffer == NULL ) { | ||
+ | |||
+ | fclose(fp); | ||
+ | |||
+ | fprintf(stderr," | ||
+ | exit(-6); | ||
+ | } | ||
+ | |||
+ | // read whole buffer. | ||
+ | if( fread(buffer, | ||
+ | fclose(fp); | ||
+ | fprintf(stderr," | ||
+ | exit(-7); | ||
+ | |||
+ | } | ||
+ | |||
+ | // open output file for writing. | ||
+ | ofp = fopen( argv[2], | ||
+ | |||
+ | if( ofp == NULL ) { | ||
+ | fclose(fp); | ||
+ | |||
+ | free( buffer ); | ||
+ | buffer = NULL; | ||
+ | fprintf(stderr," | ||
+ | exit(-8); | ||
+ | } | ||
+ | |||
+ | if( fwrite(buffer, | ||
+ | fclose(fp); | ||
+ | fclose(ofp); | ||
+ | fprintf(stderr," | ||
+ | exit(-9); | ||
+ | } | ||
+ | |||
+ | free( buffer ); | ||
+ | |||
+ | fclose(fp); | ||
+ | fclose(ofp); | ||
+ | |||
+ | |||
+ | return 0; | ||
+ | } | ||
+ | </ | ||
+ | |||
+ | Process all the files. | ||
+ | |||
+ | < | ||
+ | for %a in (*.csv) do byteskip %a o%a 4 | ||
+ | </ | ||
+ | |||
+ | I used one of the online CSV-to-MySQL converters, but most of them can't handle the variations in the CSV files. To create the initial schema for each table, I converted one CSV to XLS by importing it into Google Docs and re-exporting it as XLS, then imported that file into phpMyAdmin, which creates the base schema.< | ||
+ | |||
+ | Rename the table in phpMyAdmin or via the mysql command-line tool. | ||
+ | |||
+ | Then do the final import with the csvtosql tool (in progress). | ||
+ | |||
+ |
projects/digikey_partsdb.1381592079.txt.gz · Last modified: 2013/10/12 08:34 by charliex