tutorial/ 0000755 0000765 0000765 00000000000 10327247041 013143 5 ustar brian brian 0000000 0000000 tutorial/blocked_ordered.py 0000755 0000765 0000765 00000001366 10327247041 016635 0 ustar brian brian 0000000 0000000 import metakit, random, time
# Benchmark for a blocked + ordered Metakit view: time a bulk load of one
# million rows, then time random key lookups against the loaded view.
st = metakit.storage("test.mk", 1)

# create a blocked view and
# order the view on the first integer value
vw = st.getas("large_view[_B[key:I,data:B]]").blocked().ordered(1)

t1 = time.time()
for i in range(1000000):
    vw.append((i, str(i)))
    if i % 10000 == 0:
        # commit every 10000 entries
        print(i)
        st.commit()
t2 = time.time()
# The final commit runs after t2 was taken, so it is not part of the
# reported load time (the periodic commits inside the loop are).
st.commit()
# Single-argument print(...) with %-formatting produces the same text as the
# Python 2 print statement and also parses under Python 3.
print("%s seconds to load %s entries" % (t2 - t1, len(vw)))

# now test lookup times
size = len(vw)
# 1000 random keys in the range [0, size)
lookup = [int(random.random() * size) for _ in range(1000)]

t1 = time.time()
for key in lookup:
    # the result is discarded on purpose: only the elapsed time matters here
    vw.find(key=key)
t2 = time.time()
print("%s seconds to look up %s entries" % (t2 - t1, len(lookup)))
print("or %s seconds per lookup" % ((t2 - t1) / len(lookup)))
tutorial/filter_remove.py 0000755 0000765 0000765 00000000617 10327247041 016366 0 ustar brian brian 0000000 0000000 """Indicies Test"""
import metakit
# Exercise view.filter together with view.remapwith and view.remove.
st = metakit.storage()
vw = st.getas("test[id:I]")

# populate rows with the ids 0..9999
for n in range(10000):
    vw.append(n)


def id_at_least_ten(row):
    """Filter predicate: true for rows whose id is 10 or more."""
    return row.id >= 10


# remapping the base view through the filter result must yield only the
# rows that the predicate accepted
indexvw = vw.filter(id_at_least_ten)
resultvw = vw.remapwith(indexvw)
for row in resultvw:
    assert row.id >= 10

# now drop those same rows from the base view
indexvw = vw.filter(id_at_least_ten)
vw.remove(indexvw)

# make sure that there are only 10 entries
assert len(vw) == 10
tutorial/flatten.py 0000755 0000765 0000765 00000001020 10327247041 015146 0 ustar brian brian 0000000 0000000 import metakit
# Show two ways of listing (name, group) pairs from a view with a nested
# subview: explicit nested loops, and view.flatten on the subview property.
storage = metakit.storage()
# The description needs two closing brackets: one for the nested
# "affiliation[...]" subview and one for the outer "person[...]" view.
vw = storage.getas("person[name:S,affiliation[group:S]]")
vw.append(name="Frank")
vw.append(name="Bill")
vw[0].affiliation = [{"group": "Physics"}, {"group": "Chemistry"}]
vw[1].affiliation = [{"group": "Biology"}, {"group": "Theatre"}]
metakit.dump(vw)

# Single-argument print(...) with %-formatting produces the same text as the
# Python 2 print statement and also parses under Python 3.
print("table using two loops")
for row in vw:
    for affiliation in row.affiliation:
        print("%s %s" % (row.name, affiliation.group))
print("")

print("table using flatten")
for row in vw.flatten(vw.affiliation):
    print("%s %s" % (row.name, row.group))
tutorial/hash.py 0000755 0000765 0000765 00000000426 10327247041 014445 0 ustar brian brian 0000000 0000000 import metakit
# Demonstrate a hash mapping view keyed on the first property ("key").
storage = metakit.storage()
view = storage.getas("test[key:S,value1:F,value2:B]")
# separate view that the hash layer uses as its map
hashvw = storage.getas("__test_hash__[_H:I,_R:I]")
view = view.hash(hashvw, 1)

# two entries share the key "king"; the third differs only in letter case
for entry in (("king", 2.0), ("king", 4.0), ("King", 3.0)):
    view.append(entry)

metakit.dump(view)
tutorial/index_test.py 0000755 0000765 0000765 00000001654 10327247041 015674 0 ustar brian brian 0000000 0000000 """Indicies Test"""
import metakit
# Check that view.indices can locate rows of the whole view, of a slice,
# and of a select(lo, hi) range.
st = metakit.storage()
vw = st.getas("test[a:I]")

# populate rows
for value in range(10000):
    vw.append(value)

# indices of the view within itself
indices = vw.indices(vw)
assert indices[0].index != -1, "Failed retrieving index of view"

# test the slicing operator (reported only, not asserted)
subset = vw[0:10]
indices = vw.indices(subset)
print("This table should not be populated with -1's")
metakit.dump(indices)
# the indices should not be -1!!!
if indices[0].index != -1:
    print("vw[0:10] works okay with vw.indices")
else:
    print("Failed retrieving index of subset")
print("")

# test the select operator
subset = vw.select(0, 9)
indices = vw.indices(subset)
print("This table should not be populated with -1's")
metakit.dump(indices)
# the indices should not be -1!!!
assert indices[0].index != -1, "Failed retrieving index of subset"
# With assertions enabled the failure branch below cannot run, because the
# assert above has already checked the same condition.
if indices[0].index != -1:
    print("vw.select(lo,hi) works ok with vw.indices")
else:
    print("Failed retrieving index of subset")
tutorial/join.py 0000755 0000765 0000765 00000000724 10327247041 014462 0 ustar brian brian 0000000 0000000 import metakit
# Case 1: both views already share the property "a", so they can be joined
# on it directly.
storage = metakit.storage()
vw = storage.getas("test1[a:I,b:S]")
vw2 = storage.getas("test2[a:I,c:S]")
vw.append((0, "view1"))
for entry in ((0, "view2"), (1, "view2")):
    vw2.append(entry)
metakit.dump(vw.join(vw2, vw.a))

# Case 2: the second view calls its key "aa"; derive a view with that
# property renamed to "a" and join against the renamed view instead.
storage = metakit.storage()
vw = storage.getas("test1[a:I,b:S]")
vw2 = storage.getas("test2[aa:I,c:S]")
vw.append((0, "view1"))
for entry in ((0, "view2"), (1, "view2")):
    vw2.append(entry)
renamed = vw2.rename('aa', 'a')
metakit.dump(vw.join(renamed, vw.a))
tutorial/ordered.py 0000755 0000765 0000765 00000000332 10327247041 015142 0 ustar brian brian 0000000 0000000 import metakit
# Fill an ordered view with ten rows whose keys differ only in letter case,
# then dump it to show the resulting row order.
storage = metakit.storage()
view = storage.getas("test[key:S,value1:F]").ordered()
keys = ("king", "KING")  # even i -> "king", odd i -> "KING"
for i in range(10):
    view.append((keys[i % 2], i))
metakit.dump(view)
tutorial/python.html 0000755 0000765 0000765 00000241233 10327247041 015362 0 ustar brian brian 0000000 0000000
Metakit for Python
Metakit for Python
The structured database which fits in the palm of your hand
[ Terminology | Installation
| Getting started | Mk4py
Reference
]
Buzzwords - Metakit is an embeddable
database
which runs on Unix, Windows, Macintosh, and other platforms. It lets
you build applications which store their data efficiently, in a
portable way, and which will not need a complex runtime installation.
In terms of the data model, Metakit takes the middle ground between
RDBMS, OODBMS, and flat-file databases - yet it is quite different from
each of them.
Technology - Everything is stored variable-sized yet with
efficient positional row access. Changing an existing datafile
structure is as simple as re-opening it with that new structure. All
changes are transacted. You can mix and match software written in C++,
Python, and Tcl. Things can't get much more flexible...
Python - The extension for Python
is called "Mk4py". It provides a lower-level API for the Metakit C++
core extension than an earlier version of this interface, and uses SCXX by
Gordon McMillan as C++ glue interface.
Mk4py 2.4.9.2 - is a final/production release. The homepage points to a download area with pre-compiled
shared libraries for Unix, Windows, and Macintosh. The Metakit source
distribution includes this documentation, the Mk4py C++ source code, a
"MkMemoIO.py" class which provides efficient and fail-safe I/O
(therefore also pickling) using Metakit memo fields, and a few more
goodies.
Changes since 2.01 - the MK core has changed substantially:
- New commit-aside and commit-extend modes (see storage objects
below)
- Performance improvements, mostly due to a much more scalable file
format
- The "M" (memo) datatype is gone, use "B" instead, it now handles
huge items
- Internal changes to take advantage of the hash/ordered/blocked
viewers
- Mk4py members storage, view, and property are now lower-case
- Added Mk4py.version as attribute, containing the "2.x.y" string
- Added "metakit.py" as new main wrapper, use as "import metakit"
License and support - Metakit 2 and up are distributed
under
the liberal X/MIT-style open source license. Commercial support is
available through an Enterprise License. See the license
page for details.
Credits - Are due to Gordon McMillan for not stopping at
the
original Mk4py and coming up with a more Pythonic interface, and to
Christian Tismer for pushing Mk4py way beyond its design goals. Also
to GvR and the Python community for taking scripting to such
fascinating heights...
Updates - The latest version of this document is at http://www.equi4.com/metakit/python.html
Terminology
There are several ways to say the same thing, depending on where you're
coming from. For example, the terms table, list, collection, array, sequence,
and vector all denote a more or less similar concept. To
help avoid confusion, Metakit uses a simple (but hopefully precise)
terminology.
The terms adopted by Metakit can be summarized as follows:
- A view is an indexable collection of rows (a table
of records, an array of elements).
- An index is a position in a view, used to specify
a row (the first row is at index zero).
- Each view has an ordered set of properties,
used to refer to the data values of each row.
- In Metakit, each (view, index, property)
combination denotes a single data value.
- A different way to describe this combination would be: (matrix,
row-index, column-id).
- Data values can be strings, numeric, untyped data, or
a nested view, called a subview.
A few more comments about the semantics of Metakit:
- Views are homogenous: each row in a view contains the
same type of information.
- This also implies that all subviews within the same view always
have the same structure.
- Rows are either part of a view on file, or temporary
(gone when no longer referenced).
Installation
- Download the latest version from http://www.equi4.com/pub/download.html
- On Unix, rename the appropriate compiled extension to "Mk4py.so"
(on Win/Mac, use the corresponding file)
- Place the Mk4py extension as well as the "metakit.py" wrapper
somewhere on Python's module search path,
such as in the site-packages directory (or just leave it in ".")
- Do a small test, by running "demo.py". If all is well, you
should get some self-explanatory output
Getting started
Create a database:
import metakit
db = metakit.storage("datafile.mk",1)
Create a view (this is the Metakit term for "table"):
vw = db.getas("people[first:S,last:S,shoesize:I]")
Add two rows (this is the Metakit term for "record"):
vw.append(first='John',last='Lennon',shoesize=44)
vw.append(first='Flash',last='Gordon',shoesize=42)
Commit the changes to file:
db.commit()
Show a list of all people:
for r in vw: print r.first, r.last, r.shoesize
Show a list of all people, sorted by last name:
for r in vw.sort(vw.last): print r.first, r.last, r.shoesize
Show a list of all people with first name 'John':
for r in vw.select(first='John'): print r.first, r.last, r.shoesize
Mk4py Reference
- Module functions
- Storage objects
- View objects
- Derived views
- View operations
- Mapping views
- Rowref objects
- Property objects
1. Module functions
These functions live at the module level. You can use them as described
below after executing the following preamble:
import metakit
print metakit.version
SYNOPSIS
- db = metakit.storage()
- Create an in-memory database (can't use commit/rollback).
Details...
Notes...
- db = metakit.storage(file)
- Use a specified file object to build the storage on
- db = metakit.storage(name,mode)
- Open file, create if absent and mode is non-zero. Open
read-only if mode is 0, r/w if mode is 1 (cannot be shared), or
as commit-extend if mode is 2 (in mode 1 and 2, the file will
be created if needed).
- vw = metakit.view()
- Create a standalone view; not in any storage object
- pr
= metakit.property(type, name)
- Create a property (a column, when associated to a
view). Notes...
- vw = metakit.wrap(sequence,proplist,byPos=0)
- Wraps a Python sequence as a view. Details...
Notes...
- metakit.dump(view)
- Prints the contents of the view to the screen.
Metakit.dump is your friend, use it liberally.
ADDITIONAL DETAILS
storage-
When given a single
argument, the file object must be a real stdio file, not a
class implementing the file r/w protocol. When the storage
object is destroyed (such as with 'db = None'), the
associated datafile will be closed. Be sure to keep a reference to
it around as long as you use it.
wrap-
This call can be used to
wrap any Python sequence, it assumes that each item is
either a dictionary or an object with attribute names
corresponding to the property names. Alternately, if byPos
is nonzero, each item can be a list or tuple - they will then be
accessed by position instead. Views created in this way
can be used in joins and any other view operations.
2. Storage objects
SYNOPSIS
- vw = storage.getas(description)
- Locate, define, or re-define a view stored in a storage
object. Notes...
- vw = storage.view(viewname)
- The normal way to retrieve an existing view.
- storage.rollback(full=0)
- Revert data and structure as was last committed to disk. In
commit-aside mode, a "full" rollback reverts to the state of the
original file and forgets about the aside file.
After a rollback, your view objects are invalid (use the view or getas
methods on your storage object to get them back). Furthermore, after a
full rollback, the aside storage is detached from the main storage. Use
the aside method on your main storage object to reattach it. If you do
not reattach it, further commits will (try to) write to the main
storage.
- storage.commit(full=0)
- Permanently commit data and structure changes to disk. In
commit-aside mode, a "full" commit saves the latest state in the
original file and clears the aside datafile.
- ds = storage.description(viewname='')
- The description string is described under getas. Notes...
- vw = storage.contents()
- Returns the View which holds the meta data for the
Storage. Notes...
- storage.autocommit()
- Commit changes automatically when the storage object goes away
- storage.load(fileobj)
- Replace storage contents with data from file (or any other
object supporting read such as sys.stdin or StringIO) Notes...
- storage.save(fileobj)
- Serialize storage contents to file (or any other object
supporting write such as sys.stdout or StringIO) Notes...
ADDITIONAL DETAILS
description-
A description of
the entire storage is returned if no viewname is specified,
otherwise just the specified top-level view.
getas-
Side-effects: the
structure of the view is changed.
Notes: Normally used to create a new View, or alter the structure of an
existing one.
A description string looks like:
"people[name:S,addr:S,city:S,state:S,zip:S]"
That is "<viewname>[<propertyname>:<propertytype>...]"
Where the property type is one of:
|
I |
|
adaptive integer (becomes Python int) |
|
L |
|
64-bit integer (becomes Python long) |
|
F |
|
C float (becomes Python float) |
|
D |
|
C double (is a Python float) |
|
S |
|
C null terminated string (becomes Python string) |
|
B |
|
C array of bytes (becomes Python string) |
Careful: do not include white space in the description string.
In the Python binding, the difference between S and B types is
not as important as in C/C++, where S is used for zero-terminated text
strings. In Python, the main distinctions are that B properties must be
used if the data can contain zero bytes, and that sort order of S
(stricmp) and B (memcmp) differ. At some point, Unicode/UTF-8 will also
play a role for S properties, so it's best to use S for
text. However, if you are planning on using python's pickle
facility
it is safest to use the 'B' data type as this supports all pickle
modes. Notes...
Dropping
or modifying a view- It may not be obvious at this point, but
getas can be called multiple times for the same view. This is
what metakit considers 'restructuring' and it can be done on the
fly. For instance, if you wanted to add a phone number to the
people table you simply call getas again as follows:
"people[name:S,addr:S,city:S,state:S,zip:S,phone:S]"
To drop a view, call getas with only the view name:
storage.getas("people")
3. View objects
View implements sequence (list) methods, including slicing,
concatenation etc. They behave as a sequence of "rows", which in turn
have "properties". Indexing (getitem) returns
a reference to a row, not a copy.
Notes...
r = view[0]
r.name = 'Julius Caesar'
view[0].name # will yield 'Julius Caesar'
A slice returns a modifiable view which is tied to the underlying view.
As special case, however, you can create a fresh empty view with the
same structure as another view with:
v2 = v[0:0]
Setting a slice changes the view:
v[:] = [] # empties the view
All columns are described with a metakit
Property that indicates the name of the column and the type of the
column. A column's property is available from the view as follows:
view.name # will yield metakit.Property("S", "name")
View also supports getattr, which
also returns a Property. Views can be obtained from Storage
objects: view = db.view('inventory') or from other views (see select,
sort, flatten, join, project...) or empty, columnless views can be
created: vw = metakit.view()
SYNOPSIS
- view.insert(index, obj)
- Coerce object to a Row and insert at index in View
- ix = view.append(obj)
- Object is coerced to Row and added to end of View
- view.delete(index)
- Row at index removed from View
- lp = view.structure()
- Return a list of property objects
- cn = view.addproperty(prop)
- Define a new property, return its column position
- view.map(func, subset=None)
- Apply func to each row of view, or (if subset specified) to
each row in view that is also in subset. Func must have the signature
"func(row)", and may mutate row. Subset must be a subset of view: e.g.
"customers.map(func, customers.select(...))".
- rview = view.filter(func)
- Return a view containing the indices of those rows satisfying
func. Func must have signature "func(row)" and must return a false
value to omit the row. In general, you will use this in
conjunction with view.remapwith or view.remove
- obj = view.reduce(func,start=0)
- Return the result of applying func(row, lastresult) to each
row in view.
- view.remove(indices)
- Remove from view all rows listed in the indices view. Not the
same as view.minus, because unique is
not required, and view is not
reordered.
- rview
= view.indices(subset)
- Returns a view containing the indices in view of the rows in
subset. The resulting view is suitable for use with view.remapwith or view.remove among others. Notes...
- rview = view.copy()
- Returns a copy of the view.
Esoteric methods - if you use these,
you know more than I do.
- str = view.access(byteprop,rownum,offset,length=0)
- Get (partial) byte property contents.
- view.modify(byteprop,rownum,string,offset,diff=0)
- Store (partial) byte property contents. A non-zero value of
diff removes (<0) or inserts (>0) bytes.
- n = view.itemsize(prop,rownum=0)
- Return size of item (rownum only needed for S/B types). With
integer fields, a result of -1/-2/-4 means 1/2/4 bits per value,
respectively.
ADDITIONAL DETAILS
addproperty- This adds
properties which do not persist when committed. To make them
persist, you should use storage.getas(...) when defining (or
restructuring) the view.
append- Also support keyword
args (colname=value...). See insert below.
insert- coercion to a Row is
driven by the View's columns, and works for:
|
dictionaries |
|
(column name -> key) |
|
instances |
|
(column name -> attribute name) |
|
lists |
|
(column number -> list index) - watch out! |
4. Derived views
SYNOPSIS
- vw = view.select(criteria...)
- Return a view which has fields matching the given
criteria. Details...
- vw = view.select(low,high)
- Return a view with rows in the specified range. This is
similar to vw[low:high+1] except that the result can be used in view.indices.
- vw = view.sort()
- Sort view in "native" order, i.e. the definition order of its
keys. Keys are specified by using a mapping view.
- vw = view.sort(property...)
- Sort view using the specified properties. Details... Notes...
- vw = view.sortrev((propall...),
(proprev...))
- Sort view in specified order, with optionally some properties
in reverse. Details... Notes...
- vw = view.project(property...)
- Returns a derived view with only the named columns
ADDITIONAL DETAILS
select-
Example selections,
returning the corresponding subsets:
result = inventory.select(shoesize=44)
result = inventory.select({'shoesize':40},{'shoesize':43})
result = inventory.select({},{'shoesize':43})
The derived view is "connected" to the base view. Modifications of rows
in the derived view are reflected in
the base view.
sort-
Example, returning the
sorted permutation
result = inventory.sort(inventory.shoesize)
See notes for select concerning changes to the sorted view
sortrev - Example, sort as follows: shoesize:
ascending then shoestyle: descending then shoecolor: ascending
result = inventory.sortrev([inventory.shoesize, inventory.shoestyle, inventory.shoecolor], [inventory.shoestyle])
5. View operations
SYNOPSIS
- vw = view.flatten(subprop,outer=0)
- Produces one 'flat' view from a nested view Notes...
- vw = view.join(view,property...,outer=0)
- Both views must have a property (column) of that name and type
- ix = view.find(criteria...,start=0)
- Returns the index of the found row, or -1. Details...
- ix = view.search(criteria...)
- Binary search (native view order), returns match or insertion
point. Details...
- ix, cnt = view.locate(criteria...)
- Binary search, returns position and count as tuple (count can
be zero). Details...
- vw = view.unique()
- Returns a new view without duplicate rows (a set)
- vw = view.union(view2)
- Returns a new view which is the set union of view and view2
- vw = view.intersect(view2)
- Returns a new view which is the set intersection of view and
view2
- vw = view.different(view2)
- Returns a new view which is the set XOR of view and view2
- vw = view.minus(view2)
- Returns a new view which is (in set terms) view -
view.intersect(view2)
- vw
= view.remapwith(view2)
- Remap rows according to the first (int) property in view2
- vw = view.pair(view2)
- Concatenate rows pairwise, side by side
- vw =
view.rename('oldname',
'newname')
- Returns a derived view with one property renamed
- vw = view.product(view)
- Returns the cartesian product of both views. Notes...
- vw = view.groupby(property...,
'subname')
- Groups on specified properties, with subviews to hold groups
- vw = view.counts(property...,
'name')
- Groups on specified properties, replacing rest with a count
field
ADDITIONAL DETAILS
find-
view[view.find(firstname='Joe')] is essentially the same as
view.select(firstname='Joe')[0] but much faster. Subsequent
finds use the "start" keyword: view.find(firstname='Joe',
start=3)
In general you should not use view[view.find(...)] directly: on failure find
returns -1, and view[-1] is the last row in the view. Always check the result of
view.find or view.search to ensure that it is not -1.
search,
locate- You should probably
never use these directly unless you are certain that the property you
are searching is ordered. When using mapping views the fast binary
searches will occur automatically.
6. Mapping views
Mapping views create wrappers around ordinary views. These
mapping views enhance normal views in various ways. Mapping views
can speed up access to particular data (hash views, ordered views) or
can allow a view to hold more data (blocked views). In addition,
blocked views and ordered views can be combined to give a good tradeoff
between data access and amount of data stored.
SYNOPSIS
- vw = view.hash(mapview,numkeys=1)
- Construct a hash mapping based on the first N fields. Details...
Notes...
- vw = view.blocked(blockview)
- Construct a "blocked" view, which acts as if all segments
together form a single large view. Details... Notes...
- vw = view.ordered(numkeys=1)
- Define a view which assumes and maintains sort order, based on
the first N fields. When layered on top of a blocked view, this
implements a 2-level btree. Details... Notes...
ADDITIONAL DETAILS
hash-
This view creates and
manages a special hash map view, to implement a fast find on the
key. The key is defined to consist of the first numkeys
properties of the underlying view.
The mapview must be empty the first time this hash view is used, so
that Metakit can fill it based on whatever rows are already present in
the underlying view. After that, neither the underlying view nor the
map view may be modified other than through this hash
mapping layer. The defined structure of the map view must be
"_H:I,_R:I".
This view is modifiable. Insertions and changes to key field properties
can cause rows to be repositioned to maintain hash uniqueness.
Careful: when a row is changed in such a way that its key is the same
as in another row, that other row will be deleted from the view.
blocked-
This view acts like a
large flat view, even though the actual rows are stored in blocks,
which are rebalanced automatically to maintain a good trade-off
between block size and number of blocks. Use this style of
view if you are going to have a view with a great number of records
(for example > 250,000). The underlying view must be defined with a
single view property named "_B", with
the structure of the subview being as needed.
Example: vw = st.getas("myview[_B[id:I,data:B]]").blocked()
If a view is created in this fashion, blocked must always be called to
access the data normally. Blocked views cannot be hashed,
although they can be ordered.
ordered-
This is an identity
view, which has as its only use to inform Metakit that the underlying
view can be considered to be sorted on its first numKeys
properties. The effect is that view.find() will try to use binary
search when the search includes key properties (results will be
identical to unordered views, the find will just be more
efficient).
This view is modifiable. Insertions and changes to key field properties
can cause rows to be repositioned to maintain the sort order.
This view can be combined with view.blocked(), to create a 2-level
btree structure.
7. Rowref objects
RowRef allows setting and getting of attributes (columns)
RowRef encapsulates a (view, ndx) tuple.
Normally obtained from a view: rowref = view[33]
8. Property objects
Property has attributes name, id and type. Example: p =
metakit.property('I', 'shoesize')
Note that a property is used to describe a column, but it is NOT the
same as a column. That is, in a given storage, the property
Property('I', 'shoesize') will be unique, (that is, no matter how many
instances you create, they will all have the same property.id). But
that one property can describe any number of columns, each one
in a different view. This is how joins are done, and why
"view.sort(view.firstname)" is the same as
"view.sort(metakit.property('S','firstname'))".
Advanced Notes and
Usage Tips.
Note: metakit.storage() is very useful
for learning how to use metakit.
Simply open a python interpreter
and create an in-memory storage object, then play with it :)
>>> db = metakit.storage()
Then you can play with the storage to figure out some metakit commands
that you don't quite understand.
Note: metakit.getas(...)
You can get into some trouble with getas:
"people[name:S,name:F]"
won't raise any errors for instance and will use the first
description of name as a string.
"people[name:S, name:F]"
will create two columns "name" and " name". The second won't
be directly accessible from python. See this note.
The two basic rules to
follow when creating description strings are:
- Don't use spaces or other special characters (use an underscore
if necessary, this is common database practice). In other words
"zip_code" not "zip code" and don't use a column named "phone #".
You will be able to retrieve these columns but you will have to go
through some shenanigans.
- Always start the column name with a letter.
Note: metakit.wrap
Metakit.wrap is a very powerful way of converting python sequences into
(temporary) metakit views. It is a little tricky only because if
you are loading from a list of tuples you must use byPos=1 to load the
data correctly. This flag simply states that, within each row, the item at
position X is loaded as the value of property_list[X]. The example below
should make this clear.
import
metakit
storage = metakit.storage()
table = [
(10000
,8.3,7.1,8.3,8.1),
(14999,5.5,5.8
,5.8,6.2),
(24999
,11.3,12.7,12.4,13.5),
(34999,11.9,13.2 ,12.4,13.7),
(49999,16,2,18.1,16.5 ,17.9),
(74999
,20.7,22.7,21.4,21.4),
(99999,11.6,10.9 ,11.4,10.2),
(149999,9,6.4,8.6,6.3),
(199999,2.6,1.5
,2.1,1.4),
(200000,2.8,1.5,2.0,1.5),
]
headers = ['Income Range',
'Percent Population In Illinois',
'Percent Population In Wisconsin',
'Percent Population In Michigan',
'Percent Population In Indiana']
properties =
[metakit.property('F', h.replace(' ', "_"))
for h in headers]
# we are using byPos = 1 here
since we have the
# input as a list
view = metakit.wrap(table,
properties, 1)
metakit.dump(view)
Income_Range
Percent_Population_In_Illinois
Percent_Population_In_Wisconsin
Percent_Population_In_Michigan Percent_Population_In_Indiana
------------
------------------------------
-------------------------------
------------------------------ -----------------------------
10000.0
8.30000019073
7.09999990463
8.30000019073
8.10000038147
14999.0
5.5
5.80000019073
5.80000019073
6.19999980927
24999.0
11.3000001907
12.6999998093
12.3999996185
13.5
34999.0
11.8999996185
13.1999998093
12.3999996185
13.6999998093
49999.0
16.0
2.0
18.1000003815
16.5
74999.0
20.7000007629
22.7000007629
21.3999996185
21.3999996185
99999.0
11.6000003815
10.8999996185
11.3999996185
10.1999998093
149999.0
9.0
6.40000009537
8.60000038147
6.30000019073
199999.0
2.59999990463
1.5
2.09999990463
1.39999997616
200000.0
2.79999995232
1.5
2.0
1.5
------------
------------------------------
-------------------------------
------------------------------ -----------------------------
Total:
10 rows
Now we can use all the metakit goodness on this view! It can be
joined with any other view, searched, selected and so on.
Note:
Metakit properties are case
insensitive.
Actually, most things in metakit are case
insensitive. This can cause problems if you
expect "Object" to be different than "object". Additionally
properties can have names that python will not allow as variable
names. When one accesses a column in metakit it usually goes
something like this:
>>>
view = db.getas(...)
>>> row = view[index]
>>> data = row.first_name
However, "first name" is also a valid metakit property or even
"1st_name" neither of which are valid python variable names.
These must be retrieved using python's magic getattr function.
>>>
data = getattr(row,
"1st_name")
This will raise an AttributeError exception if row doesn't have a
column "1st_name". You can also use getattr as follows:
>>>
data = getattr(row,
"1st_name", default)
Which will set data to default if row doesn't have a column "1st_name".
You will run into some issues when columns are named the same as view
methods. For instance, if you have a column named "append" view.append will not return the
metakit Property, it will return the append method. In these
cases, you can use the properties
method of the view to retrieve a python dictionary describing the
relevant properties.
>>> vw
=
db.getas("people[name:S,addr:S,city:S,state:S,zip:S]")
>>>
vw.properties()
{'city': Property('S', 'city'),
'state': Property('S', 'state'), 'name': Property('S', 'name'), 'zip':
Property('S', 'zip'), 'addr': Property('S', 'addr')}
Don't be scared away by this complexity, if you are making and using
your OWN metakit storage, you can avoid all of this. This really
is only useful when using someone else's storage that you don't have
control over and is only added here for the sake of completeness.
Note: storage.contents()
This is one of the neater aspects of metakit. storage.contents()
returns a view which is the structure of the entire database.
Each table in this view is represented as a metakit property of type
"V". Remember that "V" indicates a view or subview type.
This is one of those cases where an example is worth more than this
description.
import
metakit
st = metakit.storage()
vw = st.getas("test[a:S,b:S,c:S]")
metakit.dump(st.contents())
test
------
0 rows
------
Total: 1 rows
print st.contents().properties()
{'test': Property('V', 'test')}
So, if we want to iterate through all the tables or views in the
database:
for
tablename in st.contents().properties():
vw=
st.view(tablename)
Just to prove that all the tables or views are part of the contents
view, we could use the more esoteric:
contents
= st.contents()
row = contents[0]
for tablename in
contents.properties():
vw=
getattr(row, tablename) # same thing as row.<tablename>
Note:
storage.save and storage.load
There is a lot of magic in the load and save operations. For
example, metakit usually doesn't reclaim disk space when tables are
dropped or rows are removed from views. The disk space will
gradually become used when new data is added. Saving a storage to
a new file will optimize disk usage. This can be useful for
databases where lots of data is dropped.
file
= open("newdatabase.mk", "wb")
storage.save(file)
file.close()
storage =
metakit.storage("newdatabase.mk", 1)
storage.save can also be used to serialize a metakit storage for
transport. This is normally done with the StringIO or cStringIO
class. In the following snippet, server.send and client.get are left as exercises
for the reader :)
try:
import cStringIO as StringIO
except ImportError: import StringIO
# server
file = StringIO.StringIO()
storage.save(file)
server.send(file.getvalue())
# client
data = client.get()
file = StringIO.StringIO(data)
storage.load(file)
If you are writing your own socket or HTTP server protocol, note that the metakit
serialized format knows how big it is. From various emails with
Jean-Claude Wippler:
How do I get the storage size from the
serialized dump?
It's in bytes 4..7 of the stream (one
day to become 3..7), a big-endian int. This normally also holds
for a MK datafile, btw. So the way to do this is read 8 bytes,
determine size, read size-8, put the two pieces back together, then
un-serialize.
How can I ship a subset of a storage?
W.r.t. the storage format, there is a
trick which may come in handy. Consider:
- ...
- commit changes
- restructure, deleting rows and views
- optionally also add views and some info
- serialize (to file or in-memory)
- rollback
- ...
The above has the effect of
"extracting" data from a datafile, and using it to "ship" a subset.
Note: Row
references.
Try to keep
row references around as long as possible. This is especially
true when writing GUI applications. Every time you index a view,
you will go through the mechanics of finding the row in the database
and returning it as a python object. While this is a very quick
operation, the costs still add up. Note that this is true for most
python operations when using lists. So use:
>>>
row = view[index]
>>> shoe = row.shoe
>>> shoesize =
row.shoesize
instead of:
>>>
shoe = view[index].shoe
>>> shoesize =
view[index].shoesize
Note: indices
Mk4py uses index views all over the place. You can consider an
indexed view as the result of many view.select calls. Just like
each call to view.select or view.find returns the matching index in a
view, an index view contains a whole bunch of matching indices.
Many operations return an indexed view: view.filter,
view.indices. In general, you will use an indexed view with view.remapwith or view.remove.
Note:
view.filter
view.filter is really wonderfully powerful. Most of the powerful
queries will be created using a combination of this, view.join and
view.select.
To generate a derived view from view.filter use view.remapwith.
Example: return the subview where all
id's are greater than or equal to 10
indexvw
= vw.filter(lambda row: row.id >= 10)
resultvw = vw.remapwith(indexvw)
Example: remove all rows where the id
is greater than or equal to 10
indexvw
= vw.filter(lambda row: row.id >= 10)
vw.remove(indexvw)
Example
code.
Note: view.indices
view.indices has one annoying bit. One would expect that:
subset
= view[10:20]
indicies = view.indices(subset)
would return the proper indices view, but all rows have an index of -1,
which is fairly bad news: since -1 is the index of the last row in the
view, calls to things like view.remove will happily remove the last
row, multiple times!
The proper way to get this type of subset is:
subset
= view.select(10,19)
indices = view.indices(subset)
Note the non-pythonic view.select(10,19) instead of view.select(10,20)
as view.select includes the last row where view[10:20] does not.
Here is some test code.
view.indices works just great with derived
views of the types generated by view.select or view.sort.
Note: derived
views
The views returned by select and sort are called derived
views
because they map back to another view. Most of the time these can
be
considered as identical to the base view in that when a row is deleted
from a derived view it is also deleted from the base view.
However, in
some cases Mk4py cannot make the necessary connection to the base
view. For instance, this occurs when sorting a mapping
view or selecting from a sorted view.
In these cases a derived view will be returned as a read only
view.
Read only views cannot be modified! However, all is not lost! In
these
cases you can generate a modifiable derived view using view.indices and
view.remapwith.
Here is some example
code.
Note: flattened
view
Flattened views are useful when viewing views or tables with
subviews. For example, consider the following table:
vw =
st.getas("person[name:S,affiliation[group:S]")
If you wanted to iterate through everybody's affiliations, you would
have to use two loops:
for
row in vw:
for
affiliation in row.affiliation:
print row.name, affiliation.group
alternatively you could create a flattened view
for
row in vw.flatten(vw.affiliation):
print
row.name, row.group
Example code.
Note: view.join
A joined view combines two separate views on a given property.
These two views are joined when the property in each view has the same
value.
import
metakit
storage = metakit.storage()
vw =
storage.getas("test1[a:I,b:S]")
vw2 =
storage.getas("test2[a:I,c:S]")
vw.append((0, "view1"))
vw2.append((0, "view2"))
vw2.append((1, "view2"))
metakit.dump(vw.join(vw2, vw.a))
a
b c
-
----- -----
0
view1 view2
-
----- -----
Total: 1
rows
Notice that the output doesn't contain the row in vw2 where a is
1. This is because it doesn't exist in vw. The output of a
join is a read only view and cannot be modified.
What if I want to join two views with
different property names?
Use view.rename to rename one of
the view columns. For instance (see the bold text below):
import
metakit
storage = metakit.storage()
vw =
storage.getas("test1[a:I,b:S]")
vw2 =
storage.getas("test2[aa:I,c:S]")
vw.append((0, "view1"))
vw2.append((0, "view2"))
vw2.append((1, "view2"))
temp =
vw2.rename('aa', 'a')
metakit.dump(vw.join(temp, vw.a))
a
b c
-
----- -----
0
view1 view2
-
----- -----
Total: 1
rows
What if there are properties in both
views with the same name and different types?
Example, if you join the views "test[a:I,b:F]" with
"test2[a:I,b:S]" what happens to the b attribute? This is
essentially undefined, you can either get the b:F column or the b:S
column depending on whether you are joining test to test2 or
test2 to test. The safest bet is to use view.rename
to rename the b property in one of the views.
Note: view.product
The cartesian product of two views is fairly straightforward: it
creates a view that combines all rows of view1 with all rows of
view2. In essence the operation is as follows:
for
row1 in view1:
for row2 in
view2:
# do something with row1 and row2
Why do this? This is useful when combining two views in complex
fashions. Here is an SQL-style example
select * from view1, view2 where view1.a > view2.c
What this does is select all rows from view1 and view2 where the value of
view1.a is greater than the value of view2.c. The looping version
would be:
for
row1 in view1:
for row2 in
view2:
if row1.a > row2.c:
result.append((row1, row2))
the view.product version would be :
tmp
= view1.product(view2)
indices = tmp.filter(lambda row:
row.a > row.c)
result = tmp.remapwith(indices)
view.product has many of the same caveats as view.join
when dealing with properties in both views that have the same names.
Note: view.hash
Hashing views can be confusing at first. Like other mapping
views, hash views are controlled through another view that manages all
of the gory details of maintaining a hash. Here is the basic hash
recipe:
view
= storage.getas("test[key:S,value1:F,value2:B]")
hashvw =
storage.getas("__test_hash__[_H:I,_R:I]")
view = view.hash(hashvw, 1)
If you follow this recipe you will hardly ever go wrong. You must
do this every time you want to access your hashed view! Notice
three things:
- The original view is replaced with the hash view (created with
view.hash). This is because the original view should never be
touched after this! Replacing the original view with the hash
view ensures that this will never happen.
- In general, I name the hash view "__%s_hash__"%viewname.
This helps me ensure that I always have a unique hash name.
- storage.view("test") doesn't know about the hashed view so don't
use this with hashed views! Use the recipe above.
You can create hashes on multiple keys, in the above example using:
view
= view.hash(hashvw, 2)
would
create a unique key using the first two properties described in the
string passed to storage.getas. In this case they are of type
(string, float). Note that just like python dictionaries, there
can only be one row with the same key.
Note: view.blocked
From the mailing list:
I'll use plain English, and let others come up with accurate Python:
* instead of defining a view "blah[a:I,b:S,c:D]", define
blah[_B[a:I,b:S,c:D]]
* in other words, don't define a view of rows, but a view of views of
rows
* when you open the view, replace:
view = storage.view("blah")
with
view = storage.view("blah").blocked()
* or you can use getas, just make sure the structure is as above
* in other words, don't just use the raw view but pass it through
blocked()
* that's it
You cannot mix things. When blocked, never access the unblocked
view.
You cannot convert data as is, the only way to do so is to copy all
data in. In C++ there is a call to insert one view into another
(compatible) one, but I think in Python you'll have to copy row by row.
Implementation details http://www.equi4.com/mkblocked.html
Python example:
import
metakit, random, time
st = metakit.storage("test.mk", 1)
# create a blocked view and
# order the view on the first integer value
vw = st.getas("large_view[_B[key:I,data:B]]").blocked().ordered(1)
t1 = time.time()
for i in range(1000000):
vw.append((i, str(i)))
if i % 10000 == 0:
# commit every 10000 entries
print i
st.commit()
t2 = time.time()
st.commit()
print (t2-t1), "seconds to load", len(vw), "entries"
# now test lookup times
lookup = []
size = len(vw)
for i in range(1000):
lookup.append(int(random.random()*size))
t1 = time.time()
for i in lookup:
vw.find(key=i)
t2 = time.time()
print (t2-t1), "seconds to lookup up", len(lookup), "random entries"
print "or", (t2-t1)/len(lookup), "seconds per lookup"
Output
from my 2Ghz Pentium 4 running windows 2000
46.3960000277 seconds to load
1000000 entries
0.0600000619888 seconds to lookup
up 1000 random entries
or 6.00000619888e-005 seconds per
lookup
>>>
Note: view.ordered
Unlike view.hash, an ordered view can have more than one entry with the
same key. Also unlike view.hash, ordered uses a case insensitive
ordering, i.e. "king" and "KING" are not identical.
import
metakit
storage = metakit.storage()
view =
storage.getas("test[key:S,value1:F]").ordered()
for i in range(10):
if i % 2 == 0:
view.append(("king", i))
else:
view.append(("KING", i))
metakit.dump(view)
key
value1
----
------
king
0.0
KING
1.0
king
2.0
KING
3.0
king
4.0
KING
5.0
king
6.0
KING
7.0
king
8.0
KING
9.0
----
------
Total: 10
rows
© 2003 Jean-Claude Wippler <jcw@equi4.com>
tutorial/remove.py 0000755 0000765 0000765 00000000731 10327247041 015016 0 ustar brian brian 0000000 0000000 """Indicies Test"""
import metakit
# Create a storage without a backing filename and define one ordered
# view with a single integer column.
storage = metakit.storage()
view = storage.getas("test[a:I]").ordered()

# populate rows with the integers 0..9999
for value in range(10000):
    view.append(value)

# Asking the view for the indices of itself must not yield -1.
positions = view.indices(view)
assert positions[0].index != -1, "Failed retrieving index of view"

# Repeat the lookup, this time for a slice holding the first ten rows.
first_ten = view[0:10]
positions = view.indices(first_ten)
# A parenthesised single-argument print emits the same text as the
# Python 2 print statement.
print("This table should not be populated with -1's")
metakit.dump(positions)

# the indices should not be -1!!!
assert positions[0].index != -1, "Failed retrieving index of subset"
tutorial/select.py 0000755 0000765 0000765 00000000362 10327247041 015000 0 ustar brian brian 0000000 0000000 """Select sort and derived views"""
import metakit
# Create a storage without a backing filename and define an ordered
# view with an integer, a string and a float column.
storage = metakit.storage()
table = storage.getas("test[a:I,b:S,c:F]").ordered()

# populate rows: a is always 1, while b and c carry the loop counter
for n in range(100):
    table.append(a=1, b=str(n), c=float(n))

# Rebind the name to the sorted view, then select on a and c.
table = table.sort()
matches = table.select(a=1, c=1.0)
print(matches)
tutorial/select_sort.py 0000755 0000765 0000765 00000001054 10327247041 016046 0 ustar brian brian 0000000 0000000 """Select sort and derived views"""
import metakit
# Create a storage without a backing filename and define an ordered view.
storage = metakit.storage()
base = storage.getas("test[a:I,b:S]").ordered()

# populate rows; only column a is given a value
for n in range(100):
    base.append(a=n)

# When select or sort are used on ordered views the result
# is returned in a read only view!!!
sorted_view = base.sort()
try:
    sorted_view[0].a = 20
except TypeError:
    print("caught TypeError when trying to modify read only view")

# A modifiable view can be generated by asking the base view for the
# indices of the sorted rows and remapping the base view with them.
row_numbers = base.indices(sorted_view)
sorted_view = base.remapwith(row_numbers)
sorted_view[0].a = 20
tutorial/serialization.py 0000755 0000765 0000765 00000002064 10327247041 016377 0 ustar brian brian 0000000 0000000 """Metakit serialization example"""
import metakit
# Prefer cStringIO when it can be imported and fall back to StringIO
# otherwise.  Only ImportError is caught so that any other failure
# (including KeyboardInterrupt) is not silently swallowed.
try:
    import cStringIO as StringIO
except ImportError:
    import StringIO

# Build a storage with two views: one to throw away, one to keep.
st = metakit.storage()
vw1 = st.getas("table_that_I_do_not_want[a:S,b:S,c:S]")
vw2 = st.getas("real_deal[id:I,number:F]")
for c in "ABCDEFGHIJKLMNOP":
    vw1.append(c)
for i in range(100):
    vw2.append((i, float(i)))

# this is an in-memory table so we can't commit
# we'll just save the state for later
# this could have been done with st.commit
# (the stream is named `snapshot` to avoid shadowing the builtin `file`)
snapshot = StringIO.StringIO()
st.save(snapshot)
initialState = snapshot.getvalue()

# drop the bad table (getas is called with the bare view name only)
st.getas("table_that_I_do_not_want")
snapshot = StringIO.StringIO()
st.save(snapshot)
finalState = snapshot.getvalue()

# we can 'ship' finalState to a client somewhere
st2 = metakit.storage()
st2.load(StringIO.StringIO(finalState))

# we should only have one table here
print("Final state after dumping")
metakit.dump(st2.contents())
print("")

# now reload the initialState
# this could also be done with st.rollback()
print("reloaded initial state")
st.load(StringIO.StringIO(initialState))
metakit.dump(st.contents())
tutorial/test.cmk 0000755 0000765 0000765 00000007750 10327247041 014632 0 ustar brian brian 0000000 0000000 JL 耀 JL
? @ @@ @ @ @ @ A A A 0A @A PA `A pA A A A A A A A A A A A A A A A A B B B B B B B B B $B (B ,B 0B 4B 8B